From 8167cea73facbcf73135c5bd21cfddc2734362bd Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Mon, 20 Apr 2026 22:41:57 +0100 Subject: [PATCH 1/9] feat: add compound filter expressions and typed metadata tables Implement feature 076 with compound FilterExpr trees (And/Or/Not/Predicate) replacing flat Filter lists, typed field references (metadata.field, features.hook.column), and per-schema metadata tables for optimized queries. Add MetadataStore port with PostgreSQL implementation for schema-keyed DDL lifecycle, EnsureMetadataTable/InsertRecordMetadata event handlers, and discovery adapter support for cross-domain joins with configurable bounds. test: add comprehensive integration and unit tests for metadata system Add extensive test coverage for metadata table management, discovery filtering, and schema evolution including: - Cross-domain JOIN tests between records and features - Typed metadata table creation and evolution validation - Filter expression parsing and validation with AND/OR/NOT support - Metadata service integration with PostgreSQL backend - Schema evolution bounds checking and non-additive rejection - Event handler tests for metadata table lifecycle - Field reference resolution and cursor encoding tests --- server/Justfile | 14 +- .../076_add_feature_tables_record_srn_fks.py | 85 +++ .../076_add_metadata_schema_and_catalog.py | 51 ++ .../versions/076_add_records_schema_srn.py | 47 ++ .../application/api/v1/routes/discovery.py | 20 +- server/osa/application/di.py | 2 + server/osa/config.py | 5 + .../deposition/event/convention_registered.py | 9 +- .../domain/deposition/service/convention.py | 2 + server/osa/domain/discovery/model/refs.py | 78 +++ server/osa/domain/discovery/model/value.py | 161 ++++- .../discovery/port/field_definition_reader.py | 8 + .../osa/domain/discovery/port/read_store.py | 40 +- .../domain/discovery/query/search_features.py | 10 +- .../domain/discovery/query/search_records.py | 15 +- .../osa/domain/discovery/service/discovery.py | 324 +++++++-- .../osa/domain/discovery/util/di/provider.py | 8 +- server/osa/domain/feature/event/__init__.py | 4 +- .../domain/feature/event/convention_ready.py | 14 - .../feature/handler/create_feature_tables.py | 21 +- server/osa/domain/metadata/__init__.py | 0 server/osa/domain/metadata/event/__init__.py | 0 .../osa/domain/metadata/handler/__init__.py | 0 .../metadata/handler/ensure_metadata_table.py | 50 ++ .../handler/insert_record_metadata.py | 29 + server/osa/domain/metadata/model/__init__.py | 0 server/osa/domain/metadata/model/value.py | 16 + server/osa/domain/metadata/port/__init__.py | 0 .../domain/metadata/port/metadata_store.py | 40 ++ .../osa/domain/metadata/service/__init__.py | 0 .../osa/domain/metadata/service/metadata.py | 32 + server/osa/domain/metadata/util/__init__.py | 0 .../osa/domain/metadata/util/di/__init__.py | 3 + .../osa/domain/metadata/util/di/provider.py | 11 + .../domain/record/event/record_published.py | 9 +- server/osa/domain/record/model/aggregate.py | 5 +- server/osa/domain/record/service/record.py | 23 + server/osa/domain/shared/error.py | 9 +- server/osa/domain/shared/model/hook.py | 2 +- server/osa/infrastructure/event/di.py | 5 + .../persistence/adapter/discovery.py | 613 +++++++++++++++--- .../persistence/column_mapper.py | 1 - server/osa/infrastructure/persistence/di.py | 9 + .../persistence/feature_store.py | 2 +- .../persistence/feature_table.py | 18 +- .../persistence/mappers/record.py | 15 +- .../persistence/metadata_store.py | 274 ++++++++ .../persistence/metadata_table.py | 
85 +++ .../osa/infrastructure/persistence/tables.py | 22 + server/tests/integration/conftest.py | 48 +- .../persistence/test_discovery_pagination.py | 79 +++ .../persistence/test_feature_store.py | 14 +- .../persistence/test_metadata_store.py | 292 +++++++++ .../test_discovery_compound_postgres.py | 157 +++++ .../test_discovery_cross_join_postgres.py | 137 ++++ .../test_discovery_records_typed_and.py | 286 ++++++++ .../integration/test_ensure_metadata_table.py | 172 +++++ .../test_event_batch_processing.py | 3 + .../test_insert_record_metadata.py | 104 +++ .../test_metadata_additive_evolve_postgres.py | 106 +++ .../test_non_additive_rejected_postgres.py | 104 +++ .../deposition/test_convention_registered.py | 11 +- .../domain/deposition/test_event_chain.py | 6 + .../discovery/test_discovery_service.py | 290 ++++----- .../discovery/test_get_feature_catalog.py | 16 +- .../domain/discovery/test_search_features.py | 90 ++- .../domain/discovery/test_search_records.py | 4 +- .../tests/unit/domain/discovery/test_value.py | 49 +- .../domain/feature/test_convention_ready.py | 42 -- .../feature/test_create_feature_tables.py | 74 +-- .../feature/test_insert_record_features.py | 7 + .../unit/domain/index/test_fanout_listener.py | 9 + .../domain/record/test_get_record_handler.py | 3 +- .../domain/record/test_record_features.py | 4 +- .../record/test_record_published_enriched.py | 8 +- .../unit/domain/record/test_record_service.py | 93 ++- .../tests/unit/test_field_ref_resolution.py | 42 ++ .../unit/test_filter_expr_and_compile.py | 197 ++++++ server/tests/unit/test_filter_expr_or_not.py | 131 ++++ .../tests/unit/test_metadata_column_mapper.py | 35 + server/tests/unit/test_metadata_service.py | 39 ++ server/tests/unit/test_metadata_slug.py | 40 ++ .../unit/test_record_schema_srn_immutable.py | 28 + server/uv.lock | 1 + 84 files changed, 4249 insertions(+), 663 deletions(-) create mode 100644 server/migrations/versions/076_add_feature_tables_record_srn_fks.py create mode 100644 server/migrations/versions/076_add_metadata_schema_and_catalog.py create mode 100644 server/migrations/versions/076_add_records_schema_srn.py create mode 100644 server/osa/domain/discovery/model/refs.py delete mode 100644 server/osa/domain/feature/event/convention_ready.py create mode 100644 server/osa/domain/metadata/__init__.py create mode 100644 server/osa/domain/metadata/event/__init__.py create mode 100644 server/osa/domain/metadata/handler/__init__.py create mode 100644 server/osa/domain/metadata/handler/ensure_metadata_table.py create mode 100644 server/osa/domain/metadata/handler/insert_record_metadata.py create mode 100644 server/osa/domain/metadata/model/__init__.py create mode 100644 server/osa/domain/metadata/model/value.py create mode 100644 server/osa/domain/metadata/port/__init__.py create mode 100644 server/osa/domain/metadata/port/metadata_store.py create mode 100644 server/osa/domain/metadata/service/__init__.py create mode 100644 server/osa/domain/metadata/service/metadata.py create mode 100644 server/osa/domain/metadata/util/__init__.py create mode 100644 server/osa/domain/metadata/util/di/__init__.py create mode 100644 server/osa/domain/metadata/util/di/provider.py create mode 100644 server/osa/infrastructure/persistence/metadata_store.py create mode 100644 server/osa/infrastructure/persistence/metadata_table.py create mode 100644 server/tests/integration/persistence/test_discovery_pagination.py create mode 100644 server/tests/integration/persistence/test_metadata_store.py create mode 100644 
server/tests/integration/test_discovery_compound_postgres.py
 create mode 100644 server/tests/integration/test_discovery_cross_join_postgres.py
 create mode 100644 server/tests/integration/test_discovery_records_typed_and.py
 create mode 100644 server/tests/integration/test_ensure_metadata_table.py
 create mode 100644 server/tests/integration/test_insert_record_metadata.py
 create mode 100644 server/tests/integration/test_metadata_additive_evolve_postgres.py
 create mode 100644 server/tests/integration/test_non_additive_rejected_postgres.py
 delete mode 100644 server/tests/unit/domain/feature/test_convention_ready.py
 create mode 100644 server/tests/unit/test_field_ref_resolution.py
 create mode 100644 server/tests/unit/test_filter_expr_and_compile.py
 create mode 100644 server/tests/unit/test_filter_expr_or_not.py
 create mode 100644 server/tests/unit/test_metadata_column_mapper.py
 create mode 100644 server/tests/unit/test_metadata_service.py
 create mode 100644 server/tests/unit/test_metadata_slug.py
 create mode 100644 server/tests/unit/test_record_schema_srn_immutable.py

diff --git a/server/Justfile b/server/Justfile
index 6f96d22..cf80594 100644
--- a/server/Justfile
+++ b/server/Justfile
@@ -82,21 +82,23 @@ cli *ARGS:
 PG_USER := env("PG_USER", "postgres")
 PG_PASS := env("PG_PASS", "osa")
 PG_HOST := env("PG_HOST", "localhost")
-PG_PORT := env("PG_PORT", "5432")
+# Dedicated host port for the integration-test DB, distinct from dev (5432)
+# so both can coexist. Override with TEST_PG_PORT if 55432 is also taken.
+TEST_PG_PORT := env("TEST_PG_PORT", "55432")
 TEST_DB := "osa_test"
-TEST_DB_URL := "postgresql+asyncpg://" + PG_USER + ":" + PG_PASS + "@" + PG_HOST + ":" + PG_PORT + "/" + TEST_DB
+TEST_DB_URL := "postgresql+asyncpg://" + PG_USER + ":" + PG_PASS + "@" + PG_HOST + ":" + TEST_PG_PORT + "/" + TEST_DB
 
 # Create test database (idempotent)
 test-db-create:
-    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \
+    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \
         -tc "SELECT 1 FROM pg_database WHERE datname='{{TEST_DB}}'" \
         | grep -q 1 || \
-    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \
+    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \
         -c "CREATE DATABASE {{TEST_DB}}"
 
 # Drop test database
 test-db-drop:
-    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{PG_PORT}} -U {{PG_USER}} \
+    PGPASSWORD={{PG_PASS}} psql -h {{PG_HOST}} -p {{TEST_PG_PORT}} -U {{PG_USER}} \
         -c "DROP DATABASE IF EXISTS {{TEST_DB}} WITH (FORCE)"
 
 # Run integration tests (persistence tests skip if PG is not available)
@@ -105,7 +107,7 @@ test-integration:
 
 # Run integration tests with PG: ensure DB running → wipe → create → migrate → test → wipe
 test-integration-pg:
-    just --justfile ../Justfile db-up
+    POSTGRES_PORT={{TEST_PG_PORT}} just --justfile ../Justfile db-up
     @just test-db-drop
     @just test-db-create
     OSA_DATABASE__URL="{{TEST_DB_URL}}" \
diff --git a/server/migrations/versions/076_add_feature_tables_record_srn_fks.py b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py
new file mode 100644
index 0000000..3c4cc57
--- /dev/null
+++ b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py
@@ -0,0 +1,85 @@
+"""076_add_feature_tables_record_srn_fks
+
+For each row currently registered in the ``public.feature_tables`` catalog,
+add a foreign-key constraint on ``features.<hook>.record_srn`` referencing
+``records.srn`` with ``ON DELETE CASCADE``. Bundles GitHub #75.
+ +Idempotent: skips any hook whose FK is already present (detected by naming +convention). No-op on greenfield deployments where the catalog is empty. + +Revision ID: 076_feature_fks +Revises: 076_records_schema_srn +Create Date: 2026-04-19 + +""" + +from typing import Sequence, Union + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "076_feature_fks" +down_revision: Union[str, Sequence[str], None] = "076_records_schema_srn" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +FK_NAME_TEMPLATE = "fk_features_{hook}_record_srn" + + +def upgrade() -> None: + conn = op.get_bind() + rows = conn.execute( + # text() via op.execute-style select + _select_hooks() + ).fetchall() + + for row in rows: + hook = row[0] + fk_name = FK_NAME_TEMPLATE.format(hook=hook) + # Check if constraint already exists + exists = conn.execute(_check_constraint(fk_name)).scalar() + if exists: + continue + + conn.execute(_add_fk_sql(hook, fk_name)) + + +def downgrade() -> None: + conn = op.get_bind() + rows = conn.execute(_select_hooks()).fetchall() + for row in rows: + hook = row[0] + fk_name = FK_NAME_TEMPLATE.format(hook=hook) + exists = conn.execute(_check_constraint(fk_name)).scalar() + if not exists: + continue + conn.execute(_drop_fk_sql(hook, fk_name)) + + +def _select_hooks(): + from sqlalchemy import text + + return text("SELECT hook_name FROM feature_tables") + + +def _check_constraint(fk_name: str): + from sqlalchemy import text + + return text("SELECT 1 FROM pg_constraint WHERE conname = :fk_name").bindparams(fk_name=fk_name) + + +def _add_fk_sql(hook: str, fk_name: str): + from sqlalchemy import text + + return text( + f'ALTER TABLE features."{hook}" ' + f'ADD CONSTRAINT "{fk_name}" ' + f"FOREIGN KEY (record_srn) REFERENCES records(srn) ON DELETE CASCADE" + ) + + +def _drop_fk_sql(hook: str, fk_name: str): + from sqlalchemy import text + + return text(f'ALTER TABLE features."{hook}" DROP CONSTRAINT "{fk_name}"') diff --git a/server/migrations/versions/076_add_metadata_schema_and_catalog.py b/server/migrations/versions/076_add_metadata_schema_and_catalog.py new file mode 100644 index 0000000..1ba5601 --- /dev/null +++ b/server/migrations/versions/076_add_metadata_schema_and_catalog.py @@ -0,0 +1,51 @@ +"""076_add_metadata_schema_and_catalog + +Create the ``metadata`` PostgreSQL schema and the ``public.metadata_tables`` +catalog table. Dynamic per-schema metadata tables will live inside the +``metadata`` schema; the catalog indexes them by schema identity+major. + +Revision ID: 076_metadata_catalog +Revises: add_deliver_after +Create Date: 2026-04-19 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects.postgresql import JSONB + +# revision identifiers, used by Alembic. 
+revision: str = "076_metadata_catalog" +down_revision: Union[str, Sequence[str], None] = "add_deliver_after" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.execute('CREATE SCHEMA IF NOT EXISTS "metadata"') + + op.create_table( + "metadata_tables", + sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("schema_identity", sa.Text(), nullable=False), + sa.Column("schema_slug", sa.Text(), nullable=False), + sa.Column("schema_major", sa.Integer(), nullable=False), + sa.Column("schema_versions", JSONB(), nullable=False), + sa.Column("pg_table", sa.Text(), nullable=False), + sa.Column("metadata_schema", JSONB(), nullable=False), + sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), + sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False), + sa.UniqueConstraint( + "schema_identity", + "schema_major", + name="uq_metadata_tables_identity_major", + ), + sa.UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), + ) + + +def downgrade() -> None: + op.drop_table("metadata_tables") + op.execute('DROP SCHEMA IF EXISTS "metadata" CASCADE') diff --git a/server/migrations/versions/076_add_records_schema_srn.py b/server/migrations/versions/076_add_records_schema_srn.py new file mode 100644 index 0000000..ea3e225 --- /dev/null +++ b/server/migrations/versions/076_add_records_schema_srn.py @@ -0,0 +1,47 @@ +"""076_add_records_schema_srn + +Add a ``records.schema_srn`` column so Record linkage to its typed metadata +shape is first-class (FR-008). Backfill from the linked convention's +``schema_srn`` before tightening to NOT NULL. + +Greenfield deployments with no records will skip the backfill (the UPDATE is +a no-op) and go straight to NOT NULL. + +Revision ID: 076_records_schema_srn +Revises: 076_metadata_catalog +Create Date: 2026-04-19 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "076_records_schema_srn" +down_revision: Union[str, Sequence[str], None] = "076_metadata_catalog" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("records", sa.Column("schema_srn", sa.Text(), nullable=True)) + + op.execute( + """ + UPDATE records r + SET schema_srn = c.schema_srn + FROM conventions c + WHERE c.srn = r.convention_srn + AND r.schema_srn IS NULL + """ + ) + + op.alter_column("records", "schema_srn", nullable=False) + op.create_index("idx_records_schema_srn", "records", ["schema_srn"]) + + +def downgrade() -> None: + op.drop_index("idx_records_schema_srn", table_name="records") + op.drop_column("records", "schema_srn") diff --git a/server/osa/application/api/v1/routes/discovery.py b/server/osa/application/api/v1/routes/discovery.py index cdf511b..129ab9e 100644 --- a/server/osa/application/api/v1/routes/discovery.py +++ b/server/osa/application/api/v1/routes/discovery.py @@ -6,10 +6,7 @@ from fastapi import APIRouter from pydantic import BaseModel, Field -from osa.domain.discovery.model.value import ( - Filter, - SortOrder, -) +from osa.domain.discovery.model.value import FilterExpr, SortOrder from osa.domain.discovery.query.get_feature_catalog import ( GetFeatureCatalog, GetFeatureCatalogHandler, @@ -25,6 +22,7 @@ SearchRecordsHandler, SearchRecordsResult, ) +from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN router = APIRouter( prefix="/discovery", @@ -37,7 +35,9 @@ class RecordSearchRequest(BaseModel): - filters: list[Filter] = [] + schema_srn: SchemaSRN | None = None + convention_srn: ConventionSRN | None = None + filter: FilterExpr | None = None q: str | None = None sort: str = "published_at" order: SortOrder = SortOrder.DESC @@ -56,7 +56,8 @@ class FeatureCatalogResponse(BaseModel): class FeatureSearchRequest(BaseModel): - filters: list[Filter] = [] + schema_srn: SchemaSRN | None = None + filter: FilterExpr | None = None record_srn: str | None = None sort: str = "id" order: SortOrder = SortOrder.DESC @@ -81,7 +82,9 @@ async def search_records( """Search and filter published records.""" result: SearchRecordsResult = await handler.run( SearchRecords( - filters=body.filters, + filter_expr=body.filter, + schema_srn=body.schema_srn, + convention_srn=body.convention_srn, q=body.q, sort=body.sort, order=body.order, @@ -115,7 +118,8 @@ async def search_features( result: SearchFeaturesResult = await handler.run( SearchFeatures( hook_name=hook_name, - filters=body.filters, + filter_expr=body.filter, + schema_srn=body.schema_srn, record_srn=body.record_srn, sort=body.sort, order=body.order, diff --git a/server/osa/application/di.py b/server/osa/application/di.py index f7e7ff3..5227635 100644 --- a/server/osa/application/di.py +++ b/server/osa/application/di.py @@ -8,6 +8,7 @@ from osa.domain.deposition.util.di import DepositionProvider from osa.domain.discovery.util.di import DiscoveryProvider from osa.domain.feature.util.di import FeatureProvider +from osa.domain.metadata.util.di import MetadataProvider from osa.domain.semantics.util.di.provider import SemanticsProvider from osa.domain.shared.event import EventHandler from osa.domain.validation.util.di import ValidationProvider @@ -49,6 +50,7 @@ def create_container( HttpProvider(), DepositionProvider(), FeatureProvider(), + MetadataProvider(), SemanticsProvider(), ValidationProvider(), AuthProvider(), diff --git a/server/osa/config.py b/server/osa/config.py index ffeda85..46e0924 100644 --- 
a/server/osa/config.py
+++ b/server/osa/config.py
@@ -241,6 +241,11 @@ class Config(BaseSettings):
     runner: RunnerConfig = RunnerConfig()
     host_data_dir: str | None = None  # Host path for OSA_DATA_DIR (sibling container mounts)
 
+    # Discovery filter-tree bounds (feature 076)
+    discovery_max_filter_depth: int = 10
+    discovery_max_predicates: int = 200
+    discovery_max_cross_domain_joins: int = 10
+
     model_config = {
         "env_prefix": "OSA_",
         "env_file": ".env",
diff --git a/server/osa/domain/deposition/event/convention_registered.py b/server/osa/domain/deposition/event/convention_registered.py
index 7a318a7..987b28a 100644
--- a/server/osa/domain/deposition/event/convention_registered.py
+++ b/server/osa/domain/deposition/event/convention_registered.py
@@ -1,8 +1,9 @@
 """ConventionRegistered event - emitted when a new convention is created."""
 
+from osa.domain.semantics.model.value import FieldDefinition
 from osa.domain.shared.event import Event, EventId
 from osa.domain.shared.model.hook import HookDefinition
-from osa.domain.shared.model.srn import ConventionSRN
+from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN
 
 
 class ConventionRegistered(Event):
@@ -10,8 +11,14 @@ class ConventionRegistered(Event):
 
     Carries hook definitions so downstream handlers (e.g. CreateFeatureTables)
     can create feature tables without querying the convention repository.
+
+    Carries ``schema_srn`` and ``schema_fields`` so downstream handlers (e.g.
+    EnsureMetadataTable) can create and evolve typed metadata tables without
+    traversing the semantics repository.
     """
 
     id: EventId
     convention_srn: ConventionSRN
+    schema_srn: SchemaSRN
+    schema_fields: list[FieldDefinition] = []
     hooks: list[HookDefinition] = []
diff --git a/server/osa/domain/deposition/service/convention.py b/server/osa/domain/deposition/service/convention.py
index 79492e9..b099acd 100644
--- a/server/osa/domain/deposition/service/convention.py
+++ b/server/osa/domain/deposition/service/convention.py
@@ -68,6 +68,8 @@ async def create_convention(
             ConventionRegistered(
                 id=EventId(uuid4()),
                 convention_srn=srn,
+                schema_srn=created_schema.srn,
+                schema_fields=created_schema.fields,
                 hooks=convention.hooks,
             )
         )
diff --git a/server/osa/domain/discovery/model/refs.py b/server/osa/domain/discovery/model/refs.py
new file mode 100644
index 0000000..f5783a0
--- /dev/null
+++ b/server/osa/domain/discovery/model/refs.py
@@ -0,0 +1,78 @@
+"""Typed field references used inside Predicate.field.
+
+Two kinds of references are supported:
+
+- :class:`MetadataFieldRef` — resolves to a column in ``metadata.<slug>_v<major>``.
+- :class:`FeatureFieldRef` — resolves to a column in ``features.<hook>``.
+
+Wire format is a dotted path (``metadata.<field>`` or
+``features.<hook>.<column>``). :func:`parse_field_ref` parses the wire form
+into a typed reference and validates identifier shape.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Literal, Union
+
+from pydantic import BaseModel
+
+_IDENT = re.compile(r"^[a-z][a-z0-9_]*$")
+
+
+class MetadataFieldRef(BaseModel):
+    path: Literal["metadata"] = "metadata"
+    field: str
+
+    def dotted(self) -> str:
+        return f"metadata.{self.field}"
+
+
+class FeatureFieldRef(BaseModel):
+    path: Literal["features"] = "features"
+    hook: str
+    column: str
+
+    def dotted(self) -> str:
+        return f"features.{self.hook}.{self.column}"
+
+
+FieldRef = Union[MetadataFieldRef, FeatureFieldRef]
+
+
+def parse_field_ref(dotted: str) -> FieldRef:
+    """Parse a dotted-path field reference into its typed form.
+
+    Raises :class:`ValueError` when the path shape or identifier doesn't match
+    the documented grammar.
+    """
+    if not isinstance(dotted, str):
+        raise ValueError(f"Expected dotted string, got {type(dotted).__name__}")
+
+    parts = dotted.split(".")
+    if not parts:
+        raise ValueError(f"Empty field reference: {dotted!r}")
+
+    head = parts[0]
+    if head == "metadata":
+        if len(parts) != 2:
+            raise ValueError(f"metadata.* refs must be exactly two dotted parts, got {dotted!r}")
+        field = parts[1]
+        if not _IDENT.match(field):
+            raise ValueError(f"Invalid metadata field identifier: {field!r}")
+        return MetadataFieldRef(field=field)
+
+    if head == "features":
+        if len(parts) != 3:
+            raise ValueError(f"features.* refs must be exactly three dotted parts, got {dotted!r}")
+        hook, column = parts[1], parts[2]
+        if not _IDENT.match(hook):
+            raise ValueError(f"Invalid hook identifier: {hook!r}")
+        if not _IDENT.match(column):
+            raise ValueError(f"Invalid feature column identifier: {column!r}")
+        return FeatureFieldRef(hook=hook, column=column)
+
+    raise ValueError(
+        f"Unknown field reference prefix {head!r} in {dotted!r}. "
+        "Expected 'metadata.<field>' or 'features.<hook>.<column>'."
+    )
diff --git a/server/osa/domain/discovery/model/value.py b/server/osa/domain/discovery/model/value.py
index 1abab81..65549a0 100644
--- a/server/osa/domain/discovery/model/value.py
+++ b/server/osa/domain/discovery/model/value.py
@@ -1,4 +1,10 @@
-"""Discovery domain value objects — filters, cursors, result types."""
+"""Discovery domain value objects — filters, cursors, result types.
+
+Feature 076 replaces the flat ``Filter`` list with a compound ``FilterExpr``
+discriminated union (``And``/``Or``/``Not``/``Predicate``). Field references
+inside predicates are typed (:class:`MetadataFieldRef` or
+:class:`FeatureFieldRef`); the dotted wire form is parsed at the API boundary.
+""" from __future__ import annotations @@ -6,19 +12,29 @@ import json from datetime import datetime from enum import StrEnum -from typing import Any +from typing import Annotated, Any, Literal, Union -from pydantic import BaseModel +from pydantic import BaseModel, Field, model_validator +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, + parse_field_ref, +) from osa.domain.semantics.model.value import FieldType from osa.domain.shared.model.srn import RecordSRN class FilterOperator(StrEnum): EQ = "eq" - CONTAINS = "contains" + NEQ = "neq" + GT = "gt" GTE = "gte" + LT = "lt" LTE = "lte" + IN = "in" + CONTAINS = "contains" + IS_NULL = "is_null" class SortOrder(StrEnum): @@ -26,19 +42,136 @@ class SortOrder(StrEnum): DESC = "desc" -class Filter(BaseModel): - field: str - operator: FilterOperator - value: str | float | bool +FieldRef = Annotated[ + Union[MetadataFieldRef, FeatureFieldRef], + Field(discriminator="path"), +] + + +PredicateValue = Union[str, int, float, bool, list[str], list[float], None] + + +class Predicate(BaseModel): + kind: Literal["predicate"] = "predicate" + field: FieldRef + op: FilterOperator + value: PredicateValue = None + + @model_validator(mode="before") + @classmethod + def _coerce_field(cls, data: Any) -> Any: + """Accept dotted-path strings for ``field`` and parse them into the typed form.""" + if isinstance(data, dict): + raw = data.get("field") + if isinstance(raw, str): + data = {**data, "field": parse_field_ref(raw)} + return data + + +class And(BaseModel): + kind: Literal["and"] = "and" + operands: list["FilterExpr"] = Field(min_length=2) +class Or(BaseModel): + kind: Literal["or"] = "or" + operands: list["FilterExpr"] = Field(min_length=2) + + +class Not(BaseModel): + kind: Literal["not"] = "not" + operand: "FilterExpr" + + +FilterExpr = Annotated[ + Union[And, Or, Not, Predicate], + Field(discriminator="kind"), +] + +# Resolve forward references +And.model_rebuild() +Or.model_rebuild() +Not.model_rebuild() + + +# Operators valid per column type for metadata/feature column validation. 
VALID_OPERATORS: dict[FieldType, set[FilterOperator]] = { - FieldType.TEXT: {FilterOperator.EQ, FilterOperator.CONTAINS}, - FieldType.URL: {FilterOperator.EQ, FilterOperator.CONTAINS}, - FieldType.NUMBER: {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - FieldType.DATE: {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - FieldType.BOOLEAN: {FilterOperator.EQ}, - FieldType.TERM: {FilterOperator.EQ}, + FieldType.TEXT: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + FieldType.URL: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + FieldType.TERM: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.NUMBER: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.DATE: { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + FieldType.BOOLEAN: {FilterOperator.EQ, FilterOperator.IS_NULL}, +} + +# Operators valid against raw JSON-schema primitive types (used for feature columns +# whose Column.json_type is a JSON Schema primitive rather than a semantic FieldType). +JSON_TYPE_OPERATORS: dict[str, set[FilterOperator]] = { + "string": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.IN, + FilterOperator.CONTAINS, + FilterOperator.IS_NULL, + }, + "number": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + "integer": { + FilterOperator.EQ, + FilterOperator.NEQ, + FilterOperator.GT, + FilterOperator.GTE, + FilterOperator.LT, + FilterOperator.LTE, + FilterOperator.IN, + FilterOperator.IS_NULL, + }, + "boolean": {FilterOperator.EQ, FilterOperator.IS_NULL}, + "array": {FilterOperator.EQ, FilterOperator.IS_NULL}, + "object": {FilterOperator.EQ, FilterOperator.IS_NULL}, } diff --git a/server/osa/domain/discovery/port/field_definition_reader.py b/server/osa/domain/discovery/port/field_definition_reader.py index b763c8a..5f4974e 100644 --- a/server/osa/domain/discovery/port/field_definition_reader.py +++ b/server/osa/domain/discovery/port/field_definition_reader.py @@ -6,6 +6,7 @@ if TYPE_CHECKING: from osa.domain.semantics.model.value import FieldType + from osa.domain.shared.model.srn import SchemaSRN class FieldDefinitionReader(Protocol): @@ -15,3 +16,10 @@ async def get_all_field_types(self) -> dict[str, FieldType]: Raises ValidationError if same field name has conflicting types across schemas. """ ... + + async def get_fields_for_schema(self, schema_srn: "SchemaSRN") -> dict[str, FieldType]: + """Return field_name -> FieldType for a specific schema's current major version. + + Falls back to an empty dict when the schema is unknown to the node. + """ + ... 
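
Taken together, refs.py and value.py let a request body carry an arbitrary boolean tree whose leaves are typed field references. A minimal sketch of parsing one (TypeAdapter is stock pydantic v2, not an API added by this patch; the field names metadata.species and features.alignment.score are invented for illustration):

from pydantic import TypeAdapter

from osa.domain.discovery.model.value import And, FilterExpr

# Hypothetical wire payload: "kind" discriminates the node type, and the dotted
# "field" strings are coerced by Predicate._coerce_field / parse_field_ref into
# typed MetadataFieldRef / FeatureFieldRef instances.
payload = {
    "kind": "and",
    "operands": [
        {"kind": "predicate", "field": "metadata.species", "op": "eq", "value": "mouse"},
        {
            "kind": "not",
            "operand": {
                "kind": "predicate",
                "field": "features.alignment.score",  # invented hook/column
                "op": "lt",
                "value": 0.5,
            },
        },
    ],
}

expr = TypeAdapter(FilterExpr).validate_python(payload)
assert isinstance(expr, And)
assert expr.operands[0].field.dotted() == "metadata.species"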
diff --git a/server/osa/domain/discovery/port/read_store.py b/server/osa/domain/discovery/port/read_store.py index 6ac054d..34faec6 100644 --- a/server/osa/domain/discovery/port/read_store.py +++ b/server/osa/domain/discovery/port/read_store.py @@ -1,4 +1,4 @@ -"""DiscoveryReadStore port — read-only access to records and feature data.""" +"""DiscoveryReadStore port — read-only access to records, features, metadata.""" from __future__ import annotations @@ -8,49 +8,43 @@ from osa.domain.discovery.model.value import ( FeatureCatalogEntry, FeatureRow, - Filter, + FilterExpr, RecordSummary, SortOrder, ) from osa.domain.semantics.model.value import FieldType - from osa.domain.shared.model.srn import RecordSRN + from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN class DiscoveryReadStore(Protocol): async def search_records( self, - filters: list[Filter], + filter_expr: "FilterExpr | None", + schema_srn: "SchemaSRN | None", + convention_srn: "ConventionSRN | None", text_fields: list[str], q: str | None, sort: str, - order: SortOrder, + order: "SortOrder", cursor: dict | None, limit: int, - field_types: dict[str, FieldType] | None = None, - ) -> list[RecordSummary]: - """Search and filter published records.""" + field_types: "dict[str, FieldType] | None" = None, + ) -> "list[RecordSummary]": + """Search published records with a compound filter.""" ... - async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: - """List all feature tables with column schemas and record counts.""" - ... - - async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry | None: - """Look up a single feature table's schema by hook name. + async def get_feature_catalog(self) -> "list[FeatureCatalogEntry]": ... - Returns None if the hook_name is not found. - """ - ... + async def get_feature_table_schema(self, hook_name: str) -> "FeatureCatalogEntry | None": ... async def search_features( self, hook_name: str, - filters: list[Filter], - record_srn: RecordSRN | None, + filter_expr: "FilterExpr | None", + schema_srn: "SchemaSRN | None", + record_srn: "RecordSRN | None", sort: str, - order: SortOrder, + order: "SortOrder", cursor: dict | None, limit: int, - ) -> list[FeatureRow]: - """Search and filter feature rows.""" - ... + ) -> "list[FeatureRow]": ... 
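
The port hands the validated tree to the adapter unchanged; the SQL compilation itself lives in adapter/discovery.py, which is too large to reproduce in this excerpt. A sketch of the recursive shape such a compiler takes, under assumptions: the resolve callback and the operator lowering below are illustrative, not the adapter's actual code.

from typing import Callable

import sqlalchemy as sa

from osa.domain.discovery.model.refs import FieldRef
from osa.domain.discovery.model.value import (
    And,
    FilterExpr,
    FilterOperator,
    Not,
    Or,
    Predicate,
)


def compile_expr(expr: FilterExpr, resolve: Callable[[FieldRef], sa.Column]) -> sa.ColumnElement:
    """Lower a FilterExpr into a SQLAlchemy boolean expression.

    ``resolve`` is an assumed callback that maps a typed FieldRef onto a column
    of metadata.<slug>_v<major> or features.<hook>; a real adapter must also
    emit the joins that make those columns reachable.
    """
    if isinstance(expr, And):
        return sa.and_(*(compile_expr(op, resolve) for op in expr.operands))
    if isinstance(expr, Or):
        return sa.or_(*(compile_expr(op, resolve) for op in expr.operands))
    if isinstance(expr, Not):
        return sa.not_(compile_expr(expr.operand, resolve))

    assert isinstance(expr, Predicate)
    col = resolve(expr.field)
    if expr.op is FilterOperator.IS_NULL:
        return col.is_(None)  # negation composes via a wrapping Not node
    if expr.op is FilterOperator.IN:
        return col.in_(expr.value or [])
    if expr.op is FilterOperator.CONTAINS:
        return col.ilike(f"%{expr.value}%")  # substring match; escaping elided
    comparators = {
        FilterOperator.EQ: col.__eq__,
        FilterOperator.NEQ: col.__ne__,
        FilterOperator.GT: col.__gt__,
        FilterOperator.GTE: col.__ge__,
        FilterOperator.LT: col.__lt__,
        FilterOperator.LTE: col.__le__,
    }
    return comparators[expr.op](expr.value)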
diff --git a/server/osa/domain/discovery/query/search_features.py b/server/osa/domain/discovery/query/search_features.py index 4019dcf..f5d8a61 100644 --- a/server/osa/domain/discovery/query/search_features.py +++ b/server/osa/domain/discovery/query/search_features.py @@ -2,19 +2,20 @@ from osa.domain.discovery.model.value import ( FeatureSearchResult, - Filter, + FilterExpr, SortOrder, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN from osa.domain.shared.query import Query, QueryHandler, Result class SearchFeatures(Query): hook_name: str - filters: list[Filter] = [] + filter_expr: FilterExpr | None = None + schema_srn: SchemaSRN | None = None record_srn: str | None = None sort: str = "id" order: SortOrder = SortOrder.DESC @@ -41,7 +42,8 @@ async def run(self, cmd: SearchFeatures) -> SearchFeaturesResult: raise ValidationError(str(exc), field="record_srn") from exc result: FeatureSearchResult = await self.discovery_service.search_features( hook_name=cmd.hook_name, - filters=cmd.filters, + filter_expr=cmd.filter_expr, + schema_srn=cmd.schema_srn, record_srn=record_srn, sort=cmd.sort, order=cmd.order, diff --git a/server/osa/domain/discovery/query/search_records.py b/server/osa/domain/discovery/query/search_records.py index eed8957..da27009 100644 --- a/server/osa/domain/discovery/query/search_records.py +++ b/server/osa/domain/discovery/query/search_records.py @@ -1,17 +1,22 @@ """SearchRecords query — search and filter published records.""" +from typing import Any + from osa.domain.discovery.model.value import ( - Filter, + FilterExpr, RecordSearchResult, SortOrder, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public +from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN from osa.domain.shared.query import Query, QueryHandler, Result class SearchRecords(Query): - filters: list[Filter] = [] + filter_expr: FilterExpr | None = None + schema_srn: SchemaSRN | None = None + convention_srn: ConventionSRN | None = None q: str | None = None sort: str = "published_at" order: SortOrder = SortOrder.DESC @@ -20,7 +25,7 @@ class SearchRecords(Query): class SearchRecordsResult(Result): - results: list[dict] + results: list[dict[str, Any]] cursor: str | None has_more: bool @@ -31,7 +36,9 @@ class SearchRecordsHandler(QueryHandler[SearchRecords, SearchRecordsResult]): async def run(self, cmd: SearchRecords) -> SearchRecordsResult: result: RecordSearchResult = await self.discovery_service.search_records( - filters=cmd.filters, + filter_expr=cmd.filter_expr, + schema_srn=cmd.schema_srn, + convention_srn=cmd.convention_srn, q=cmd.q, sort=cmd.sort, order=cmd.order, diff --git a/server/osa/domain/discovery/service/discovery.py b/server/osa/domain/discovery/service/discovery.py index 5d9bab0..1e8f850 100644 --- a/server/osa/domain/discovery/service/discovery.py +++ b/server/osa/domain/discovery/service/discovery.py @@ -1,15 +1,30 @@ -"""DiscoveryService — read-only business logic for record and feature search.""" +"""DiscoveryService — read-only business logic for record and feature search. + +Validates the compound ``FilterExpr`` tree (bounds, field resolution, operator +compatibility) before handing it to the read store for SQL compilation. 
+""" from __future__ import annotations import logging +from typing import Any +from osa.config import Config +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, +) from osa.domain.discovery.model.value import ( + JSON_TYPE_OPERATORS, VALID_OPERATORS, + And, FeatureCatalog, FeatureSearchResult, - Filter, + FilterExpr, FilterOperator, + Not, + Or, + Predicate, RecordSearchResult, SortOrder, decode_cursor, @@ -19,7 +34,7 @@ from osa.domain.discovery.port.read_store import DiscoveryReadStore from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN from osa.domain.shared.service import Service logger = logging.getLogger(__name__) @@ -30,56 +45,59 @@ class DiscoveryService(Service): read_store: DiscoveryReadStore field_reader: FieldDefinitionReader + config: Config async def search_records( self, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_srn: SchemaSRN | None, + convention_srn: ConventionSRN | None, q: str | None, sort: str, order: SortOrder, cursor: str | None, limit: int, + *, + allow_compound: bool = True, ) -> RecordSearchResult: - """Validate inputs and delegate record search to the read store.""" + """Validate the filter tree and delegate record search to the read store. + + ``allow_compound`` is a staged flag — US1 delivers AND-only + Predicate + support; US2 flips this to allow OR/NOT. Callers should leave it True + once US2 lands. + """ if limit < 1 or limit > 100: raise ValidationError("limit must be between 1 and 100", field="limit") - field_map = await self.field_reader.get_all_field_types() + schema_field_map: dict[str, FieldType] = {} + if schema_srn is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_srn) - # Validate filter fields and operators - for f in filters: - if f.field not in field_map: - raise ValidationError( - f"Unknown field '{f.field}': not defined in any registered schema", - field=f.field, - ) - field_type = field_map[f.field] - valid_ops = VALID_OPERATORS[field_type] - if f.operator not in valid_ops: - raise ValidationError( - f"Operator '{f.operator}' is not valid for field '{f.field}' " - f"(type '{field_type}'). 
Valid: {sorted(valid_ops)}", - field=f.field, - ) + global_field_map = await self.field_reader.get_all_field_types() + effective_field_map = schema_field_map or global_field_map + + if filter_expr is not None: + self._validate_tree(filter_expr, allow_compound=allow_compound) + await self._validate_refs(filter_expr, schema_srn, effective_field_map) - # Validate sort field - if sort != "published_at" and sort not in field_map: + # Sort field validation + if sort != "published_at" and sort not in effective_field_map: raise ValidationError( - f"Unknown sort field '{sort}': not defined in any registered schema", + f"Unknown sort field '{sort}': not defined in registered schema", field="sort", ) - # Decode cursor - decoded_cursor = None + decoded_cursor: dict[str, Any] | None = None if cursor is not None: try: decoded_cursor = decode_cursor(cursor) except ValueError as exc: raise ValidationError(str(exc), field="cursor") from exc - # Identify text-searchable fields for free-text q text_fields = [ - name for name, ft in field_map.items() if ft in (FieldType.TEXT, FieldType.URL) + name + for name, ft in effective_field_map.items() + if ft in (FieldType.TEXT, FieldType.URL) ] if q and not text_fields: raise ValidationError( @@ -88,14 +106,16 @@ async def search_records( ) results = await self.read_store.search_records( - filters=filters, + filter_expr=filter_expr, + schema_srn=schema_srn, + convention_srn=convention_srn, text_fields=text_fields, q=q, sort=sort, order=order, cursor=decoded_cursor, limit=limit + 1, - field_types=field_map, + field_types=effective_field_map, ) has_more = len(results) > limit @@ -116,68 +136,51 @@ async def search_records( ) async def get_feature_catalog(self) -> FeatureCatalog: - """Delegate feature catalog listing to the read store.""" entries = await self.read_store.get_feature_catalog() return FeatureCatalog(tables=entries) async def search_features( self, hook_name: str, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_srn: SchemaSRN | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, cursor: str | None, limit: int, + *, + allow_compound: bool = True, ) -> FeatureSearchResult: - """Validate inputs and delegate feature search to the read store.""" if limit < 1 or limit > 100: raise ValidationError("limit must be between 1 and 100", field="limit") - # Look up the feature table schema entry = await self.read_store.get_feature_table_schema(hook_name) if entry is None: raise NotFoundError(f"Feature table not found: {hook_name}") - # Build column type map from catalog schema col_map: dict[str, str] = {col.name: col.type for col in entry.columns} - # Also allow sort/filter on record_srn col_map["record_srn"] = "string" - # Map JSON types to FieldType equivalents for operator validation - json_type_to_ops: dict[str, set[FilterOperator]] = { - "string": {FilterOperator.EQ, FilterOperator.CONTAINS}, - "number": {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - "integer": {FilterOperator.EQ, FilterOperator.GTE, FilterOperator.LTE}, - "boolean": {FilterOperator.EQ}, - "array": {FilterOperator.EQ}, - "object": {FilterOperator.EQ}, - } - - # Validate filters - for f in filters: - if f.field not in col_map: - raise ValidationError( - f"Unknown column '{f.field}' in feature table '{hook_name}'", - field=f.field, - ) - json_type = col_map[f.field] - valid_ops = json_type_to_ops.get(json_type, {FilterOperator.EQ}) - if f.operator not in valid_ops: - raise ValidationError( - f"Operator '{f.operator}' is not valid for column '{f.field}' 
" - f"(type '{json_type}'). Valid: {sorted(valid_ops)}", - field=f.field, - ) + schema_field_map: dict[str, FieldType] = {} + if schema_srn is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_srn) + + if filter_expr is not None: + self._validate_tree(filter_expr, allow_compound=allow_compound) + self._validate_feature_refs( + filter_expr, + this_hook=hook_name, + feature_col_map=col_map, + schema_field_map=schema_field_map, + ) - # Validate sort column if sort != "id" and sort not in col_map: raise ValidationError( f"Unknown sort column '{sort}' in feature table '{hook_name}'", field="sort", ) - # Decode cursor try: decoded_cursor = decode_cursor(cursor) if cursor else None except ValueError as exc: @@ -185,7 +188,8 @@ async def search_features( rows = await self.read_store.search_features( hook_name=hook_name, - filters=filters, + filter_expr=filter_expr, + schema_srn=schema_srn, record_srn=record_srn, sort=sort, order=order, @@ -204,8 +208,190 @@ async def search_features( sort_val = last.data.get(sort) next_cursor = encode_cursor(sort_val, last.row_id) - return FeatureSearchResult( - rows=rows, - cursor=next_cursor, - has_more=has_more, - ) + return FeatureSearchResult(rows=rows, cursor=next_cursor, has_more=has_more) + + # ------------------------- internal helpers ------------------------- + + def _validate_tree(self, expr: FilterExpr, *, allow_compound: bool) -> None: + """Enforce tree bounds (depth, predicate count, joins) + compound gating.""" + depth = _tree_depth(expr) + predicates = list(_iter_predicates(expr)) + + if depth > self.config.discovery_max_filter_depth: + raise ValidationError( + f"Filter tree depth {depth} exceeds configured maximum " + f"{self.config.discovery_max_filter_depth} (OSA_DISCOVERY_MAX_FILTER_DEPTH).", + field="filter", + code="filter_depth_exceeded", + ) + if len(predicates) > self.config.discovery_max_predicates: + raise ValidationError( + f"Filter tree has {len(predicates)} predicate leaves, exceeds " + f"configured maximum {self.config.discovery_max_predicates} " + "(OSA_DISCOVERY_MAX_PREDICATES).", + field="filter", + code="filter_predicates_exceeded", + ) + + distinct_hooks: set[str] = set() + for p in predicates: + if isinstance(p.field, FeatureFieldRef): + distinct_hooks.add(p.field.hook) + if len(distinct_hooks) > self.config.discovery_max_cross_domain_joins: + raise ValidationError( + f"Filter tree joins {len(distinct_hooks)} distinct feature hooks, " + f"exceeds configured maximum " + f"{self.config.discovery_max_cross_domain_joins} " + "(OSA_DISCOVERY_MAX_CROSS_DOMAIN_JOINS).", + field="filter", + code="filter_joins_exceeded", + ) + + if not allow_compound: + for node in _iter_nodes(expr): + if isinstance(node, (Or, Not)): + raise ValidationError( + "Compound OR/NOT filters are not enabled in this build.", + field="filter", + code="compound_disabled", + ) + + async def _validate_refs( + self, + expr: FilterExpr, + schema_srn: SchemaSRN | None, + field_map: dict[str, FieldType], + ) -> None: + """Resolve each predicate's field and check operator compatibility.""" + feature_catalog: dict[str, dict[str, str]] | None = None + for p in _iter_predicates(expr): + if isinstance(p.field, MetadataFieldRef): + if schema_srn is None and not field_map: + raise ValidationError( + f"Unknown metadata field '{p.field.field}': " + "no schema_srn provided and no registered schemas.", + field=p.field.dotted(), + code="unknown_field", + ) + field_name = p.field.field + if field_name not in field_map: + raise ValidationError( + 
f"Unknown metadata field '{field_name}' for the provided schema.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_field_type( + p, field_type=field_map[field_name], path=p.field.dotted() + ) + elif isinstance(p.field, FeatureFieldRef): + if feature_catalog is None: + feature_catalog = await self._load_feature_catalog() + cols = feature_catalog.get(p.field.hook) + if cols is None: + raise ValidationError( + f"Unknown feature hook '{p.field.hook}'.", + field=p.field.dotted(), + code="unknown_hook", + ) + if p.field.column not in cols: + raise ValidationError( + f"Unknown feature column '{p.field.column}' on hook '{p.field.hook}'.", + field=p.field.dotted(), + code="unknown_field", + ) + json_type = cols[p.field.column] + self._check_operator_for_json_type(p, json_type=json_type, path=p.field.dotted()) + + def _validate_feature_refs( + self, + expr: FilterExpr, + *, + this_hook: str, + feature_col_map: dict[str, str], + schema_field_map: dict[str, FieldType], + ) -> None: + """Variant of ref validation for feature search — local hook columns by default.""" + for p in _iter_predicates(expr): + if isinstance(p.field, MetadataFieldRef): + if p.field.field not in schema_field_map: + raise ValidationError( + f"Unknown metadata field '{p.field.field}' for the provided schema.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_field_type( + p, field_type=schema_field_map[p.field.field], path=p.field.dotted() + ) + elif isinstance(p.field, FeatureFieldRef): + if p.field.hook != this_hook: + # Cross-hook joins handled by US3 — accepted here, resolved in adapter. + continue + if p.field.column not in feature_col_map: + raise ValidationError( + f"Unknown feature column '{p.field.column}' on hook '{this_hook}'.", + field=p.field.dotted(), + code="unknown_field", + ) + self._check_operator_for_json_type( + p, json_type=feature_col_map[p.field.column], path=p.field.dotted() + ) + + async def _load_feature_catalog(self) -> dict[str, dict[str, str]]: + """Build hook_name → {column_name → json_type} map from the catalog.""" + catalog = await self.read_store.get_feature_catalog() + return {entry.hook_name: {col.name: col.type for col in entry.columns} for entry in catalog} + + @staticmethod + def _check_operator_for_field_type( + predicate: Predicate, *, field_type: FieldType, path: str + ) -> None: + valid = VALID_OPERATORS.get(field_type, set()) + if predicate.op not in valid: + raise ValidationError( + f"Operator '{predicate.op}' is not valid for field '{path}' " + f"(type '{field_type}'). Valid: {sorted(valid)}.", + field=path, + code="operator_not_valid_for_type", + ) + + @staticmethod + def _check_operator_for_json_type(predicate: Predicate, *, json_type: str, path: str) -> None: + valid = JSON_TYPE_OPERATORS.get(json_type, {FilterOperator.EQ}) + if predicate.op not in valid: + raise ValidationError( + f"Operator '{predicate.op}' is not valid for column '{path}' " + f"(json type '{json_type}'). 
Valid: {sorted(valid)}.", + field=path, + code="operator_not_valid_for_type", + ) + + +def _tree_depth(expr: FilterExpr) -> int: + if isinstance(expr, Predicate): + return 1 + if isinstance(expr, Not): + return 1 + _tree_depth(expr.operand) + if isinstance(expr, (And, Or)): + return 1 + max(_tree_depth(op) for op in expr.operands) + return 1 + + +def _iter_predicates(expr: FilterExpr): + if isinstance(expr, Predicate): + yield expr + return + if isinstance(expr, Not): + yield from _iter_predicates(expr.operand) + return + if isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_predicates(op) + + +def _iter_nodes(expr: FilterExpr): + yield expr + if isinstance(expr, Not): + yield from _iter_nodes(expr.operand) + elif isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_nodes(op) diff --git a/server/osa/domain/discovery/util/di/provider.py b/server/osa/domain/discovery/util/di/provider.py index 325367f..9715b8c 100644 --- a/server/osa/domain/discovery/util/di/provider.py +++ b/server/osa/domain/discovery/util/di/provider.py @@ -2,6 +2,7 @@ from dishka import provide +from osa.config import Config from osa.domain.discovery.port.field_definition_reader import FieldDefinitionReader from osa.domain.discovery.port.read_store import DiscoveryReadStore from osa.domain.discovery.query.get_feature_catalog import GetFeatureCatalogHandler @@ -18,8 +19,13 @@ def get_discovery_service( self, read_store: DiscoveryReadStore, field_reader: FieldDefinitionReader, + config: Config, ) -> DiscoveryService: - return DiscoveryService(read_store=read_store, field_reader=field_reader) + return DiscoveryService( + read_store=read_store, + field_reader=field_reader, + config=config, + ) # Query Handlers search_records_handler = provide(SearchRecordsHandler, scope=Scope.UOW) diff --git a/server/osa/domain/feature/event/__init__.py b/server/osa/domain/feature/event/__init__.py index 32ca6a4..8112dda 100644 --- a/server/osa/domain/feature/event/__init__.py +++ b/server/osa/domain/feature/event/__init__.py @@ -1,5 +1,3 @@ """Feature domain events.""" -from osa.domain.feature.event.convention_ready import ConventionReady - -__all__ = ["ConventionReady"] +__all__: list[str] = [] diff --git a/server/osa/domain/feature/event/convention_ready.py b/server/osa/domain/feature/event/convention_ready.py deleted file mode 100644 index 42b627e..0000000 --- a/server/osa/domain/feature/event/convention_ready.py +++ /dev/null @@ -1,14 +0,0 @@ -"""ConventionReady event — emitted after feature tables are created for a convention.""" - -from osa.domain.shared.event import Event, EventId -from osa.domain.shared.model.srn import ConventionSRN - - -class ConventionReady(Event): - """Emitted when feature tables have been created for a convention. - - Downstream handlers react to this knowing that feature tables are ready. 
- """ - - id: EventId - convention_srn: ConventionSRN diff --git a/server/osa/domain/feature/handler/create_feature_tables.py b/server/osa/domain/feature/handler/create_feature_tables.py index 2dd815b..0ad16a4 100644 --- a/server/osa/domain/feature/handler/create_feature_tables.py +++ b/server/osa/domain/feature/handler/create_feature_tables.py @@ -1,27 +1,24 @@ """CreateFeatureTables — creates feature tables when a convention is registered.""" import logging -from uuid import uuid4 from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.feature.event.convention_ready import ConventionReady from osa.domain.feature.service.feature import FeatureService from osa.domain.shared.error import ConflictError -from osa.domain.shared.event import EventHandler, EventId -from osa.domain.shared.outbox import Outbox +from osa.domain.shared.event import EventHandler logger = logging.getLogger(__name__) class CreateFeatureTables(EventHandler[ConventionRegistered]): - """Creates feature tables for each hook and emits ConventionReady. + """Creates feature tables for each hook declared on a registered convention. - Part of the convention initialization chain: - ConventionRegistered → CreateFeatureTables → ConventionReady + Readiness is not signalled via a follow-on event — consumers check the + ``feature_tables`` + ``metadata_tables`` catalogs at read time instead + (research.md §11). """ feature_service: FeatureService - outbox: Outbox async def handle(self, event: ConventionRegistered) -> None: for hook in event.hooks: @@ -38,11 +35,3 @@ async def handle(self, event: ConventionRegistered) -> None: hook.name, event.convention_srn, ) - - await self.outbox.append( - ConventionReady( - id=EventId(uuid4()), - convention_srn=event.convention_srn, - ) - ) - logger.info("Convention ready: %s", event.convention_srn) diff --git a/server/osa/domain/metadata/__init__.py b/server/osa/domain/metadata/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/event/__init__.py b/server/osa/domain/metadata/event/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/handler/__init__.py b/server/osa/domain/metadata/handler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/handler/ensure_metadata_table.py b/server/osa/domain/metadata/handler/ensure_metadata_table.py new file mode 100644 index 0000000..b8214ba --- /dev/null +++ b/server/osa/domain/metadata/handler/ensure_metadata_table.py @@ -0,0 +1,50 @@ +"""EnsureMetadataTable — creates/evolves the typed metadata table on ConventionRegistered.""" + +from __future__ import annotations + +import logging + +from osa.domain.deposition.event.convention_registered import ConventionRegistered +from osa.domain.deposition.port.convention_repository import ConventionRepository +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.port.schema_repository import SchemaRepository +from osa.domain.shared.error import DomainError, NotFoundError +from osa.domain.shared.event import EventHandler + +logger = logging.getLogger(__name__) + + +class EnsureMetadataTable(EventHandler[ConventionRegistered]): + """Reacts to ConventionRegistered, creates/evolves the schema's metadata table. + + Idempotent and schema-keyed: two conventions against the same + ``(schema_identity, schema_major)`` share one table. Additive minor/patch + bumps trigger ALTER ADD COLUMN. 
+ """ + + metadata_service: MetadataService + schema_repo: SchemaRepository + convention_repo: ConventionRepository + + async def handle(self, event: ConventionRegistered) -> None: + convention = await self.convention_repo.get(event.convention_srn) + if convention is None: + raise NotFoundError(f"Convention not found: {event.convention_srn}") + + schema = await self.schema_repo.get(event.schema_srn) + if schema is None: + raise NotFoundError(f"Schema not found: {event.schema_srn}") + + try: + await self.metadata_service.ensure_table( + schema_srn=event.schema_srn, + schema_title=schema.title, + fields=event.schema_fields, + ) + except DomainError: + logger.exception( + "EnsureMetadataTable failed: convention=%s schema=%s", + event.convention_srn, + event.schema_srn, + ) + raise diff --git a/server/osa/domain/metadata/handler/insert_record_metadata.py b/server/osa/domain/metadata/handler/insert_record_metadata.py new file mode 100644 index 0000000..4ab1558 --- /dev/null +++ b/server/osa/domain/metadata/handler/insert_record_metadata.py @@ -0,0 +1,29 @@ +"""InsertRecordMetadata — writes a record's typed metadata row on RecordPublished.""" + +from __future__ import annotations + +import logging + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.record.event.record_published import RecordPublished +from osa.domain.shared.event import EventHandler + +logger = logging.getLogger(__name__) + + +class InsertRecordMetadata(EventHandler[RecordPublished]): + """Reacts to RecordPublished, inserts a typed metadata row for the record.""" + + metadata_service: MetadataService + + async def handle(self, event: RecordPublished) -> None: + await self.metadata_service.insert( + schema_srn=event.schema_srn, + record_srn=event.record_srn, + values=event.metadata, + ) + logger.debug( + "Inserted metadata row: record=%s schema=%s", + event.record_srn, + event.schema_srn, + ) diff --git a/server/osa/domain/metadata/model/__init__.py b/server/osa/domain/metadata/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/model/value.py b/server/osa/domain/metadata/model/value.py new file mode 100644 index 0000000..d220ab1 --- /dev/null +++ b/server/osa/domain/metadata/model/value.py @@ -0,0 +1,16 @@ +"""Metadata domain value objects — MetadataSchema, slug helpers.""" + +from __future__ import annotations + +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.value import ValueObject + + +class MetadataSchema(ValueObject): + """Typed projection of a Schema into dynamic-column form. + + Mirrors :class:`FeatureSchema` — serialised into the catalog row's + ``metadata_schema`` JSONB column and rehydrated on subsequent reads. 
+ """ + + columns: list[ColumnDef] = [] diff --git a/server/osa/domain/metadata/port/__init__.py b/server/osa/domain/metadata/port/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/port/metadata_store.py b/server/osa/domain/metadata/port/metadata_store.py new file mode 100644 index 0000000..64899a2 --- /dev/null +++ b/server/osa/domain/metadata/port/metadata_store.py @@ -0,0 +1,40 @@ +"""MetadataStore port — DDL + DML for typed per-schema metadata tables.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Protocol + +if TYPE_CHECKING: + from osa.domain.semantics.model.value import FieldDefinition + from osa.domain.shared.model.srn import RecordSRN, SchemaSRN + + +class MetadataStore(Protocol): + """Port owned by the metadata domain. + + Implementations are responsible for: + - Creating the ``metadata._v`` table on first + registration for a (schema_identity, major) pair. + - Additively ALTER ADD COLUMN when the schema bumps (minor/patch) with + new optional fields. + - Appending SRN lineage into the catalog's ``schema_versions`` list. + - Idempotent UPSERT of a row keyed on ``record_srn``. + """ + + async def ensure_table( + self, + schema_srn: "SchemaSRN", + schema_title: str, + fields: "list[FieldDefinition]", + ) -> None: + """Create or additively evolve the typed metadata table for a schema.""" + ... + + async def insert( + self, + schema_srn: "SchemaSRN", + record_srn: "RecordSRN", + values: dict[str, Any], + ) -> None: + """Upsert a record's typed metadata row into the schema's table.""" + ... diff --git a/server/osa/domain/metadata/service/__init__.py b/server/osa/domain/metadata/service/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/service/metadata.py b/server/osa/domain/metadata/service/metadata.py new file mode 100644 index 0000000..a639c1b --- /dev/null +++ b/server/osa/domain/metadata/service/metadata.py @@ -0,0 +1,32 @@ +"""MetadataService — thin delegator over the MetadataStore port.""" + +from __future__ import annotations + +from typing import Any + +from osa.domain.metadata.port.metadata_store import MetadataStore +from osa.domain.semantics.model.value import FieldDefinition +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.domain.shared.service import Service + + +class MetadataService(Service): + """Creates/evolves typed metadata tables and inserts record metadata.""" + + metadata_store: MetadataStore + + async def ensure_table( + self, + schema_srn: SchemaSRN, + schema_title: str, + fields: list[FieldDefinition], + ) -> None: + await self.metadata_store.ensure_table(schema_srn, schema_title, fields) + + async def insert( + self, + schema_srn: SchemaSRN, + record_srn: RecordSRN, + values: dict[str, Any], + ) -> None: + await self.metadata_store.insert(schema_srn, record_srn, values) diff --git a/server/osa/domain/metadata/util/__init__.py b/server/osa/domain/metadata/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/osa/domain/metadata/util/di/__init__.py b/server/osa/domain/metadata/util/di/__init__.py new file mode 100644 index 0000000..1013d4d --- /dev/null +++ b/server/osa/domain/metadata/util/di/__init__.py @@ -0,0 +1,3 @@ +from osa.domain.metadata.util.di.provider import MetadataProvider + +__all__ = ["MetadataProvider"] diff --git a/server/osa/domain/metadata/util/di/provider.py b/server/osa/domain/metadata/util/di/provider.py new file mode 100644 index 0000000..379f23b --- /dev/null 
+++ b/server/osa/domain/metadata/util/di/provider.py @@ -0,0 +1,11 @@ +"""DI provider for the metadata bounded context.""" + +from dishka import provide + +from osa.domain.metadata.service.metadata import MetadataService +from osa.util.di.base import Provider +from osa.util.di.scope import Scope + + +class MetadataProvider(Provider): + service = provide(MetadataService, scope=Scope.UOW) diff --git a/server/osa/domain/record/event/record_published.py b/server/osa/domain/record/event/record_published.py index 2c6dc62..b2a8f34 100644 --- a/server/osa/domain/record/event/record_published.py +++ b/server/osa/domain/record/event/record_published.py @@ -4,20 +4,21 @@ from osa.domain.shared.event import Event, EventId from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN class RecordPublished(Event): """Emitted when a record is published and ready for indexing. - Enriched with source, convention_srn, and expected_features so downstream - consumers (feature insertion, indexing) can operate without querying - record/convention repositories. + Enriched with source, convention_srn, schema_srn, and expected_features so + downstream consumers (metadata insertion, feature insertion, indexing) can + operate without querying record/convention repositories. """ id: EventId record_srn: RecordSRN source: RecordSource convention_srn: ConventionSRN + schema_srn: SchemaSRN metadata: dict[str, Any] expected_features: list[str] = [] diff --git a/server/osa/domain/record/model/aggregate.py b/server/osa/domain/record/model/aggregate.py index 69e5575..0e797d5 100644 --- a/server/osa/domain/record/model/aggregate.py +++ b/server/osa/domain/record/model/aggregate.py @@ -3,9 +3,11 @@ from datetime import datetime from typing import Any +from pydantic import Field + from osa.domain.shared.model.aggregate import Aggregate from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN class Record(Aggregate): @@ -14,5 +16,6 @@ class Record(Aggregate): srn: RecordSRN source: RecordSource convention_srn: ConventionSRN + schema_srn: SchemaSRN = Field(frozen=True) metadata: dict[str, Any] published_at: datetime diff --git a/server/osa/domain/record/service/record.py b/server/osa/domain/record/service/record.py index e2409ce..7dbbf6c 100644 --- a/server/osa/domain/record/service/record.py +++ b/server/osa/domain/record/service/record.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any from uuid import uuid4 +from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.record.event.record_published import RecordPublished from osa.domain.record.model.aggregate import Record from osa.domain.record.model.draft import RecordDraft @@ -14,10 +15,12 @@ from osa.domain.shared.error import NotFoundError from osa.domain.shared.event import EventId from osa.domain.shared.model.srn import ( + ConventionSRN, Domain, LocalId, RecordSRN, RecordVersion, + SchemaSRN, ) from osa.domain.shared.outbox import Outbox from osa.domain.shared.service import Service @@ -32,6 +35,7 @@ class RecordService(Service): """Creates and persists Record aggregates from any source.""" record_repo: RecordRepository + convention_repo: ConventionRepository outbox: Outbox node_domain: Domain feature_reader: FeatureReader @@ -49,6 +53,13 @@ async def 
get(self, srn: RecordSRN) -> Record: raise NotFoundError(f"Record not found: {srn}") return record + async def _resolve_schema_srn(self, convention_srn: ConventionSRN) -> SchemaSRN: + """Resolve a convention to its schema SRN at publication time.""" + convention = await self.convention_repo.get(convention_srn) + if convention is None: + raise NotFoundError(f"Convention not found: {convention_srn}") + return convention.schema_srn + async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: """Bulk-publish records from an ingest batch. @@ -59,8 +70,15 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: if not drafts: return [] + # All drafts in a batch target the same convention (caller contract); + # resolve schema_srn once. + schema_srn_by_conv: dict[str, SchemaSRN] = {} + records: list[Record] = [] for draft in drafts: + key = str(draft.convention_srn) + if key not in schema_srn_by_conv: + schema_srn_by_conv[key] = await self._resolve_schema_srn(draft.convention_srn) record_srn = RecordSRN( domain=self.node_domain, id=LocalId(str(uuid4())), @@ -71,6 +89,7 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_srn=schema_srn_by_conv[key], metadata=draft.metadata, published_at=datetime.now(UTC), ) @@ -83,6 +102,8 @@ async def publish_record(self, draft: RecordDraft) -> Record: """Create and persist a Record from a draft.""" logger.info(f"Creating record from {draft.source.type} source: {draft.source.id}") + schema_srn = await self._resolve_schema_srn(draft.convention_srn) + record_srn = RecordSRN( domain=self.node_domain, id=LocalId(str(uuid4())), @@ -93,6 +114,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_srn=schema_srn, metadata=draft.metadata, published_at=datetime.now(UTC), ) @@ -105,6 +127,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: record_srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, + schema_srn=schema_srn, metadata=draft.metadata, expected_features=draft.expected_features, ) diff --git a/server/osa/domain/shared/error.py b/server/osa/domain/shared/error.py index a26c0d6..f03951b 100644 --- a/server/osa/domain/shared/error.py +++ b/server/osa/domain/shared/error.py @@ -34,8 +34,13 @@ class NotFoundError(DomainError): class ValidationError(DomainError): """Input validation failed.""" - def __init__(self, message: str, field: str | None = None) -> None: - super().__init__(message, code="VALIDATION_ERROR") + def __init__( + self, + message: str, + field: str | None = None, + code: str | None = None, + ) -> None: + super().__init__(message, code=code or "VALIDATION_ERROR") self.field = field diff --git a/server/osa/domain/shared/model/hook.py b/server/osa/domain/shared/model/hook.py index 346ba34..49a9ab7 100644 --- a/server/osa/domain/shared/model/hook.py +++ b/server/osa/domain/shared/model/hook.py @@ -57,7 +57,7 @@ def _format_memory(byte_count: int) -> str: class ColumnDef(ValueObject): - """Definition of a single column in a feature table.""" + """Definition of a single column in a feature or metadata table.""" name: PgIdentifier json_type: Literal["string", "number", "integer", "boolean", "array", "object"] diff --git a/server/osa/infrastructure/event/di.py b/server/osa/infrastructure/event/di.py index fb665c4..be7142e 100644 --- a/server/osa/infrastructure/event/di.py +++ 
b/server/osa/infrastructure/event/di.py @@ -14,6 +14,8 @@ InsertRecordFeatures, ) from osa.domain.ingest.handler import PublishBatch, RunHooks, RunIngester +from osa.domain.metadata.handler.ensure_metadata_table import EnsureMetadataTable +from osa.domain.metadata.handler.insert_record_metadata import InsertRecordMetadata from osa.domain.record.handler import ConvertDepositionToRecord from osa.domain.shared.event import EventHandler from osa.domain.shared.event_log import EventLog @@ -37,6 +39,9 @@ CreateFeatureTables, InsertRecordFeatures, InsertBatchFeatures, + # Metadata handlers (feature 076) + EnsureMetadataTable, + InsertRecordMetadata, # Ingest handlers RunIngester, RunHooks, diff --git a/server/osa/infrastructure/persistence/adapter/discovery.py b/server/osa/infrastructure/persistence/adapter/discovery.py index 23af2bc..814504f 100644 --- a/server/osa/infrastructure/persistence/adapter/discovery.py +++ b/server/osa/infrastructure/persistence/adapter/discovery.py @@ -3,6 +3,8 @@ from __future__ import annotations import logging +from collections.abc import Callable +from datetime import date, datetime from typing import Any from sqlalchemy import ( @@ -13,6 +15,7 @@ cast, func, literal, + not_, or_, select, true, @@ -21,26 +24,37 @@ from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.ext.asyncio import AsyncSession +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef from osa.domain.discovery.model.value import ( + And, ColumnInfo, FeatureCatalogEntry, FeatureRow, - Filter, + FilterExpr, FilterOperator, + Not, + Or, + Predicate, RecordSummary, SortOrder, ) from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN from osa.infrastructure.persistence.feature_table import ( FeatureSchema, build_feature_table, data_columns, ) from osa.infrastructure.persistence.keyset import KeysetPage, SortKey +from osa.infrastructure.persistence.metadata_table import ( + MetadataSchema, + build_metadata_table, +) from osa.infrastructure.persistence.tables import ( feature_tables_table, + metadata_tables_table, records_table, schemas_table, ) @@ -53,13 +67,57 @@ def _escape_like(value: str) -> str: return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") -def _to_column_info(schema: FeatureSchema) -> list[ColumnInfo]: - """Map typed FeatureSchema columns to API-facing ColumnInfo list.""" - return [ColumnInfo(name=c.name, type=c.json_type, required=c.required) for c in schema.columns] +# Cursor-value coercers — cursor payloads round-trip through base64 JSON as +# plain strings/numbers, but keyset predicates compare against typed columns. +# Without this, ``published_at < 'iso-string'::VARCHAR`` fails on Postgres. 
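+#
+# An illustrative round-trip of the failure mode (values hypothetical; the
+# real encode/decode helpers live in osa.domain.discovery.model.value):
+#
+#     >>> import base64, json
+#     >>> from datetime import UTC, datetime
+#     >>> payload = {"s": datetime(2026, 4, 7, 9, 0, tzinfo=UTC).isoformat(), "id": "r1"}
+#     >>> cursor = base64.urlsafe_b64encode(json.dumps(payload).encode()).decode()
+#     >>> decoded = json.loads(base64.urlsafe_b64decode(cursor))
+#     >>> decoded["s"]                      # typing lost: plain str
+#     '2026-04-07T09:00:00+00:00'
+#     >>> _coerce_datetime(decoded["s"])    # restored before the keyset bind
+#     datetime.datetime(2026, 4, 7, 9, 0, tzinfo=datetime.timezone.utc)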
+ +CursorCoercer = Callable[[Any], Any] + + +def _coerce_identity(value: Any) -> Any: + return value + + +def _coerce_datetime(value: Any) -> Any: + if isinstance(value, str): + return datetime.fromisoformat(value) + return value + + +def _coerce_date(value: Any) -> Any: + if isinstance(value, str): + return date.fromisoformat(value) + return value + + +def _coerce_float(value: Any) -> Any: + return None if value is None else float(value) + + +def _coerce_int(value: Any) -> Any: + return None if value is None else int(value) + + +def _coercer_for_column(col_def: ColumnDef) -> CursorCoercer: + """Pick a coercer matching the Postgres type chosen by ``column_mapper``.""" + if col_def.json_type == "number": + return _coerce_float + if col_def.json_type == "integer": + return _coerce_int + if col_def.json_type == "string": + if col_def.format == "date-time": + return _coerce_datetime + if col_def.format == "date": + return _coerce_date + return _coerce_identity + + +def _to_column_info(columns: list[Any]) -> list[ColumnInfo]: + return [ColumnInfo(name=c.name, type=c.json_type, required=c.required) for c in columns] class PostgresFieldDefinitionReader: - """Builds a global field_name -> FieldType map from all registered schemas.""" + """Builds field name → FieldType maps from registered schemas.""" def __init__(self, session: AsyncSession) -> None: self.session = session @@ -84,16 +142,27 @@ async def get_all_field_types(self) -> dict[str, FieldType]: return field_map + async def get_fields_for_schema(self, schema_srn: SchemaSRN) -> dict[str, FieldType]: + rendered = str(schema_srn) + stmt = select(schemas_table.c.fields).where(schemas_table.c.srn == rendered) + result = await self.session.execute(stmt) + row = result.mappings().first() + if row is None: + return {} + return {f["name"]: FieldType(f["type"]) for f in row["fields"]} + class PostgresDiscoveryReadStore: - """Direct SQL queries against records and feature tables for discovery.""" + """Compiles FilterExpr trees into SQLAlchemy queries over records / metadata / features.""" def __init__(self, session: AsyncSession) -> None: self.session = session async def search_records( self, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_srn: SchemaSRN | None, + convention_srn: ConventionSRN | None, text_fields: list[str], q: str | None, sort: str, @@ -102,34 +171,66 @@ async def search_records( limit: int, field_types: dict[str, FieldType] | None = None, ) -> list[RecordSummary]: - """Build and execute a dynamic SQL query for record search.""" t = records_table + ft_map = field_types or {} + + metadata_table = None + metadata_schema: MetadataSchema | None = None + if schema_srn is not None: + catalog = await self._metadata_catalog_for(schema_srn) + if catalog is not None: + metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) + metadata_table = build_metadata_table(catalog["pg_table"], metadata_schema) + + feature_joins = await self._collect_feature_joins(filter_expr) + conditions: list[Any] = [] - ft = field_types or {} - # Build filter conditions - for f in filters: - conditions.append(self._record_filter_clause(f, ft.get(f.field))) + if convention_srn is not None: + conditions.append(t.c.convention_srn == str(convention_srn)) + + if filter_expr is not None: + conditions.append( + self._compile_filter_for_records( + filter_expr, + records_t=t, + metadata_t=metadata_table, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + field_types=ft_map, + ) + ) - # Free-text search across text 
fields - if q and text_fields: + if q and text_fields and metadata_table is not None and metadata_schema is not None: pattern = f"%{_escape_like(q)}%" + text_col_names = {c.name for c in metadata_schema.columns if c.json_type == "string"} text_clauses = [ - t.c.metadata[field].astext.ilike(pattern, escape="\\") for field in text_fields + cast(metadata_table.c[name], String).ilike(pattern, escape="\\") + for name in text_fields + if name in text_col_names ] - conditions.append(or_(*text_clauses)) + if text_clauses: + conditions.append(or_(*text_clauses)) - # Determine sort expression (cast to match field type for correct ordering) + # Sort expression + matching cursor-value coercer if sort == "published_at": sort_expr = t.c.published_at - elif ft.get(sort) == FieldType.NUMBER: - sort_expr = cast(t.c.metadata[sort].astext, Float) - elif ft.get(sort) == FieldType.DATE: - sort_expr = cast(t.c.metadata[sort].astext, Date) + coerce_cursor: CursorCoercer = _coerce_datetime + elif metadata_table is not None and sort in metadata_table.c: + col = metadata_table.c[sort] + if ft_map.get(sort) == FieldType.NUMBER: + sort_expr = cast(col, Float) + coerce_cursor = _coerce_float + elif ft_map.get(sort) == FieldType.DATE: + sort_expr = cast(col, Date) + coerce_cursor = _coerce_date + else: + sort_expr = col + coerce_cursor = _coerce_identity else: - sort_expr = t.c.metadata[sort].astext + sort_expr = t.c.published_at + coerce_cursor = _coerce_datetime - # Keyset pagination with correct NULL handling is_desc = order == SortOrder.DESC page = KeysetPage( [ @@ -138,31 +239,55 @@ async def search_records( ] ) order_clauses = page.order_by() - if cursor is not None: - conditions.append(page.after((cursor["s"], cursor["id"]))) + sort_value = coerce_cursor(cursor["s"]) + conditions.append(page.after((sort_value, cursor["id"]))) where_clause = and_(*conditions) if conditions else true() - stmt = ( - select(t.c.srn, t.c.published_at, t.c.metadata) - .where(where_clause) - .order_by(*order_clauses) - .limit(limit) - ) + if metadata_table is not None and metadata_schema is not None: + select_cols = [t.c.srn, t.c.published_at] + [ + metadata_table.c[c.name].label(c.name) for c in metadata_schema.columns + ] + stmt = select(*select_cols).select_from( + t.join(metadata_table, metadata_table.c.record_srn == t.c.srn) + ) + else: + # No schema pinned — project the canonical JSONB metadata column. + # Typed tables are a query-optimized projection; JSONB remains the + # authoritative source for presentation (and for cross-schema + # listings where no single typed table applies). 
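+            # Roughly, the two projections compile to (names illustrative):
+            #   pinned:   SELECT r.srn, r.published_at, m.species, ...
+            #             FROM records r JOIN metadata.bio_sample_v1 m
+            #               ON m.record_srn = r.srn
+            #   unpinned: SELECT r.srn, r.published_at, r.metadata FROM records r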
+ stmt = select(t.c.srn, t.c.published_at, t.c.metadata) + + for hook, ft in feature_joins.items(): + stmt = stmt.join(ft, ft.c.record_srn == t.c.srn, isouter=True) + + stmt = stmt.where(where_clause).order_by(*order_clauses).limit(limit) result = await self.session.execute(stmt) - return [ - RecordSummary( - srn=RecordSRN.parse(row["srn"]), - published_at=row["published_at"], - metadata=row["metadata"], - ) - for row in result.mappings() - ] + summaries: list[RecordSummary] = [] + if metadata_table is not None and metadata_schema is not None: + for row in result.mappings(): + meta = {c.name: row[c.name] for c in metadata_schema.columns if c.name in row} + summaries.append( + RecordSummary( + srn=RecordSRN.parse(row["srn"]), + published_at=row["published_at"], + metadata=meta, + ) + ) + else: + for row in result.mappings(): + summaries.append( + RecordSummary( + srn=RecordSRN.parse(row["srn"]), + published_at=row["published_at"], + metadata=row.get("metadata") or {}, + ) + ) + return summaries async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: - """List all feature tables with column schemas and record counts.""" stmt = select( feature_tables_table.c.hook_name, feature_tables_table.c.pg_table, @@ -174,13 +299,11 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: if not catalog_rows: return [] - # Parse schemas at the boundary parsed = [ (row["hook_name"], FeatureSchema.model_validate(row["feature_schema"]), row["pg_table"]) for row in catalog_rows ] - # Fetch all record counts in a single UNION ALL query (avoid N+1) count_parts = [] for hook_name, schema, pg_table in parsed: ft = build_feature_table(pg_table, schema) @@ -196,14 +319,13 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: return [ FeatureCatalogEntry( hook_name=hook_name, - columns=_to_column_info(schema), + columns=_to_column_info(schema.columns), record_count=counts_by_hook.get(hook_name, 0), ) for hook_name, schema, _pg_table in parsed ] async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry | None: - """Look up a single feature table's schema by hook name.""" stmt = select( feature_tables_table.c.hook_name, feature_tables_table.c.feature_schema, @@ -216,22 +338,21 @@ async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry schema = FeatureSchema.model_validate(row["feature_schema"]) return FeatureCatalogEntry( hook_name=row["hook_name"], - columns=_to_column_info(schema), + columns=_to_column_info(schema.columns), record_count=0, ) async def search_features( self, hook_name: str, - filters: list[Filter], + filter_expr: FilterExpr | None, + schema_srn: SchemaSRN | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, cursor: dict[str, Any] | None, limit: int, ) -> list[FeatureRow]: - """Build and execute a dynamic SQL query for feature row search.""" - # Look up pg_table and feature_schema from catalog pg_table_stmt = select( feature_tables_table.c.pg_table, feature_tables_table.c.feature_schema, @@ -245,33 +366,48 @@ async def search_features( ft = build_feature_table(pg_table, schema) + metadata_table = None + metadata_schema: MetadataSchema | None = None + if schema_srn is not None: + catalog = await self._metadata_catalog_for(schema_srn) + if catalog is not None: + metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) + metadata_table = build_metadata_table(catalog["pg_table"], metadata_schema) + + feature_joins: dict[str, Any] = {} + if filter_expr is not None: + extra = await 
self._collect_feature_joins(filter_expr) + for hook, tbl in extra.items(): + if hook != hook_name: + feature_joins[hook] = tbl + conditions: list[Any] = [] - # Record SRN filter if record_srn is not None: conditions.append(ft.c.record_srn == str(record_srn)) - # Column filters — all columns are known from schema - for f in filters: - col = ft.c[f.field] - if f.operator == FilterOperator.EQ: - conditions.append(col == f.value) - elif f.operator == FilterOperator.CONTAINS: - conditions.append( - cast(col, String).ilike(f"%{_escape_like(str(f.value))}%", escape="\\") + if filter_expr is not None: + conditions.append( + self._compile_filter_for_features( + filter_expr, + this_hook=hook_name, + this_ft=ft, + metadata_t=metadata_table, + metadata_schema=metadata_schema, + feature_joins=feature_joins, ) - elif f.operator == FilterOperator.GTE: - conditions.append(col >= f.value) - elif f.operator == FilterOperator.LTE: - conditions.append(col <= f.value) + ) - # Sort expression if sort == "id": sort_expr = ft.c.id + coerce_cursor: CursorCoercer = _coerce_int else: sort_expr = ft.c[sort] + col_def = next((c for c in schema.columns if c.name == sort), None) + coerce_cursor = ( + _coercer_for_column(col_def) if col_def is not None else _coerce_identity + ) - # Keyset pagination with correct NULL handling is_desc = order == SortOrder.DESC page = KeysetPage( [ @@ -280,17 +416,24 @@ async def search_features( ] ) order_clauses = page.order_by() - if cursor is not None: - conditions.append(page.after((cursor["s"], cursor["id"]))) + sort_value = coerce_cursor(cursor["s"]) + conditions.append(page.after((sort_value, cursor["id"]))) where_clause = and_(*conditions) if conditions else true() + stmt = select(ft.c.id, ft.c.record_srn, *data_columns(ft)) + select_from = ft + if metadata_table is not None: + select_from = select_from.join( + metadata_table, metadata_table.c.record_srn == ft.c.record_srn, isouter=True + ) + for hook, other_ft in feature_joins.items(): + select_from = select_from.join( + other_ft, other_ft.c.record_srn == ft.c.record_srn, isouter=True + ) stmt = ( - select(ft.c.id, ft.c.record_srn, *data_columns(ft)) - .where(where_clause) - .order_by(*order_clauses) - .limit(limit) + stmt.select_from(select_from).where(where_clause).order_by(*order_clauses).limit(limit) ) result = await self.session.execute(stmt) @@ -303,31 +446,311 @@ async def search_features( return feature_rows - @staticmethod - def _record_filter_clause(f: Filter, field_type: FieldType | None = None) -> Any: - """Build a SQL clause for a single record metadata filter.""" - t = records_table - if f.operator == FilterOperator.EQ: - # Use JSONB @> containment (GIN-indexed) - return t.c.metadata.op("@>")(cast(func.json_build_object(f.field, f.value), JSONB)) - elif f.operator == FilterOperator.CONTAINS: - return t.c.metadata[f.field].astext.ilike( - f"%{_escape_like(str(f.value))}%", escape="\\" + # ---------------- compilation helpers ---------------- + + async def _metadata_catalog_for(self, schema_srn: SchemaSRN) -> dict[str, Any] | None: + """Look up the metadata table catalog row for a Schema SRN.""" + identity = str(schema_srn).split("@", 1)[0] + major = int(schema_srn.version.root.split(".")[0]) + stmt = select(metadata_tables_table).where( + metadata_tables_table.c.schema_identity == identity, + metadata_tables_table.c.schema_major == major, + ) + result = await self.session.execute(stmt) + row = result.mappings().first() + return dict(row) if row is not None else None + + async def _collect_feature_joins(self, 
filter_expr: FilterExpr | None) -> dict[str, Any]: + """Build {hook_name: SQLA Table} for every distinct feature ref in the tree.""" + if filter_expr is None: + return {} + hooks: set[str] = set() + for p in _iter_predicates(filter_expr): + if isinstance(p.field, FeatureFieldRef): + hooks.add(p.field.hook) + if not hooks: + return {} + stmt = select( + feature_tables_table.c.hook_name, + feature_tables_table.c.pg_table, + feature_tables_table.c.feature_schema, + ).where(feature_tables_table.c.hook_name.in_(hooks)) + result = await self.session.execute(stmt) + joins: dict[str, Any] = {} + for row in result.mappings(): + schema = FeatureSchema.model_validate(row["feature_schema"]) + joins[row["hook_name"]] = build_feature_table(row["pg_table"], schema) + missing = hooks - joins.keys() + if missing: + raise ValidationError( + f"Unknown feature hook(s): {sorted(missing)}", + field="filter", + code="unknown_hook", ) - elif f.operator in (FilterOperator.GTE, FilterOperator.LTE): - # Use typed casts: numeric for NUMBER, date for DATE, string fallback - if field_type == FieldType.NUMBER: - col_expr = cast(t.c.metadata[f.field].astext, Float) - val = float(f.value) - elif field_type == FieldType.DATE: - col_expr = cast(t.c.metadata[f.field].astext, Date) - val = str(f.value) - else: - col_expr = cast(t.c.metadata[f.field].astext, String) - val = str(f.value) - if f.operator == FilterOperator.GTE: - return col_expr >= val + return joins + + def _compile_filter_for_records( + self, + expr: FilterExpr, + *, + records_t: Any, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + field_types: dict[str, FieldType], + ) -> Any: + if isinstance(expr, Predicate): + return self._compile_predicate( + expr, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + field_types=field_types, + ) + if isinstance(expr, And): + return and_( + *[ + self._compile_filter_for_records( + op, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + field_types=field_types, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Or): + return or_( + *[ + self._compile_filter_for_records( + op, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + field_types=field_types, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Not): + return not_( + self._compile_filter_for_records( + expr.operand, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + field_types=field_types, + ) + ) + raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") + + def _compile_filter_for_features( + self, + expr: FilterExpr, + *, + this_hook: str, + this_ft: Any, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + ) -> Any: + if isinstance(expr, Predicate): + if isinstance(expr.field, MetadataFieldRef): + if metadata_t is None: + raise ValidationError( + f"Metadata ref {expr.field.dotted()!r} requires schema_srn to be set.", + field=expr.field.dotted(), + code="metadata_ref_requires_schema", + ) + col = metadata_t.c[expr.field.field] + return _apply_scalar_op(col, expr.op, expr.value) + assert isinstance(expr.field, FeatureFieldRef) + if expr.field.hook == this_hook: + col = this_ft.c[expr.field.column] else: - return col_expr <= val + tbl = feature_joins.get(expr.field.hook) + if tbl is None: + raise ValidationError( + 
f"Unknown feature hook '{expr.field.hook}'.", + field=expr.field.dotted(), + code="unknown_hook", + ) + col = tbl.c[expr.field.column] + return _apply_scalar_op(col, expr.op, expr.value) + if isinstance(expr, And): + return and_( + *[ + self._compile_filter_for_features( + op, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Or): + return or_( + *[ + self._compile_filter_for_features( + op, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + for op in expr.operands + ] + ) + if isinstance(expr, Not): + return not_( + self._compile_filter_for_features( + expr.operand, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, + ) + ) + raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") + + def _compile_predicate( + self, + predicate: Predicate, + *, + metadata_t: Any, + metadata_schema: MetadataSchema | None, + feature_joins: dict[str, Any], + field_types: dict[str, FieldType], + ) -> Any: + if isinstance(predicate.field, MetadataFieldRef): + # Prefer the typed projection when a schema is pinned. + if metadata_t is not None and metadata_schema is not None: + col = metadata_t.c[predicate.field.field] + return _apply_scalar_op(col, predicate.op, predicate.value) + # Otherwise compile against the canonical records.metadata JSONB. + return _apply_jsonb_op( + records_table, + field=predicate.field.field, + op=predicate.op, + value=predicate.value, + field_type=field_types.get(predicate.field.field), + ) + + assert isinstance(predicate.field, FeatureFieldRef) + tbl = feature_joins.get(predicate.field.hook) + if tbl is None: + raise ValidationError( + f"Unknown feature hook '{predicate.field.hook}'.", + field=predicate.field.dotted(), + code="unknown_hook", + ) + col = tbl.c[predicate.field.column] + return _apply_scalar_op(col, predicate.op, predicate.value) + + +def _apply_jsonb_op( + records_t: Any, + *, + field: str, + op: FilterOperator, + value: Any, + field_type: FieldType | None, +) -> Any: + """Compile a metadata-field predicate against the canonical ``records.metadata`` JSONB. + + Used when no ``schema_srn`` is pinned (cross-schema / unscoped listings). + Equality uses JSONB containment (GIN-indexed); range ops cast the extracted + text to the appropriate type driven by ``field_type`` when known. + """ + meta = records_t.c.metadata + + if op == FilterOperator.EQ: + return meta.op("@>")(cast(func.json_build_object(field, value), JSONB)) + if op == FilterOperator.NEQ: + return not_(meta.op("@>")(cast(func.json_build_object(field, value), JSONB))) + if op == FilterOperator.IS_NULL: + # Absent key OR present-but-null both count as "null". 
+ return or_(not_(meta.has_key(field)), meta[field].astext.is_(None)) + if op == FilterOperator.IN: + if not isinstance(value, list): + raise ValidationError( + "Operator 'in' requires a list value.", + field=field, + code="invalid_value_for_op", + ) + return meta[field].astext.in_([str(v) for v in value]) + if op == FilterOperator.CONTAINS: + return meta[field].astext.ilike(f"%{_escape_like(str(value))}%", escape="\\") + if op in (FilterOperator.GT, FilterOperator.GTE, FilterOperator.LT, FilterOperator.LTE): + if field_type == FieldType.NUMBER: + col_expr = cast(meta[field].astext, Float) + typed_value: Any = float(value) + elif field_type == FieldType.DATE: + col_expr = cast(meta[field].astext, Date) + typed_value = str(value) else: - raise ValueError(f"Unknown operator: {f.operator}") # pragma: no cover + col_expr = cast(meta[field].astext, String) + typed_value = str(value) + if op == FilterOperator.GT: + return col_expr > typed_value + if op == FilterOperator.GTE: + return col_expr >= typed_value + if op == FilterOperator.LT: + return col_expr < typed_value + return col_expr <= typed_value + raise ValidationError( + f"Unsupported operator for JSONB fallback: {op}", + field=field, + code="unsupported_operator", + ) + + +def _apply_scalar_op(col: Any, op: FilterOperator, value: Any) -> Any: + if op == FilterOperator.EQ: + return col == value + if op == FilterOperator.NEQ: + return col != value + if op == FilterOperator.GT: + return col > value + if op == FilterOperator.GTE: + return col >= value + if op == FilterOperator.LT: + return col < value + if op == FilterOperator.LTE: + return col <= value + if op == FilterOperator.IN: + if not isinstance(value, list): + raise ValidationError( + "Operator 'in' requires a list value.", + field=col.key, + code="invalid_value_for_op", + ) + return col.in_(value) + if op == FilterOperator.CONTAINS: + return cast(col, String).ilike(f"%{_escape_like(str(value))}%", escape="\\") + if op == FilterOperator.IS_NULL: + return col.is_(None) + raise ValidationError( + f"Unsupported operator: {op}", field="filter", code="unsupported_operator" + ) + + +def _iter_predicates(expr: FilterExpr): + if isinstance(expr, Predicate): + yield expr + return + if isinstance(expr, Not): + yield from _iter_predicates(expr.operand) + return + if isinstance(expr, (And, Or)): + for op in expr.operands: + yield from _iter_predicates(op) diff --git a/server/osa/infrastructure/persistence/column_mapper.py b/server/osa/infrastructure/persistence/column_mapper.py index 69c7dc9..a9b463c 100644 --- a/server/osa/infrastructure/persistence/column_mapper.py +++ b/server/osa/infrastructure/persistence/column_mapper.py @@ -27,7 +27,6 @@ def map_column(col_def: ColumnDef) -> sa.Column: type_factory = _TYPE_MAP.get(key) if type_factory is None: - # Fall back to base type without format type_factory = _TYPE_MAP.get((col_def.json_type, None), sa.Text) sa_type = type_factory() diff --git a/server/osa/infrastructure/persistence/di.py b/server/osa/infrastructure/persistence/di.py index 569945f..75908e4 100644 --- a/server/osa/infrastructure/persistence/di.py +++ b/server/osa/infrastructure/persistence/di.py @@ -60,6 +60,8 @@ PostgresSemanticsSchemaRepository, ) from osa.infrastructure.persistence.feature_store import PostgresFeatureStore +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.domain.metadata.port.metadata_store import MetadataStore from osa.infrastructure.persistence.repository.validation import ( PostgresValidationRunRepository, ) @@ -102,6 
+104,11 @@ async def get_session( def get_feature_store(self, engine: AsyncEngine, session: AsyncSession) -> FeatureStore: return PostgresFeatureStore(engine=engine, session=session) + # Metadata store + @provide(scope=Scope.UOW) + def get_metadata_store(self, engine: AsyncEngine, session: AsyncSession) -> MetadataStore: + return PostgresMetadataStore(engine=engine, session=session) + # Semantics repositories ontology_repo = provide( PostgresOntologyRepository, scope=Scope.UOW, provides=OntologyRepository @@ -146,6 +153,7 @@ def get_feature_storage(self, file_storage: FileStoragePort) -> FeatureStoragePo def get_record_service( self, record_repo: RecordRepository, + convention_repo: ConventionRepository, outbox: Outbox, config: Config, feature_reader: FeatureReader, @@ -156,6 +164,7 @@ def get_record_service( """ return RecordService( record_repo=record_repo, + convention_repo=convention_repo, outbox=outbox, node_domain=Domain(config.domain), feature_reader=feature_reader, diff --git a/server/osa/infrastructure/persistence/feature_store.py b/server/osa/infrastructure/persistence/feature_store.py index b43c729..b73bfbc 100644 --- a/server/osa/infrastructure/persistence/feature_store.py +++ b/server/osa/infrastructure/persistence/feature_store.py @@ -63,7 +63,7 @@ async def create_table(self, hook_name: str, columns: list[ColumnDef]) -> None: schema = FeatureSchema(columns=columns) table = build_feature_table(hook_name, schema) - # Create table + # Create table (FK to records.srn is declared inline on the column) await conn.run_sync(table.metadata.create_all, checkfirst=False) await conn.execute( feature_tables_table.insert().values( diff --git a/server/osa/infrastructure/persistence/feature_table.py b/server/osa/infrastructure/persistence/feature_table.py index 48eb214..32aa6ba 100644 --- a/server/osa/infrastructure/persistence/feature_table.py +++ b/server/osa/infrastructure/persistence/feature_table.py @@ -7,6 +7,7 @@ from osa.domain.shared.model.hook import ColumnDef from osa.domain.shared.model.value import ValueObject from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.tables import records_table FEATURES_SCHEMA = "features" @@ -26,11 +27,12 @@ def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: """Build a SQLAlchemy ``Table`` for a dynamic feature table. Returns a ``Table`` with auto columns (``id``, ``record_srn``, ``created_at``) - plus data columns derived from *schema* via :func:`map_column`, in the - ``features`` PG schema. + plus data columns derived from *schema*, in the ``features`` PG schema. - Each call creates a disposable ``MetaData`` — these Tables are used for - query building only, not for DDL lifecycle management. + ``record_srn`` carries an ``ON DELETE CASCADE`` FK to ``records.srn`` — the + FK target is the ``Column`` object itself (not a string reference), so + SQLAlchemy resolves it without requiring ``records`` to live in the same + disposable ``MetaData`` as the dynamic table. 
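+
+    A sketch of what the object-reference gives us (``schema`` is a
+    hypothetical ``FeatureSchema``)::
+
+        table = build_feature_table("quality_v1", schema)
+        fk = next(iter(table.c.record_srn.foreign_keys))
+        assert fk.column is records_table.c.srn   # resolves across MetaData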
""" data_columns = [map_column(col_def) for col_def in schema.columns] @@ -39,7 +41,13 @@ def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: pg_table, metadata, sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True), - sa.Column("record_srn", sa.Text, nullable=False, index=True), + sa.Column( + "record_srn", + sa.Text, + sa.ForeignKey(records_table.c.srn, ondelete="CASCADE"), + nullable=False, + index=True, + ), sa.Column( "created_at", sa.DateTime(timezone=True), diff --git a/server/osa/infrastructure/persistence/mappers/record.py b/server/osa/infrastructure/persistence/mappers/record.py index 97ad7db..8278e2c 100644 --- a/server/osa/infrastructure/persistence/mappers/record.py +++ b/server/osa/infrastructure/persistence/mappers/record.py @@ -1,4 +1,11 @@ -"""Record mapper - converts between domain and persistence.""" +"""Record mapper - converts between domain and persistence. + +Feature 076 adds ``schema_srn`` as a first-class linkage and keeps ``metadata`` +as the canonical JSONB store. The typed ``metadata._v`` +table is a discovery-optimized projection maintained asynchronously by the +``InsertRecordMetadata`` event handler; it is not the source of truth for +record metadata. +""" from datetime import datetime from typing import Any @@ -7,7 +14,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN _source_adapter = TypeAdapter(RecordSource) @@ -24,7 +31,8 @@ def row_to_record(row: dict[str, Any]) -> Record: srn=RecordSRN.parse(row["srn"]), source=source, convention_srn=ConventionSRN.parse(row["convention_srn"]), - metadata=row.get("metadata", {}), + schema_srn=SchemaSRN.parse(row["schema_srn"]), + metadata=row.get("metadata") or {}, published_at=published_at, ) @@ -34,6 +42,7 @@ def record_to_dict(record: Record) -> dict[str, Any]: return { "srn": str(record.srn), "convention_srn": str(record.convention_srn), + "schema_srn": str(record.schema_srn), "source": _source_adapter.dump_python(record.source, mode="json"), "metadata": record.metadata, "published_at": record.published_at, diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py new file mode 100644 index 0000000..a18e174 --- /dev/null +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -0,0 +1,274 @@ +"""PostgreSQL implementation of MetadataStore. + +Schema-keyed DDL lifecycle: one metadata table per (schema_identity, major +version) pair. The catalog row in ``public.metadata_tables`` is updated in +lock-step with ALTER ADD COLUMN operations so reads can reconstruct the +dynamic table shape without reflection. 
+""" + +from __future__ import annotations + +from datetime import UTC, datetime +from typing import Any, Literal, Sequence + +import sqlalchemy as sa +from sqlalchemy import select, text +from sqlalchemy.dialects.postgresql import insert +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.port.metadata_store import MetadataStore +from osa.domain.semantics.model.value import FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.metadata_table import ( + METADATA_SCHEMA, + MetadataSchema, + build_metadata_table, + schema_slug, +) +from osa.infrastructure.persistence.tables import metadata_tables_table + + +_JsonType = Literal["string", "number", "integer", "boolean", "array", "object"] + + +_JSON_TYPE_MAP: dict[FieldType, tuple[_JsonType | None, str | None]] = { + FieldType.TEXT: ("string", None), + FieldType.URL: ("string", None), + FieldType.TERM: ("string", None), + FieldType.DATE: ("string", "date"), + FieldType.NUMBER: ("number", None), + FieldType.BOOLEAN: ("boolean", None), +} + + +def _field_to_column(field: FieldDefinition) -> ColumnDef: + """Translate a FieldDefinition into a ColumnDef for the metadata table.""" + json_type, fmt = _JSON_TYPE_MAP.get(field.type, (None, None)) + if json_type is None: + raise ValidationError( + f"Field {field.name!r} has unrepresentable type {field.type!r}. " + "Add a column-mapper entry for this FieldType before using it in a schema.", + field=field.name, + ) + return ColumnDef( + name=field.name, + json_type=json_type, + format=fmt, + required=field.required, + ) + + +def _identity_of(schema_srn: SchemaSRN) -> str: + """Return the version-stripped schema SRN (the schema identity).""" + rendered = str(schema_srn) + return rendered.split("@", 1)[0] + + +class PostgresMetadataStore(MetadataStore): + """DDL + DML for per-schema typed metadata tables.""" + + def __init__(self, engine: AsyncEngine, session: AsyncSession) -> None: + self._engine = engine + self._session = session + + async def ensure_table( + self, + schema_srn: SchemaSRN, + schema_title: str, + fields: list[FieldDefinition], + ) -> None: + identity = _identity_of(schema_srn) + major = int(schema_srn.version.root.split(".")[0]) + slug = schema_slug(schema_title) + pg_table = f"{slug}_v{major}" + + columns = [_field_to_column(f) for f in fields] + metadata_schema = MetadataSchema(columns=columns) + + async with self._engine.begin() as conn: + await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{METADATA_SCHEMA}"')) + + existing = ( + ( + await conn.execute( + select(metadata_tables_table).where( + metadata_tables_table.c.schema_identity == identity, + metadata_tables_table.c.schema_major == major, + ) + ) + ) + .mappings() + .first() + ) + + if existing is None: + table = build_metadata_table(pg_table, metadata_schema) + await conn.run_sync(table.metadata.create_all, checkfirst=False) + now = datetime.now(UTC) + await conn.execute( + metadata_tables_table.insert().values( + schema_identity=identity, + schema_slug=slug, + schema_major=major, + schema_versions=[str(schema_srn)], + pg_table=pg_table, + metadata_schema=metadata_schema.model_dump(), + created_at=now, + updated_at=now, + ) + ) + return + + # Table exists — possibly evolve. 
+ stored_schema = MetadataSchema.model_validate(existing["metadata_schema"]) + stored_versions: list[str] = list(existing["schema_versions"]) + pg_table = existing["pg_table"] + + _validate_additive(stored_schema.columns, columns) + + new_columns = [ + c for c in columns if c.name not in {s.name for s in stored_schema.columns} + ] + if not new_columns: + if str(schema_srn) not in stored_versions: + stored_versions.append(str(schema_srn)) + await conn.execute( + metadata_tables_table.update() + .where(metadata_tables_table.c.id == existing["id"]) + .values( + schema_versions=stored_versions, + updated_at=datetime.now(UTC), + ) + ) + return + + # Apply ALTER ADD COLUMN for each new column + for col_def in new_columns: + await conn.execute(text(_alter_add_column_stmt(pg_table, col_def))) + + merged_columns = stored_schema.columns + new_columns + if str(schema_srn) not in stored_versions: + stored_versions.append(str(schema_srn)) + await conn.execute( + metadata_tables_table.update() + .where(metadata_tables_table.c.id == existing["id"]) + .values( + metadata_schema=MetadataSchema(columns=merged_columns).model_dump(), + schema_versions=stored_versions, + updated_at=datetime.now(UTC), + ) + ) + + async def insert( + self, + schema_srn: SchemaSRN, + record_srn: RecordSRN, + values: dict[str, Any], + ) -> None: + identity = _identity_of(schema_srn) + major = int(schema_srn.version.root.split(".")[0]) + + catalog_row = ( + ( + await self._session.execute( + select(metadata_tables_table).where( + metadata_tables_table.c.schema_identity == identity, + metadata_tables_table.c.schema_major == major, + ) + ) + ) + .mappings() + .first() + ) + + if catalog_row is None: + raise ValidationError( + f"No metadata table registered for schema {schema_srn} " + f"(identity={identity}, major={major}). " + "Ensure the convention has been registered first.", + field="schema_srn", + ) + + schema = MetadataSchema.model_validate(catalog_row["metadata_schema"]) + pg_table = catalog_row["pg_table"] + table = build_metadata_table(pg_table, schema) + + known = {c.name for c in schema.columns} + payload = {k: v for k, v in values.items() if k in known} + payload["record_srn"] = str(record_srn) + + stmt = insert(table).values(**payload) + update_cols = {c: stmt.excluded[c] for c in payload.keys() if c != "record_srn"} + if update_cols: + stmt = stmt.on_conflict_do_update( + index_elements=[table.c.record_srn], + set_=update_cols, + ) + else: + stmt = stmt.on_conflict_do_nothing(index_elements=[table.c.record_srn]) + await self._session.execute(stmt) + await self._session.flush() + + +def _validate_additive(existing: Sequence[ColumnDef], incoming: Sequence[ColumnDef]) -> None: + """Raise ValidationError if the incoming column set is not additive.""" + by_name = {c.name: c for c in existing} + for col in incoming: + if col.name not in by_name: + if col.required: + raise ValidationError( + f"Non-additive evolution: new field {col.name!r} is required. 
" + "New fields in minor/patch bumps must be optional.", + field=col.name, + ) + continue + prev = by_name[col.name] + if prev.json_type != col.json_type or prev.format != col.format: + raise ValidationError( + f"Non-additive evolution: field {col.name!r} changed type " + f"({prev.json_type}/{prev.format} → {col.json_type}/{col.format}).", + field=col.name, + ) + if prev.required is False and col.required is True: + raise ValidationError( + f"Non-additive evolution: field {col.name!r} tightened to required.", + field=col.name, + ) + incoming_names = {c.name for c in incoming} + for prev_name in by_name.keys(): + if prev_name not in incoming_names: + raise ValidationError( + f"Non-additive evolution: field {prev_name!r} was removed.", + field=prev_name, + ) + + +def _alter_add_column_stmt(pg_table: str, col_def: ColumnDef) -> str: + """SQL string to ALTER TABLE ADD COLUMN for a single column definition.""" + sql_type = _column_type_sql(map_column(col_def).type) + null_sql = "" if not col_def.required else " NOT NULL" + return ( + f'ALTER TABLE "{METADATA_SCHEMA}"."{pg_table}" ' + f'ADD COLUMN IF NOT EXISTS "{col_def.name}" {sql_type}{null_sql}' + ) + + +def _column_type_sql(sa_type: Any) -> str: + if isinstance(sa_type, sa.Text): + return "text" + if isinstance(sa_type, sa.DateTime): + return "timestamp with time zone" if sa_type.timezone else "timestamp" + if isinstance(sa_type, sa.Date): + return "date" + if isinstance(sa_type, sa.Uuid): + return "uuid" + if isinstance(sa_type, sa.Float): + return "double precision" + if isinstance(sa_type, sa.BigInteger): + return "bigint" + if isinstance(sa_type, sa.Boolean): + return "boolean" + return "jsonb" diff --git a/server/osa/infrastructure/persistence/metadata_table.py b/server/osa/infrastructure/persistence/metadata_table.py new file mode 100644 index 0000000..239a3fb --- /dev/null +++ b/server/osa/infrastructure/persistence/metadata_table.py @@ -0,0 +1,85 @@ +"""Shared helpers for building dynamic metadata Table objects. + +Mirrors :mod:`osa.infrastructure.persistence.feature_table` — metadata tables +are schema-keyed typed stores living in the ``metadata`` PG schema, with a +catalog row in ``public.metadata_tables`` per (schema_identity, major) pair. +""" + +from __future__ import annotations + +import re + +import sqlalchemy as sa + +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.value import ValueObject +from osa.infrastructure.persistence.column_mapper import map_column +from osa.infrastructure.persistence.tables import records_table + +METADATA_SCHEMA = "metadata" + +AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) + +_SLUG_RE = re.compile(r"^[a-z][a-z0-9_]{0,50}$") + + +class MetadataSchema(ValueObject): + """Typed representation of the ``metadata_tables.metadata_schema`` JSON column.""" + + columns: list[ColumnDef] = [] + + +def schema_slug(title: str) -> str: + """Derive a pg-safe slug from a Schema title. + + Lowercases, replaces runs of non-alphanumerics with a single underscore, + strips leading/trailing underscores, then validates against ``^[a-z][a-z0-9_]{0,50}$``. + Raises ``ValueError`` if the derived slug is empty or cannot be validated. + """ + normalised = re.sub(r"[^a-z0-9]+", "_", title.strip().lower()).strip("_") + if not normalised or not _SLUG_RE.match(normalised): + raise ValueError( + f"Cannot derive a valid metadata table slug from title {title!r}. " + f"Expected a string that maps to ^[a-z][a-z0-9_]{{0,50}}$." 
+ ) + return normalised + + +def build_metadata_table(pg_table: str, schema: MetadataSchema) -> sa.Table: + """Build a SQLAlchemy ``Table`` for a dynamic metadata table. + + Adds auto columns (``id``, ``record_srn``, ``created_at``) plus data columns + derived from *schema*. ``record_srn`` is ``UNIQUE`` (exactly one metadata + row per record) and carries an ``ON DELETE CASCADE`` FK to ``records.srn``. + The FK target is the ``Column`` object itself, so SQLAlchemy resolves it + without requiring ``records`` to live in the same disposable ``MetaData`` + as the dynamic table. + """ + data_columns = [map_column(col_def) for col_def in schema.columns] + + metadata_obj = sa.MetaData() + return sa.Table( + pg_table, + metadata_obj, + sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True), + sa.Column( + "record_srn", + sa.Text, + sa.ForeignKey(records_table.c.srn, ondelete="CASCADE"), + nullable=False, + unique=True, + ), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + nullable=False, + server_default=sa.func.now(), + ), + *data_columns, + schema=METADATA_SCHEMA, + ) + + +def data_columns(table: sa.Table) -> list[sa.Column]: + """Return only the user-defined data columns, excluding auto columns.""" + return [c for c in table.columns if c.key not in AUTO_COLUMN_NAMES] diff --git a/server/osa/infrastructure/persistence/tables.py b/server/osa/infrastructure/persistence/tables.py index 315f97c..42dc3f4 100644 --- a/server/osa/infrastructure/persistence/tables.py +++ b/server/osa/infrastructure/persistence/tables.py @@ -66,12 +66,14 @@ metadata, Column("srn", String, primary_key=True), Column("convention_srn", Text, nullable=False), + Column("schema_srn", Text, nullable=False), Column("source", JSONB, nullable=False), Column("metadata", JSONB, nullable=False), Column("published_at", DateTime(timezone=True), nullable=False), ) Index("idx_records_convention_srn", records_table.c.convention_srn) +Index("idx_records_schema_srn", records_table.c.schema_srn) Index( "uq_records_source", records_table.c.source["type"].as_string(), @@ -295,6 +297,26 @@ ) +# ============================================================================ +# METADATA TABLES CATALOG (Typed Metadata — feature 076) +# ============================================================================ +metadata_tables_table = Table( + "metadata_tables", + metadata, + Column("id", Integer, primary_key=True, autoincrement=True), + Column("schema_identity", Text, nullable=False), + Column("schema_slug", Text, nullable=False), + Column("schema_major", Integer, nullable=False), + Column("schema_versions", JSONB, nullable=False), + Column("pg_table", Text, nullable=False), + Column("metadata_schema", JSONB, nullable=False), + Column("created_at", DateTime(timezone=True), nullable=False), + Column("updated_at", DateTime(timezone=True), nullable=False), + UniqueConstraint("schema_identity", "schema_major", name="uq_metadata_tables_identity_major"), + UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), +) + + # ============================================================================ # ROLE ASSIGNMENTS TABLE (Authorization) # ============================================================================ diff --git a/server/tests/integration/conftest.py b/server/tests/integration/conftest.py index a784e7a..a3c0828 100644 --- a/server/tests/integration/conftest.py +++ b/server/tests/integration/conftest.py @@ -1,6 +1,9 @@ """Fixtures for PostgreSQL integration tests.""" +import json import os +from datetime 
import UTC, datetime
+from typing import Any
 
 import pytest
 import pytest_asyncio
@@ -21,6 +24,42 @@ def _get_pg_url() -> str:
     return url
 
 
+async def seed_record(
+    engine: AsyncEngine,
+    *,
+    srn: str,
+    convention_srn: str = "urn:osa:localhost:conv:test@1.0.0",
+    schema_srn: str = "urn:osa:localhost:schema:test@1.0.0",
+    source: dict[str, Any] | None = None,
+    metadata: dict[str, Any] | None = None,
+    published_at: datetime | None = None,
+) -> None:
+    """Insert a records row directly so typed-table FK inserts succeed.
+
+    Keeps tests independent of the full publish event chain when they only
+    need a persisted Record to anchor metadata/feature rows against.
+    """
+    src = source or {"type": "deposition", "id": f"dep-{srn.split(':')[-1]}"}
+    async with engine.begin() as conn:
+        await conn.execute(
+            text(
+                """
+                INSERT INTO records (srn, convention_srn, schema_srn, source, metadata, published_at)
+                VALUES (:srn, :conv, :schema, CAST(:source AS JSONB),
+                        CAST(:meta AS JSONB), :published_at)
+                """
+            ),
+            {
+                "srn": srn,
+                "conv": convention_srn,
+                "schema": schema_srn,
+                "source": json.dumps(src),
+                "meta": json.dumps(metadata or {}),
+                "published_at": published_at or datetime.now(UTC),
+            },
+        )
+
+
 @pytest_asyncio.fixture
 async def pg_engine():
     """Per-test async engine pointing at osa_test."""
@@ -39,16 +78,21 @@ async def pg_session(pg_engine: AsyncEngine):
         yield session
         await session.rollback()
 
-    # Truncate all tables after each test
+    # Truncate static tables + drop the two schemas that hold runtime-created
+    # dynamic tables (features.<hook>, metadata.<slug>_v<major>). Without the
+    # drop, a dynamic table created by test A survives TRUNCATE and collides
+    # when test B tries to ensure/create it again.
     async with pg_engine.begin() as conn:
         await conn.execute(
             text(
                 "TRUNCATE TABLE depositions, conventions, schemas, ontologies, "
                 "ontology_terms, events, deliveries, records, validation_runs, "
-                "feature_tables, users, identities, refresh_tokens, "
+                "feature_tables, metadata_tables, users, identities, refresh_tokens, "
                 "role_assignments CASCADE"
             )
         )
+        await conn.execute(text('DROP SCHEMA IF EXISTS "features" CASCADE'))
+        await conn.execute(text('DROP SCHEMA IF EXISTS "metadata" CASCADE'))
 
     # Re-seed system user after truncate
     await ensure_system_user(pg_engine)
diff --git a/server/tests/integration/persistence/test_discovery_pagination.py b/server/tests/integration/persistence/test_discovery_pagination.py
new file mode 100644
index 0000000..a45f4ea
--- /dev/null
+++ b/server/tests/integration/persistence/test_discovery_pagination.py
@@ -0,0 +1,79 @@
+"""Integration tests for discovery keyset pagination against real Postgres.
+
+Regression coverage for a production bug where paginating past page 1 raised
+``operator does not exist: timestamp with time zone < character varying``
+because the cursor's sort value round-tripped through JSON as a plain string
+and was bound as ``VARCHAR`` against the typed ``records.published_at`` column.
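+
+The fix threads the decoded sort value through a cursor coercer, e.g.
+``datetime.fromisoformat("2026-04-07T09:00:01+00:00")``, so the bind parameter
+is a timestamptz again rather than a VARCHAR.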
+""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest +from sqlalchemy.ext.asyncio import AsyncSession + +from osa.domain.discovery.model.value import SortOrder, decode_cursor, encode_cursor +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.tables import records_table + + +async def _insert_record(session: AsyncSession, srn: str, published_at: datetime) -> None: + await session.execute( + records_table.insert().values( + srn=srn, + convention_srn="urn:osa:localhost:conv:test@1.0.0", + schema_srn="urn:osa:localhost:schema:test@1.0.0", + source={"type": "test", "id": srn}, + metadata={}, + published_at=published_at, + ) + ) + await session.commit() + + +@pytest.mark.asyncio +class TestDiscoveryPaginationPublishedAt: + async def test_second_page_with_published_at_cursor(self, pg_session: AsyncSession) -> None: + """Fetching page 2 with a cursor must not trip the timestamptz/varchar + mismatch — the bug manifested only on requests that supplied a cursor.""" + store = PostgresDiscoveryReadStore(pg_session) + base = datetime(2026, 4, 7, 9, 0, 0, tzinfo=UTC) + records = [(f"urn:osa:localhost:rec:page-{i}@1", base.replace(second=i)) for i in range(3)] + for srn, ts in records: + await _insert_record(pg_session, srn, ts) + + first_page = await store.search_records( + filter_expr=None, + schema_srn=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=2, + ) + assert len(first_page) == 2 + + # Encode + decode the cursor the same way the service does — this is the + # round-trip that previously produced a VARCHAR bind. + last = first_page[-1] + cursor_str = encode_cursor(last.published_at.isoformat(), str(last.srn)) + decoded = decode_cursor(cursor_str) + + second_page = await store.search_records( + filter_expr=None, + schema_srn=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=decoded, + limit=2, + ) + + assert len(second_page) == 1 + returned = {str(r.srn) for r in first_page} | {str(r.srn) for r in second_page} + assert returned == {srn for srn, _ in records} diff --git a/server/tests/integration/persistence/test_feature_store.py b/server/tests/integration/persistence/test_feature_store.py index 0c81bc8..a05b28e 100644 --- a/server/tests/integration/persistence/test_feature_store.py +++ b/server/tests/integration/persistence/test_feature_store.py @@ -111,16 +111,21 @@ async def test_create_table_registers_in_catalog( @pytest.mark.asyncio class TestFeatureStoreInsert: async def test_insert_features(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + from tests.integration.conftest import seed_record + store = PostgresFeatureStore(pg_engine, pg_session) hook = _make_hook(name="insert_hook") await store.create_table("insert_hook", hook.feature.columns) + record_srn = "urn:osa:localhost:rec:rec-001@1" + await seed_record(pg_engine, srn=record_srn) + rows = [ {"score": 0.95, "label": "good"}, {"score": 0.42, "label": "poor"}, {"score": 0.78, "label": None}, ] - count = await store.insert_features("insert_hook", "urn:osa:localhost:rec:rec-001@1", rows) + count = await store.insert_features("insert_hook", record_srn, rows) assert count == 3 # Verify data is in the table @@ -171,6 +176,11 @@ async def test_jsonb_column_for_array_and_object( assert col_types["metadata"] == "jsonb" assert col_types["count"] == "bigint" + from 
tests.integration.conftest import seed_record + + record_srn = "urn:osa:localhost:rec:rec-jsonb@1" + await seed_record(pg_engine, srn=record_srn) + # Insert data with JSONB values rows = [ { @@ -179,5 +189,5 @@ async def test_jsonb_column_for_array_and_object( "count": 42, } ] - count = await store.insert_features("jsonb_hook", "urn:osa:localhost:rec:rec-jsonb@1", rows) + count = await store.insert_features("jsonb_hook", record_srn, rows) assert count == 1 diff --git a/server/tests/integration/persistence/test_metadata_store.py b/server/tests/integration/persistence/test_metadata_store.py new file mode 100644 index 0000000..60d8349 --- /dev/null +++ b/server/tests/integration/persistence/test_metadata_store.py @@ -0,0 +1,292 @@ +"""Integration tests for PostgresMetadataStore — DDL, UPSERT, FK cascade, additive evolution.""" + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA + +from tests.integration.conftest import seed_record + +SCHEMA_IDENTITY = "urn:osa:localhost:schema:bio-sample" +SCHEMA_V1 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.0.0") +SCHEMA_V11 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.1.0") +SCHEMA_V2 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@2.0.0") + + +def _fields_v1() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_v11_additive() -> list[FieldDefinition]: + return _fields_v1() + [ + FieldDefinition( + name="collection_site", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_rename() -> list[FieldDefinition]: + # 'species' renamed to 'organism' — not additive. 
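+    # A rename reads to the evolution check as "species removed" plus
+    # "organism added"; removals are never additive, so ensure_table must
+    # reject this field set rather than ALTER the table.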
+ return [ + FieldDefinition( + name="organism", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _table_exists(engine: AsyncEngine, pg_table: str) -> bool: + async with engine.begin() as conn: + result = await conn.execute( + text( + "SELECT EXISTS (SELECT 1 FROM information_schema.tables " + "WHERE table_schema = :s AND table_name = :t)" + ), + {"s": METADATA_SCHEMA, "t": pg_table}, + ) + return bool(result.scalar()) + + +async def _column_names(engine: AsyncEngine, pg_table: str) -> list[str]: + async with engine.begin() as conn: + result = await conn.execute( + text( + "SELECT column_name FROM information_schema.columns " + "WHERE table_schema = :s AND table_name = :t " + "ORDER BY ordinal_position" + ), + {"s": METADATA_SCHEMA, "t": pg_table}, + ) + return [row[0] for row in result.fetchall()] + + +@pytest.mark.asyncio +class TestEnsureTable: + async def test_creates_table_and_catalog_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + assert await _table_exists(pg_engine, "bio_sample_v1") + cols = await _column_names(pg_engine, "bio_sample_v1") + for expected in ("id", "record_srn", "created_at", "species", "resolution"): + assert expected in cols + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text( + "SELECT schema_identity, schema_major, pg_table, schema_versions " + "FROM metadata_tables WHERE schema_identity = :id" + ), + {"id": SCHEMA_IDENTITY}, + ) + ).first() + assert row is not None + assert row[0] == SCHEMA_IDENTITY + assert row[1] == 1 + assert row[2] == "bio_sample_v1" + assert str(SCHEMA_V1) in row[3] + + async def test_idempotent_on_same_version( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + # Second call with same SRN should not raise and should not duplicate catalog rows. + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text("SELECT COUNT(*) FROM metadata_tables WHERE schema_identity = :id"), + {"id": SCHEMA_IDENTITY}, + ) + ).scalar() + assert count == 1 + + async def test_foreign_key_cascade_on_record_srn( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + async with pg_engine.begin() as conn: + constraint = ( + await conn.execute( + text( + "SELECT confdeltype FROM pg_constraint " + "WHERE conrelid = 'metadata.bio_sample_v1'::regclass " + "AND contype = 'f'" + ) + ) + ).scalar() + # 'c' = CASCADE in pg_constraint.confdeltype. asyncpg returns the + # Postgres "char" type as bytes; normalize for comparison. 
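+            # (confdeltype legend: 'a' NO ACTION, 'r' RESTRICT, 'c' CASCADE,
+            # 'n' SET NULL, 'd' SET DEFAULT.)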
+ if isinstance(constraint, bytes): + constraint = constraint.decode() + assert constraint == "c" + + +@pytest.mark.asyncio +class TestInsert: + async def test_insert_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") + await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + + await store.insert( + SCHEMA_V1, + record_srn, + {"species": "Homo sapiens", "resolution": 3.5}, + ) + await pg_session.commit() + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text( + f"SELECT record_srn, species, resolution " + f'FROM "{METADATA_SCHEMA}"."bio_sample_v1"' + ) + ) + ).first() + assert row is not None + assert row[0] == str(record_srn) + assert row[1] == "Homo sapiens" + assert row[2] == 3.5 + + async def test_insert_is_idempotent_on_duplicate_delivery( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1") + await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + + await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) + await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) + await pg_session.commit() + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 1 + + async def test_cascade_delete_removes_metadata_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:cascade@1") + await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + await store.insert(SCHEMA_V1, record_srn, {"species": "Cascade", "resolution": 0.1}) + await pg_session.commit() + + async with pg_engine.begin() as conn: + await conn.execute( + text("DELETE FROM records WHERE srn = :srn"), {"srn": str(record_srn)} + ) + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 0 + + +@pytest.mark.asyncio +class TestAdditiveEvolution: + async def test_add_column_on_minor_bump(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + cols_before = await _column_names(pg_engine, "bio_sample_v1") + assert "collection_site" not in cols_before + + await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11_additive()) + cols_after = await _column_names(pg_engine, "bio_sample_v1") + assert "collection_site" in cols_after + + async def test_catalog_lineage_appended(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11_additive()) + + async with pg_engine.begin() as conn: + versions = ( + await conn.execute( + text( + "SELECT schema_versions FROM metadata_tables " + 
"WHERE schema_identity = :id AND schema_major = 1" + ), + {"id": SCHEMA_IDENTITY}, + ) + ).scalar() + assert str(SCHEMA_V1) in versions + assert str(SCHEMA_V11) in versions + + +@pytest.mark.asyncio +class TestNonAdditiveRejection: + async def test_rename_raises(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + with pytest.raises(ValidationError, match="Non-additive"): + await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_rename()) + + async def test_required_new_field_raises( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + + bad = _fields_v1() + [ + FieldDefinition( + name="must_have", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ) + ] + with pytest.raises(ValidationError, match="required"): + await store.ensure_table(SCHEMA_V11, "bio_sample", bad) diff --git a/server/tests/integration/test_discovery_compound_postgres.py b/server/tests/integration/test_discovery_compound_postgres.py new file mode 100644 index 0000000..a34ebca --- /dev/null +++ b/server/tests/integration/test_discovery_compound_postgres.py @@ -0,0 +1,157 @@ +"""Integration tests for compound OR/NOT discovery filters against real PG.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + Not, + Or, + Predicate, + SortOrder, +) +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") +FIELD_TYPES = {"species": FieldType.TEXT, "resolution": FieldType.NUMBER} + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.fixture +async def seeded_store(pg_engine: AsyncEngine, pg_session: AsyncSession) -> PostgresMetadataStore: + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema(srn=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC)) + ) + + rows = [ + ("rec-a1", "Homo sapiens", 3.5), + ("rec-b1", "Homo sapiens", 1.0), + ("rec-c1", "Mus musculus", 3.5), + ("rec-d1", "Drosophila", 0.5), + ] + for rid, sp, res in rows: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record(pg_engine, srn=str(srn), schema_srn=str(SCHEMA_V1)) + await store.insert(SCHEMA_V1, srn, {"species": sp, "resolution": res}) + + await pg_session.commit() + 
return store + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +@pytest.mark.asyncio +class TestCompound: + async def test_or_tree(self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store): + read_store = PostgresDiscoveryReadStore(pg_session) + # species = Homo sapiens OR resolution < 1.0 + tree = Or( + operands=[ + _pred("species", FilterOperator.EQ, "Homo sapiens"), + _pred("resolution", FilterOperator.LT, 1.0), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # a, b (species match) + d (resolution 0.5) — not c + assert srns == { + "urn:osa:localhost:rec:rec-a1@1", + "urn:osa:localhost:rec:rec-b1@1", + "urn:osa:localhost:rec:rec-d1@1", + } + + async def test_not_tree(self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Not(operand=_pred("species", FilterOperator.EQ, "Homo sapiens")) + results = await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-c1@1", "urn:osa:localhost:rec:rec-d1@1"} + + async def test_nested_and_or( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_store + ): + read_store = PostgresDiscoveryReadStore(pg_session) + # resolution >= 3.0 AND (species = Homo sapiens OR species = Mus musculus) + tree = And( + operands=[ + _pred("resolution", FilterOperator.GTE, 3.0), + Or( + operands=[ + _pred("species", FilterOperator.EQ, "Homo sapiens"), + _pred("species", FilterOperator.EQ, "Mus musculus"), + ] + ), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-a1@1", "urn:osa:localhost:rec:rec-c1@1"} diff --git a/server/tests/integration/test_discovery_cross_join_postgres.py b/server/tests/integration/test_discovery_cross_join_postgres.py new file mode 100644 index 0000000..9e888ab --- /dev/null +++ b/server/tests/integration/test_discovery_cross_join_postgres.py @@ -0,0 +1,137 @@ +"""Integration tests for cross-domain JOINs between records ⋈ features in discovery.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef +from osa.domain.discovery.model.value import And, FilterOperator, Predicate, SortOrder +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore +from osa.infrastructure.persistence.feature_store import PostgresFeatureStore +from osa.infrastructure.persistence.metadata_store import 
PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") +FIELD_TYPES = {"species": FieldType.TEXT} + + +def _metadata_fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _feature_columns() -> list[ColumnDef]: + return [ + ColumnDef(name="confidence", json_type="number", required=True), + ] + + +@pytest.fixture +async def seeded_both(pg_engine: AsyncEngine, pg_session: AsyncSession): + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + mstore = PostgresMetadataStore(pg_engine, pg_session) + await mstore.ensure_table(SCHEMA_V1, "bio_sample", _metadata_fields()) + + fstore = PostgresFeatureStore(pg_engine, pg_session) + await fstore.create_table("cell_classifier", _feature_columns()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema( + srn=SCHEMA_V1, + title="bio_sample", + fields=_metadata_fields(), + created_at=datetime.now(UTC), + ) + ) + + # r1: Homo sapiens + confidence 0.95 + # r2: Homo sapiens + confidence 0.5 + # r3: Mus musculus + confidence 0.95 + for rid, sp, conf in [ + ("rec-r1", "Homo sapiens", 0.95), + ("rec-r2", "Homo sapiens", 0.5), + ("rec-r3", "Mus musculus", 0.95), + ]: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record(pg_engine, srn=str(srn), schema_srn=str(SCHEMA_V1)) + await mstore.insert(SCHEMA_V1, srn, {"species": sp}) + await fstore.insert_features("cell_classifier", str(srn), [{"confidence": conf}]) + + await pg_session.commit() + return mstore, fstore + + +@pytest.mark.asyncio +class TestCrossDomainJoin: + async def test_joined_intersection( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_both + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = And( + operands=[ + Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.GT, + value=0.9, + ), + ] + ) + results = await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-r1@1"} + + async def test_unknown_hook_raises( + self, pg_engine: AsyncEngine, pg_session: AsyncSession, seeded_both + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Predicate( + field=FeatureFieldRef(hook="does_not_exist", column="anything"), + op=FilterOperator.EQ, + value=1, + ) + with pytest.raises(ValidationError, match="Unknown feature hook"): + await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) diff --git a/server/tests/integration/test_discovery_records_typed_and.py b/server/tests/integration/test_discovery_records_typed_and.py new file mode 100644 index 0000000..4b8ccbf --- /dev/null +++ b/server/tests/integration/test_discovery_records_typed_and.py @@ -0,0 +1,286 @@ 
+"""Integration tests for /discovery/records with typed-table AND filters.""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import And, FilterOperator, Predicate, SortOrder +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.adapter.discovery import ( + PostgresDiscoveryReadStore, + PostgresFieldDefinitionReader, +) +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="method", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _seed_schema_row(session: AsyncSession) -> None: + """Seed the `schemas` row so the discovery field reader can resolve types.""" + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + repo = PostgresSemanticsSchemaRepository(session) + await repo.save( + Schema(srn=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC)) + ) + + +async def _publish( + engine: AsyncEngine, + session: AsyncSession, + store: PostgresMetadataStore, + record_srn: RecordSRN, + species: str, + resolution: float, + method: str, +) -> None: + await seed_record( + engine, + srn=str(record_srn), + schema_srn=str(SCHEMA_V1), + metadata={"species": species, "resolution": resolution, "method": method}, + ) + await store.insert( + SCHEMA_V1, + record_srn, + {"species": species, "resolution": resolution, "method": method}, + ) + + +@pytest.mark.asyncio +class TestDiscoveryTypedAnd: + async def test_and_filter_returns_matching_records( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + await _seed_schema_row(pg_session) + + rows = [ + ("rec-r1", "Homo sapiens", 3.5, "cryo-EM"), + ("rec-r2", "Homo sapiens", 1.8, "X-ray"), + ("rec-r3", "Mus musculus", 3.0, "cryo-EM"), + ] + for rid, sp, res, meth in rows: + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1"), + sp, + res, + meth, + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + tree = And( + operands=[ + Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + Predicate( + field=MetadataFieldRef(field="resolution"), + op=FilterOperator.GTE, + value=2.0, + ), + ] + ) + + results = await read_store.search_records( + filter_expr=tree, + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={ + "species": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TEXT, + }, + ) + + srns = {str(r.srn) for 
r in results} + assert srns == {"urn:osa:localhost:rec:rec-r1@1"} + + async def test_scalar_op_succeeds_on_unindexed_column( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """FR-020: scalar ops must NOT be rejected for lack of index.""" + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + await _seed_schema_row(pg_session) + + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse("urn:osa:localhost:rec:rec-ra@1"), + "Homo sapiens", + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + results = await read_store.search_records( + filter_expr=Predicate( + field=MetadataFieldRef(field="method"), + op=FilterOperator.CONTAINS, + value="cryo", + ), + schema_srn=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={ + "species": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TEXT, + }, + ) + assert len(results) == 1 + + +@pytest.mark.asyncio +class TestUnscopedListing: + """When no schema_srn is passed, discovery should still return canonical + JSONB metadata — the typed table is an optimization, not the sole source.""" + + async def test_unscoped_predicate_filter_hits_jsonb( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """Filtering by a metadata field without schema_srn must compile + against the canonical JSONB column (the Pockets frontend pattern: + fetch-by-pdb_id without knowing the schema SRN).""" + from osa.domain.discovery.model.refs import MetadataFieldRef + from osa.domain.discovery.model.value import FilterOperator, Predicate + + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + await _seed_schema_row(pg_session) + + # Two records with distinct pdb-like ids in JSONB; typed table row + # written for completeness but not read by this test. 
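+        # rec-8abc is the negative control: it must stay out of the EQ result.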
+ for srn_id, species in [("rec-9x1w", "Homo sapiens"), ("rec-8abc", "Mus musculus")]: + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse(f"urn:osa:localhost:rec:{srn_id}@1"), + species, + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + results = await read_store.search_records( + filter_expr=Predicate( + field=MetadataFieldRef(field="species"), + op=FilterOperator.EQ, + value="Homo sapiens", + ), + schema_srn=None, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={"species": FieldType.TEXT}, + ) + srns = {str(r.srn) for r in results} + assert srns == {"urn:osa:localhost:rec:rec-9x1w@1"} + + async def test_unscoped_listing_returns_jsonb_metadata( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + await _seed_schema_row(pg_session) + + await _publish( + pg_engine, + pg_session, + store, + RecordSRN.parse("urn:osa:localhost:rec:rec-unscoped@1"), + "Homo sapiens", + 3.5, + "cryo-EM", + ) + await pg_session.commit() + + read_store = PostgresDiscoveryReadStore(pg_session) + results = await read_store.search_records( + filter_expr=None, + schema_srn=None, # deliberately unscoped — exercises the JSONB path + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types={}, + ) + assert len(results) == 1 + assert results[0].metadata == { + "species": "Homo sapiens", + "resolution": 3.5, + "method": "cryo-EM", + } + + +@pytest.mark.asyncio +class TestFieldDefinitionReader: + async def test_get_fields_for_schema(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + await _seed_schema_row(pg_session) + await pg_session.commit() + + reader = PostgresFieldDefinitionReader(pg_session) + fields = await reader.get_fields_for_schema(SCHEMA_V1) + assert fields["species"] == FieldType.TEXT + assert fields["resolution"] == FieldType.NUMBER diff --git a/server/tests/integration/test_ensure_metadata_table.py b/server/tests/integration/test_ensure_metadata_table.py new file mode 100644 index 0000000..4482c22 --- /dev/null +++ b/server/tests/integration/test_ensure_metadata_table.py @@ -0,0 +1,172 @@ +"""Integration tests for EnsureMetadataTable event handler.""" + +from datetime import UTC, datetime +from uuid import uuid4 + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.deposition.event.convention_registered import ConventionRegistered +from osa.domain.deposition.model.convention import Convention +from osa.domain.deposition.model.value import FileRequirements +from osa.domain.metadata.handler.ensure_metadata_table import EnsureMetadataTable +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.schema import Schema +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.event import EventId +from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA +from osa.infrastructure.persistence.repository.convention import PostgresConventionRepository +from osa.infrastructure.persistence.repository.schema import 
PostgresSemanticsSchemaRepository + +SCHEMA_IDENTITY = "urn:osa:localhost:schema:bio-sample" +SCHEMA_V1 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.0.0") +SCHEMA_V11 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.1.0") + + +def _fields_v1() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _fields_v11() -> list[FieldDefinition]: + return _fields_v1() + [ + FieldDefinition( + name="collection_site", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _seed_schema( + session: AsyncSession, srn: SchemaSRN, fields: list[FieldDefinition], title: str = "bio_sample" +) -> None: + repo = PostgresSemanticsSchemaRepository(session) + await repo.save(Schema(srn=srn, title=title, fields=fields, created_at=datetime.now(UTC))) + + +async def _seed_convention( + session: AsyncSession, srn: ConventionSRN, schema_srn: SchemaSRN +) -> None: + repo = PostgresConventionRepository(session) + await repo.save( + Convention( + srn=srn, + title="bio_sample_v1", + description=None, + schema_srn=schema_srn, + file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), + hooks=[], + created_at=datetime.now(UTC), + ) + ) + + +def _event( + convention_srn: ConventionSRN, + schema_srn: SchemaSRN, + schema_fields: list[FieldDefinition], +) -> ConventionRegistered: + return ConventionRegistered( + id=EventId(uuid4()), + convention_srn=convention_srn, + schema_srn=schema_srn, + schema_fields=schema_fields, + hooks=[], + ) + + +async def _make_handler(pg_engine: AsyncEngine, pg_session: AsyncSession) -> EnsureMetadataTable: + store = PostgresMetadataStore(pg_engine, pg_session) + service = MetadataService(metadata_store=store) + return EnsureMetadataTable( + metadata_service=service, + schema_repo=PostgresSemanticsSchemaRepository(pg_session), + convention_repo=PostgresConventionRepository(pg_session), + ) + + +async def _catalog_row_count(engine: AsyncEngine) -> int: + async with engine.begin() as conn: + return int((await conn.execute(text("SELECT COUNT(*) FROM metadata_tables"))).scalar() or 0) + + +async def _table_columns(engine: AsyncEngine, pg_table: str) -> set[str]: + async with engine.begin() as conn: + result = await conn.execute( + text( + "SELECT column_name FROM information_schema.columns " + "WHERE table_schema = :s AND table_name = :t" + ), + {"s": METADATA_SCHEMA, "t": pg_table}, + ) + return {row[0] for row in result.fetchall()} + + +@pytest.mark.asyncio +class TestEnsureMetadataTable: + async def test_first_event_creates_table_and_catalog_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + conv_srn = ConventionSRN.parse("urn:osa:localhost:conv:conv-c1@1.0.0") + await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) + await _seed_convention(pg_session, conv_srn, SCHEMA_V1) + await pg_session.commit() + + handler = await _make_handler(pg_engine, pg_session) + await handler.handle(_event(conv_srn, SCHEMA_V1, _fields_v1())) + await pg_session.commit() + + assert await _catalog_row_count(pg_engine) == 1 + cols = await _table_columns(pg_engine, "bio_sample_v1") + assert "species" in cols + + async def test_second_event_same_schema_is_noop( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + conv_a = ConventionSRN.parse("urn:osa:localhost:conv:conv-a1@1.0.0") + conv_b = ConventionSRN.parse("urn:osa:localhost:conv:conv-b1@1.0.0") + await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) + await 
_seed_convention(pg_session, conv_a, SCHEMA_V1) + await _seed_convention(pg_session, conv_b, SCHEMA_V1) + await pg_session.commit() + + handler = await _make_handler(pg_engine, pg_session) + await handler.handle(_event(conv_a, SCHEMA_V1, _fields_v1())) + await handler.handle(_event(conv_b, SCHEMA_V1, _fields_v1())) + await pg_session.commit() + + # Still one catalog row, one table. + assert await _catalog_row_count(pg_engine) == 1 + + async def test_additive_bump_alters_table( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + conv_a = ConventionSRN.parse("urn:osa:localhost:conv:conv-a1@1.0.0") + conv_b = ConventionSRN.parse("urn:osa:localhost:conv:conv-b1@1.0.0") + await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) + await _seed_schema(pg_session, SCHEMA_V11, _fields_v11()) + await _seed_convention(pg_session, conv_a, SCHEMA_V1) + await _seed_convention(pg_session, conv_b, SCHEMA_V11) + await pg_session.commit() + + handler = await _make_handler(pg_engine, pg_session) + await handler.handle(_event(conv_a, SCHEMA_V1, _fields_v1())) + cols_before = await _table_columns(pg_engine, "bio_sample_v1") + assert "collection_site" not in cols_before + + await handler.handle(_event(conv_b, SCHEMA_V11, _fields_v11())) + await pg_session.commit() + + cols_after = await _table_columns(pg_engine, "bio_sample_v1") + assert "collection_site" in cols_after diff --git a/server/tests/integration/test_event_batch_processing.py b/server/tests/integration/test_event_batch_processing.py index 41b026f..49c8d7e 100644 --- a/server/tests/integration/test_event_batch_processing.py +++ b/server/tests/integration/test_event_batch_processing.py @@ -64,6 +64,8 @@ def make_record_published( domain=Domain("test.example.com"), id=LocalId(str(uuid4())), ) + from osa.domain.shared.model.srn import SchemaSRN + return RecordPublished( id=EventId(uuid4()), record_srn=RecordSRN( @@ -73,6 +75,7 @@ def make_record_published( ), source=DepositionSource(id=str(dep_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata=metadata or {"title": "Test Record"}, ) diff --git a/server/tests/integration/test_insert_record_metadata.py b/server/tests/integration/test_insert_record_metadata.py new file mode 100644 index 0000000..a677c9d --- /dev/null +++ b/server/tests/integration/test_insert_record_metadata.py @@ -0,0 +1,104 @@ +"""Integration tests for InsertRecordMetadata event handler.""" + +from uuid import uuid4 + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.handler.insert_record_metadata import InsertRecordMetadata +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.record.event.record_published import RecordPublished +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.event import EventId +from osa.domain.shared.model.source import DepositionSource +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA + +from tests.integration.conftest import seed_record + +SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") +CONV_SRN = ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") + + +def _fields() -> list[FieldDefinition]: + return [ + 
FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def _event(record_srn: RecordSRN, metadata: dict) -> RecordPublished: + return RecordPublished( + id=EventId(uuid4()), + record_srn=record_srn, + source=DepositionSource(id="dep-1"), + convention_srn=CONV_SRN, + schema_srn=SCHEMA_V1, + metadata=metadata, + expected_features=[], + ) + + +@pytest.mark.asyncio +class TestInsertRecordMetadata: + async def test_insert_creates_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:one@1") + await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + + handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store)) + await handler.handle(_event(record_srn, {"species": "Homo sapiens", "resolution": 3.5})) + await pg_session.commit() + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text( + f"SELECT species, resolution " + f'FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' + f"WHERE record_srn = :srn" + ), + {"srn": str(record_srn)}, + ) + ).first() + assert row is not None + assert row[0] == "Homo sapiens" + assert row[1] == 3.5 + + async def test_duplicate_delivery_is_idempotent( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, "bio_sample", _fields()) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1") + await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + + handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store)) + event = _event(record_srn, {"species": "Mus musculus", "resolution": 1.0}) + + await handler.handle(event) + await handler.handle(event) + await pg_session.commit() + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 1 diff --git a/server/tests/integration/test_metadata_additive_evolve_postgres.py b/server/tests/integration/test_metadata_additive_evolve_postgres.py new file mode 100644 index 0000000..d3940b1 --- /dev/null +++ b/server/tests/integration/test_metadata_additive_evolve_postgres.py @@ -0,0 +1,106 @@ +"""Integration tests for additive schema evolution end-to-end.""" + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA + +from tests.integration.conftest import seed_record + +IDENTITY = "urn:osa:localhost:schema:bio-sample" +SCHEMA_V10 = SchemaSRN.parse(f"{IDENTITY}@1.0.0") +SCHEMA_V11 = SchemaSRN.parse(f"{IDENTITY}@1.1.0") + + +def _fields_v10() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +def 
_fields_v11() -> list[FieldDefinition]: + return _fields_v10() + [ + FieldDefinition( + name="collection_site", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.mark.asyncio +class TestAdditiveEvolvePipeline: + async def test_old_row_null_new_row_typed( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + + # Register v1.0.0 and publish a record. + await service.ensure_table(SCHEMA_V10, "bio_sample", _fields_v10()) + r_old = RecordSRN.parse("urn:osa:localhost:rec:old@1") + await seed_record(pg_engine, srn=str(r_old), schema_srn=str(SCHEMA_V10)) + await service.insert(SCHEMA_V10, r_old, {"species": "Mus musculus"}) + await pg_session.commit() + + # Bump to v1.1.0 (additive) and publish another record carrying the new field. + await service.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11()) + r_new = RecordSRN.parse("urn:osa:localhost:rec:new@1") + await seed_record(pg_engine, srn=str(r_new), schema_srn=str(SCHEMA_V11)) + await service.insert( + SCHEMA_V11, r_new, {"species": "Homo sapiens", "collection_site": "Lab A"} + ) + await pg_session.commit() + + # Old row: NULL in new column. + async with pg_engine.begin() as conn: + old_site = ( + await conn.execute( + text( + f'SELECT collection_site FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' + f"WHERE record_srn = :srn" + ), + {"srn": str(r_old)}, + ) + ).scalar() + new_site = ( + await conn.execute( + text( + f'SELECT collection_site FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' + f"WHERE record_srn = :srn" + ), + {"srn": str(r_new)}, + ) + ).scalar() + assert old_site is None + assert new_site == "Lab A" + + async def test_catalog_lineage_has_both_srns( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await service.ensure_table(SCHEMA_V10, "bio_sample", _fields_v10()) + await service.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11()) + + async with pg_engine.begin() as conn: + versions = ( + await conn.execute( + text( + "SELECT schema_versions FROM metadata_tables " + "WHERE schema_identity = :id AND schema_major = 1" + ), + {"id": IDENTITY}, + ) + ).scalar() + assert str(SCHEMA_V10) in versions + assert str(SCHEMA_V11) in versions diff --git a/server/tests/integration/test_non_additive_rejected_postgres.py b/server/tests/integration/test_non_additive_rejected_postgres.py new file mode 100644 index 0000000..200e571 --- /dev/null +++ b/server/tests/integration/test_non_additive_rejected_postgres.py @@ -0,0 +1,104 @@ +"""Integration tests for non-additive schema evolution rejection (FR-023).""" + +import pytest +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaSRN +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore + +IDENTITY = "urn:osa:localhost:schema:bio-sample" +V1 = SchemaSRN.parse(f"{IDENTITY}@1.0.0") +V11 = SchemaSRN.parse(f"{IDENTITY}@1.1.0") + + +def _orig() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + 
required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +@pytest.mark.asyncio +class TestNonAdditiveRejected: + async def test_rename_field_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, "bio_sample", _orig()) + + # New field "organism" is optional so the validator reaches the removal + # check and reports the dropped "species" field specifically. + renamed = [ + FieldDefinition( + name="organism", + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError) as exc: + await svc.ensure_table(V11, "bio_sample", renamed) + message = str(exc.value) + assert "species" in message and "removed" in message + + async def test_type_change_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, "bio_sample", _orig()) + + retyped = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + # Previously NUMBER, now TEXT — retype is non-additive. + type=FieldType.TEXT, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError, match="resolution"): + await svc.ensure_table(V11, "bio_sample", retyped) + + async def test_tightening_required_rejected( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session)) + await svc.ensure_table(V1, "bio_sample", _orig()) + + tightened = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=True, # was False + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + with pytest.raises(ValidationError, match="resolution"): + await svc.ensure_table(V11, "bio_sample", tightened) diff --git a/server/tests/unit/domain/deposition/test_convention_registered.py b/server/tests/unit/domain/deposition/test_convention_registered.py index c97a4e9..12cee71 100644 --- a/server/tests/unit/domain/deposition/test_convention_registered.py +++ b/server/tests/unit/domain/deposition/test_convention_registered.py @@ -14,13 +14,17 @@ OciConfig, TableFeatureSpec, ) -from osa.domain.shared.model.srn import ConventionSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_srn() -> SchemaSRN: + return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") + + def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition: return HookDefinition( name=name, @@ -42,6 +46,8 @@ def test_event_carries_hooks(self): event = ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), + schema_fields=[], hooks=hooks, ) @@ -54,6 +60,7 @@ def test_event_defaults_to_empty_hooks(self): event = ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), ) assert event.hooks == [] @@ -64,6 +71,8 @@ def test_serialization_with_hooks(self): event = 
ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), + schema_fields=[], hooks=hooks, ) diff --git a/server/tests/unit/domain/deposition/test_event_chain.py b/server/tests/unit/domain/deposition/test_event_chain.py index 3a6243a..9cd952f 100644 --- a/server/tests/unit/domain/deposition/test_event_chain.py +++ b/server/tests/unit/domain/deposition/test_event_chain.py @@ -26,6 +26,7 @@ ConventionSRN, DepositionSRN, RecordSRN, + SchemaSRN, ValidationRunSRN, ) from osa.domain.validation.event.validation_completed import ValidationCompleted @@ -33,6 +34,10 @@ from osa.domain.validation.model import RunStatus +def _make_schema_srn() -> SchemaSRN: + return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") + + def _make_dep_srn() -> DepositionSRN: return DepositionSRN.parse("urn:osa:localhost:dep:test-dep") @@ -263,6 +268,7 @@ async def test_delegates_to_feature_service(self): source=DepositionSource(id=str(_make_dep_srn())), metadata={"title": "Test"}, convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), expected_features=["pocket_detect"], ) await handler.handle(event) diff --git a/server/tests/unit/domain/discovery/test_discovery_service.py b/server/tests/unit/domain/discovery/test_discovery_service.py index c38a79d..3f1cad4 100644 --- a/server/tests/unit/domain/discovery/test_discovery_service.py +++ b/server/tests/unit/domain/discovery/test_discovery_service.py @@ -1,23 +1,40 @@ -"""Tests for DiscoveryService — filter validation, operator validation, delegation.""" +"""Tests for DiscoveryService — FilterExpr validation, operator validation, delegation.""" from datetime import UTC, datetime from unittest.mock import AsyncMock import pytest +from osa.config import Config +from osa.domain.discovery.model.refs import MetadataFieldRef from osa.domain.discovery.model.value import ( + And, ColumnInfo, FeatureCatalogEntry, FeatureRow, - Filter, FilterOperator, + Predicate, RecordSummary, SortOrder, + decode_cursor, + encode_cursor, ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN + + +SCHEMA_SRN = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") + + +def _config() -> Config: + # Build a Config with minimal auth — tests don't hit JWT paths + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) # Test-only secret + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] @pytest.fixture @@ -38,19 +55,37 @@ def mock_field_reader() -> AsyncMock: "is_public": FieldType.BOOLEAN, "homepage": FieldType.URL, } + reader.get_fields_for_schema.return_value = { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + "method": FieldType.TERM, + "published_date": FieldType.DATE, + "is_public": FieldType.BOOLEAN, + "homepage": FieldType.URL, + } return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) + + +def _eq(field: str, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=FilterOperator.EQ, value=value) class 
TestSearchRecordsValidation: async def test_rejects_unknown_filter_field(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="Unknown field 'bogus'"): + with pytest.raises(ValidationError, match="Unknown metadata field 'bogus'"): await service.search_records( - filters=[Filter(field="bogus", operator=FilterOperator.EQ, value="x")], + filter_expr=_eq("bogus", "x"), + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -59,9 +94,15 @@ async def test_rejects_unknown_filter_field(self, service: DiscoveryService) -> ) async def test_rejects_invalid_operator_for_type(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="contains"): + with pytest.raises(ValidationError, match="not valid"): await service.search_records( - filters=[Filter(field="resolution", operator=FilterOperator.CONTAINS, value="x")], + filter_expr=Predicate( + field=MetadataFieldRef(field="resolution"), + op=FilterOperator.CONTAINS, + value="x", + ), + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -72,7 +113,9 @@ async def test_rejects_invalid_operator_for_type(self, service: DiscoveryService async def test_rejects_unknown_sort_field(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="Unknown sort field"): await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="nonexistent", order=SortOrder.DESC, @@ -82,7 +125,9 @@ async def test_rejects_unknown_sort_field(self, service: DiscoveryService) -> No async def test_accepts_published_at_sort(self, service: DiscoveryService) -> None: result = await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -93,7 +138,9 @@ async def test_accepts_published_at_sort(self, service: DiscoveryService) -> Non async def test_accepts_metadata_field_sort(self, service: DiscoveryService) -> None: result = await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="resolution", order=SortOrder.ASC, @@ -105,7 +152,9 @@ async def test_accepts_metadata_field_sort(self, service: DiscoveryService) -> N async def test_rejects_limit_too_low(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="limit"): await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -116,7 +165,9 @@ async def test_rejects_limit_too_low(self, service: DiscoveryService) -> None: async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="limit"): await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -125,16 +176,20 @@ async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None: ) async def test_rejects_q_when_no_text_fields(self, mock_read_store: AsyncMock) -> None: - """q should raise when no TEXT/URL fields exist to search against.""" no_text_reader = AsyncMock() - no_text_reader.get_all_field_types.return_value = { - "resolution": FieldType.NUMBER, - } - svc = DiscoveryService(read_store=mock_read_store, field_reader=no_text_reader) + 
no_text_reader.get_all_field_types.return_value = {"resolution": FieldType.NUMBER} + no_text_reader.get_fields_for_schema.return_value = {"resolution": FieldType.NUMBER} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=no_text_reader, + config=_config(), + ) with pytest.raises(ValidationError, match="Free-text search is unavailable"): await svc.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q="kinase", sort="published_at", order=SortOrder.DESC, @@ -148,7 +203,9 @@ async def test_delegates_to_read_store( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: await service.search_records( - filters=[Filter(field="method", operator=FilterOperator.EQ, value="X-ray")], + filter_expr=_eq("method", "X-ray"), + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -158,38 +215,19 @@ async def test_delegates_to_read_store( mock_read_store.search_records.assert_called_once() call_kwargs = mock_read_store.search_records.call_args - assert len(call_kwargs.kwargs["filters"]) == 1 + assert call_kwargs.kwargs["filter_expr"] is not None assert call_kwargs.kwargs["q"] is None assert call_kwargs.kwargs["sort"] == "published_at" - assert call_kwargs.kwargs["limit"] == 21 # N+1 trick - - async def test_extracts_text_fields_for_q( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - await service.search_records( - filters=[], - q="kinase", - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=20, - ) - - call_kwargs = mock_read_store.search_records.call_args - text_fields = call_kwargs.kwargs["text_fields"] - # title (TEXT) and homepage (URL) are text-searchable - assert "title" in text_fields - assert "homepage" in text_fields - assert "resolution" not in text_fields + assert call_kwargs.kwargs["limit"] == 21 # N+1 async def test_decodes_cursor( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - from osa.domain.discovery.model.value import encode_cursor - cursor = encode_cursor("2026-01-01", "urn:osa:localhost:rec:abc@1") await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -205,7 +243,9 @@ async def test_decodes_cursor( async def test_invalid_cursor_raises(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="cursor"): await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -218,13 +258,14 @@ async def test_encodes_next_cursor_from_results( ) -> None: srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") ts = datetime(2026, 1, 1, tzinfo=UTC) - # Return limit+1 rows so the service detects has_more=True mock_read_store.search_records.return_value = [ RecordSummary(srn=srn, published_at=ts, metadata={"title": f"r{i}"}) for i in range(2) ] result = await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -234,10 +275,7 @@ async def test_encodes_next_cursor_from_results( assert result.has_more is True assert result.cursor is not None - assert len(result.results) == 1 # trimmed back to limit - - from osa.domain.discovery.model.value import decode_cursor - + assert len(result.results) == 1 decoded = decode_cursor(result.cursor) assert decoded["id"] == str(srn) @@ 
-247,7 +285,9 @@ async def test_no_cursor_when_no_more_results( mock_read_store.search_records.return_value = [] result = await service.search_records( - filters=[], + filter_expr=None, + schema_srn=SCHEMA_SRN, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, @@ -259,111 +299,51 @@ async def test_no_cursor_when_no_more_results( assert result.has_more is False -class TestSearchRecordsPagination: - async def test_has_more_false_when_exactly_limit_rows( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - """Exactly limit rows should NOT report has_more (no false positive).""" - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - ts = datetime(2026, 1, 1, tzinfo=UTC) - mock_read_store.search_records.return_value = [ - RecordSummary(srn=srn, published_at=ts, metadata={"title": f"r{i}"}) for i in range(3) - ] - - result = await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=3, - ) - - assert result.has_more is False - assert result.cursor is None - assert len(result.results) == 3 - - async def test_has_more_true_when_more_than_limit_rows( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - """Adapter returning limit+1 rows signals more pages exist.""" - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - ts = datetime(2026, 1, 1, tzinfo=UTC) - mock_read_store.search_records.return_value = [ - RecordSummary(srn=srn, published_at=ts, metadata={"title": f"r{i}"}) for i in range(4) - ] - - result = await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=3, - ) - - assert result.has_more is True - assert result.cursor is not None - assert len(result.results) == 3 # trimmed back to limit - - async def test_passes_limit_plus_one_to_read_store( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - """Service should fetch one extra row to detect more pages.""" - await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=20, - ) - - call_kwargs = mock_read_store.search_records.call_args - assert call_kwargs.kwargs["limit"] == 21 - - -class TestSearchRecordsFieldTypes: - async def test_passes_field_types_to_read_store( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - await service.search_records( - filters=[], - q=None, - sort="published_at", - order=SortOrder.DESC, - cursor=None, - limit=20, - ) +class TestFilterBounds: + async def test_depth_exceeded_raises(self, service: DiscoveryService) -> None: + # Build a nest of AND that exceeds the default depth (10) + leaf = _eq("title", "r") + tree = leaf + for _ in range(11): + tree = And(operands=[tree, leaf]) - call_kwargs = mock_read_store.search_records.call_args - field_types = call_kwargs.kwargs["field_types"] - assert field_types["resolution"] == FieldType.NUMBER - assert field_types["title"] == FieldType.TEXT + with pytest.raises(ValidationError, match="filter_depth_exceeded|depth"): + await service.search_records( + filter_expr=tree, + schema_srn=SCHEMA_SRN, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) class TestFeatureCursorEncoding: async def test_cursor_encodes_row_id( self, mock_read_store: AsyncMock, mock_field_reader: AsyncMock ) -> None: - from osa.domain.discovery.model.value import decode_cursor - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") 
mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( hook_name="detect_pockets", columns=[ColumnInfo(name="score", type="number", required=True)], record_count=0, ) - # Return limit+1 rows so the service detects has_more=True mock_read_store.search_features.return_value = [ FeatureRow(row_id=42, record_srn=srn, data={"score": 7.66}), FeatureRow(row_id=43, record_srn=srn, data={"score": 6.0}), ] - service = DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + service = DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -377,39 +357,3 @@ async def test_cursor_encodes_row_id( decoded = decode_cursor(result.cursor) assert decoded["id"] == 42 assert decoded["s"] == 7.66 - - async def test_cursor_uses_row_id_for_id_sort( - self, mock_read_store: AsyncMock, mock_field_reader: AsyncMock - ) -> None: - from osa.domain.discovery.model.value import decode_cursor - - srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( - hook_name="detect_pockets", - columns=[ColumnInfo(name="score", type="number", required=True)], - record_count=0, - ) - # Return limit+1 rows so the service detects has_more=True - mock_read_store.search_features.return_value = [ - FeatureRow(row_id=99, record_srn=srn, data={"score": 5.0}), - FeatureRow(row_id=98, record_srn=srn, data={"score": 4.0}), - ] - - service = DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) - result = await service.search_features( - hook_name="detect_pockets", - filters=[], - record_srn=None, - sort="id", - order=SortOrder.DESC, - cursor=None, - limit=1, - ) - - assert result.has_more is True - assert result.cursor is not None - assert len(result.rows) == 1 - decoded = decode_cursor(result.cursor) - # When sort is "id", sort_val should be the row_id itself - assert decoded["s"] == 99 - assert decoded["id"] == 99 diff --git a/server/tests/unit/domain/discovery/test_get_feature_catalog.py b/server/tests/unit/domain/discovery/test_get_feature_catalog.py index 6cfb1fe..f6197e3 100644 --- a/server/tests/unit/domain/discovery/test_get_feature_catalog.py +++ b/server/tests/unit/domain/discovery/test_get_feature_catalog.py @@ -10,9 +10,18 @@ GetFeatureCatalogHandler, GetFeatureCatalogResult, ) +from osa.config import Config from osa.domain.discovery.service.discovery import DiscoveryService +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + @pytest.fixture def mock_read_store() -> AsyncMock: return AsyncMock() @@ -22,12 +31,17 @@ def mock_read_store() -> AsyncMock: def mock_field_reader() -> AsyncMock: reader = AsyncMock() reader.get_all_field_types.return_value = {} + reader.get_fields_for_schema.return_value = {} return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) class TestGetFeatureCatalogHandler: diff --git 
a/server/tests/unit/domain/discovery/test_search_features.py b/server/tests/unit/domain/discovery/test_search_features.py index 8c46b1f..7fa4fcb 100644 --- a/server/tests/unit/domain/discovery/test_search_features.py +++ b/server/tests/unit/domain/discovery/test_search_features.py @@ -4,13 +4,15 @@ import pytest +from osa.config import Config +from osa.domain.discovery.model.refs import FeatureFieldRef from osa.domain.discovery.model.value import ( ColumnInfo, FeatureCatalogEntry, FeatureRow, FeatureSearchResult, - Filter, FilterOperator, + Predicate, SortOrder, ) from osa.domain.discovery.query.search_features import ( @@ -35,6 +37,18 @@ def _make_catalog_entry() -> FeatureCatalogEntry: ) +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + +def _predicate(hook: str, column: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=FeatureFieldRef(hook=hook, column=column), op=op, value=value) + + @pytest.fixture def mock_read_store() -> AsyncMock: store = AsyncMock() @@ -47,12 +61,17 @@ def mock_read_store() -> AsyncMock: def mock_field_reader() -> AsyncMock: reader = AsyncMock() reader.get_all_field_types.return_value = {} + reader.get_fields_for_schema.return_value = {} return reader @pytest.fixture def service(mock_read_store: AsyncMock, mock_field_reader: AsyncMock) -> DiscoveryService: - return DiscoveryService(read_store=mock_read_store, field_reader=mock_field_reader) + return DiscoveryService( + read_store=mock_read_store, + field_reader=mock_field_reader, + config=_config(), + ) class TestSearchFeaturesHandler: @@ -103,10 +122,11 @@ async def test_raises_not_found_for_unknown_hook( ) -> None: mock_read_store.get_feature_table_schema.return_value = None - with pytest.raises(NotFoundError, match="unknown_hook"): + with pytest.raises(NotFoundError): await service.search_features( hook_name="unknown_hook", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -118,7 +138,8 @@ async def test_rejects_unknown_column(self, service: DiscoveryService) -> None: with pytest.raises(ValidationError, match="bogus"): await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="bogus", operator=FilterOperator.EQ, value=1)], + filter_expr=_predicate("detect_pockets", "bogus", FilterOperator.EQ, 1), + schema_srn=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -130,19 +151,8 @@ async def test_validates_operator_for_number_column(self, service: DiscoveryServ with pytest.raises(ValidationError, match="contains"): await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="score", operator=FilterOperator.CONTAINS, value="x")], - record_srn=None, - sort="id", - order=SortOrder.DESC, - cursor=None, - limit=50, - ) - - async def test_validates_operator_for_boolean_column(self, service: DiscoveryService) -> None: - with pytest.raises(ValidationError, match="gte"): - await service.search_features( - hook_name="detect_pockets", - filters=[Filter(field="is_active", operator=FilterOperator.GTE, value=True)], + filter_expr=_predicate("detect_pockets", "score", FilterOperator.CONTAINS, "x"), + schema_srn=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -153,7 +163,8 @@ async def test_validates_operator_for_boolean_column(self, service: DiscoverySer async def test_accepts_string_contains_operator(self, service: 
DiscoveryService) -> None: await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="label", operator=FilterOperator.CONTAINS, value="test")], + filter_expr=_predicate("detect_pockets", "label", FilterOperator.CONTAINS, "test"), + schema_srn=None, record_srn=None, sort="id", order=SortOrder.DESC, @@ -167,7 +178,8 @@ async def test_passes_record_srn_filter( srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=srn, sort="id", order=SortOrder.DESC, @@ -178,33 +190,13 @@ async def test_passes_record_srn_filter( call_kwargs = mock_read_store.search_features.call_args assert call_kwargs.kwargs["record_srn"] == srn - async def test_decodes_cursor( - self, service: DiscoveryService, mock_read_store: AsyncMock - ) -> None: - from osa.domain.discovery.model.value import encode_cursor - - cursor = encode_cursor(7.66, 42) - await service.search_features( - hook_name="detect_pockets", - filters=[], - record_srn=None, - sort="score", - order=SortOrder.DESC, - cursor=cursor, - limit=50, - ) - - call_kwargs = mock_read_store.search_features.call_args - decoded = call_kwargs.kwargs["cursor"] - assert decoded["s"] == 7.66 - assert decoded["id"] == 42 - async def test_delegates_to_read_store( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: await service.search_features( hook_name="detect_pockets", - filters=[Filter(field="score", operator=FilterOperator.GTE, value=6.0)], + filter_expr=_predicate("detect_pockets", "score", FilterOperator.GTE, 6.0), + schema_srn=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -215,14 +207,13 @@ async def test_delegates_to_read_store( mock_read_store.search_features.assert_called_once() call_kwargs = mock_read_store.search_features.call_args assert call_kwargs.kwargs["hook_name"] == "detect_pockets" - assert len(call_kwargs.kwargs["filters"]) == 1 + assert call_kwargs.kwargs["filter_expr"] is not None class TestSearchFeaturesPagination: async def test_has_more_false_when_exactly_limit_rows( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Exactly limit rows should NOT report has_more (no false positive).""" srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") mock_read_store.search_features.return_value = [ FeatureRow(row_id=i, record_srn=srn, data={"score": float(i)}) for i in range(3) @@ -230,7 +221,8 @@ async def test_has_more_false_when_exactly_limit_rows( result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -245,7 +237,6 @@ async def test_has_more_false_when_exactly_limit_rows( async def test_has_more_true_when_more_than_limit_rows( self, service: DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Adapter returning limit+1 rows signals more pages exist.""" srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") mock_read_store.search_features.return_value = [ FeatureRow(row_id=i, record_srn=srn, data={"score": float(i)}) for i in range(4) @@ -253,7 +244,8 @@ async def test_has_more_true_when_more_than_limit_rows( result = await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=None, sort="score", order=SortOrder.DESC, @@ -268,10 +260,10 @@ async def test_has_more_true_when_more_than_limit_rows( async def test_passes_limit_plus_one_to_read_store( self, service: 
DiscoveryService, mock_read_store: AsyncMock ) -> None: - """Service should fetch one extra row to detect more pages.""" await service.search_features( hook_name="detect_pockets", - filters=[], + filter_expr=None, + schema_srn=None, record_srn=None, sort="score", order=SortOrder.DESC, diff --git a/server/tests/unit/domain/discovery/test_search_records.py b/server/tests/unit/domain/discovery/test_search_records.py index c713a5b..a558d37 100644 --- a/server/tests/unit/domain/discovery/test_search_records.py +++ b/server/tests/unit/domain/discovery/test_search_records.py @@ -40,7 +40,9 @@ async def test_delegates_to_service( await handler.run(cmd) mock_service.search_records.assert_called_once_with( - filters=[], + filter_expr=None, + schema_srn=None, + convention_srn=None, q=None, sort="published_at", order=SortOrder.DESC, diff --git a/server/tests/unit/domain/discovery/test_value.py b/server/tests/unit/domain/discovery/test_value.py index 0accb29..18518f5 100644 --- a/server/tests/unit/domain/discovery/test_value.py +++ b/server/tests/unit/domain/discovery/test_value.py @@ -69,31 +69,38 @@ def test_non_dict_payload(self) -> None: class TestValidOperators: - def test_text_operators(self) -> None: - assert VALID_OPERATORS[FieldType.TEXT] == {FilterOperator.EQ, FilterOperator.CONTAINS} - - def test_url_operators(self) -> None: - assert VALID_OPERATORS[FieldType.URL] == {FilterOperator.EQ, FilterOperator.CONTAINS} - - def test_number_operators(self) -> None: - assert VALID_OPERATORS[FieldType.NUMBER] == { - FilterOperator.EQ, - FilterOperator.GTE, - FilterOperator.LTE, - } - - def test_date_operators(self) -> None: - assert VALID_OPERATORS[FieldType.DATE] == { - FilterOperator.EQ, - FilterOperator.GTE, - FilterOperator.LTE, - } + def test_text_operators_include_basics(self) -> None: + ops = VALID_OPERATORS[FieldType.TEXT] + assert FilterOperator.EQ in ops + assert FilterOperator.CONTAINS in ops + assert FilterOperator.IN in ops + assert FilterOperator.NEQ in ops + + def test_url_operators_include_basics(self) -> None: + ops = VALID_OPERATORS[FieldType.URL] + assert FilterOperator.EQ in ops + assert FilterOperator.CONTAINS in ops + assert FilterOperator.IN in ops + + def test_number_operators_support_ordering(self) -> None: + ops = VALID_OPERATORS[FieldType.NUMBER] + assert FilterOperator.EQ in ops + assert FilterOperator.GT in ops + assert FilterOperator.GTE in ops + assert FilterOperator.LT in ops + assert FilterOperator.LTE in ops + + def test_date_operators_support_ordering(self) -> None: + ops = VALID_OPERATORS[FieldType.DATE] + assert FilterOperator.GTE in ops + assert FilterOperator.LTE in ops def test_boolean_operators(self) -> None: - assert VALID_OPERATORS[FieldType.BOOLEAN] == {FilterOperator.EQ} + assert FilterOperator.EQ in VALID_OPERATORS[FieldType.BOOLEAN] + assert FilterOperator.IS_NULL in VALID_OPERATORS[FieldType.BOOLEAN] def test_term_operators(self) -> None: - assert VALID_OPERATORS[FieldType.TERM] == {FilterOperator.EQ} + assert FilterOperator.EQ in VALID_OPERATORS[FieldType.TERM] def test_all_field_types_have_operators(self) -> None: for ft in FieldType: diff --git a/server/tests/unit/domain/feature/test_convention_ready.py b/server/tests/unit/domain/feature/test_convention_ready.py deleted file mode 100644 index 8f5ad1e..0000000 --- a/server/tests/unit/domain/feature/test_convention_ready.py +++ /dev/null @@ -1,42 +0,0 @@ -"""Unit tests for ConventionReady event. - -Tests for User Story 2: Convention Initialization Chain. 
-""" - -from uuid import uuid4 - -from osa.domain.feature.event.convention_ready import ConventionReady -from osa.domain.shared.event import EventId -from osa.domain.shared.model.srn import ConventionSRN - - -def _make_conv_srn() -> ConventionSRN: - return ConventionSRN.parse("urn:osa:localhost:conv:test-conv@1.0.0") - - -class TestConventionReady: - def test_creation_with_convention_srn(self): - """ConventionReady event carries convention_srn.""" - srn = _make_conv_srn() - event = ConventionReady(id=EventId(uuid4()), convention_srn=srn) - - assert event.convention_srn == srn - assert event.id is not None - - def test_serialization_roundtrip(self): - """ConventionReady serializes and deserializes correctly.""" - srn = _make_conv_srn() - event = ConventionReady(id=EventId(uuid4()), convention_srn=srn) - - data = event.model_dump() - restored = ConventionReady.model_validate(data) - - assert restored.convention_srn == event.convention_srn - assert restored.id == event.id - - def test_registered_in_event_registry(self): - """ConventionReady should be auto-registered in Event._registry.""" - from osa.domain.shared.event import Event - - assert "ConventionReady" in Event._registry - assert Event._registry["ConventionReady"] is ConventionReady diff --git a/server/tests/unit/domain/feature/test_create_feature_tables.py b/server/tests/unit/domain/feature/test_create_feature_tables.py index 712e68d..8abda9e 100644 --- a/server/tests/unit/domain/feature/test_create_feature_tables.py +++ b/server/tests/unit/domain/feature/test_create_feature_tables.py @@ -1,7 +1,4 @@ -"""Unit tests for CreateFeatureTables event handler. - -Tests for User Story 2: Convention Initialization Chain. -""" +"""Unit tests for CreateFeatureTables event handler.""" from unittest.mock import AsyncMock from uuid import uuid4 @@ -9,7 +6,6 @@ import pytest from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.feature.event.convention_ready import ConventionReady from osa.domain.feature.handler.create_feature_tables import CreateFeatureTables from osa.domain.shared.error import ConflictError from osa.domain.shared.event import EventId @@ -19,13 +15,17 @@ OciConfig, TableFeatureSpec, ) -from osa.domain.shared.model.srn import ConventionSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_srn() -> SchemaSRN: + return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") + + def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition: return HookDefinition( name=name, @@ -44,107 +44,65 @@ def _make_event(hooks: list[HookDefinition] | None = None) -> ConventionRegister return ConventionRegistered( id=EventId(uuid4()), convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), + schema_fields=[], hooks=hooks or [], ) class TestCreateFeatureTables: @pytest.mark.asyncio - async def test_creates_tables_and_emits_convention_ready(self): - """Given ConventionRegistered with hooks, creates feature tables and emits ConventionReady.""" + async def test_creates_tables_for_each_hook(self): hook = _make_hook_definition() event = _make_event(hooks=[hook]) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) feature_service.create_table.assert_called_once_with(hook) 
- outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) - assert emitted.convention_srn == event.convention_srn @pytest.mark.asyncio async def test_creates_multiple_tables(self): - """Creates a feature table for each hook in the event.""" hooks = [_make_hook_definition("hook_a"), _make_hook_definition("hook_b")] event = _make_event(hooks=hooks) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) assert feature_service.create_table.call_count == 2 - outbox.append.assert_called_once() @pytest.mark.asyncio - async def test_emits_convention_ready_with_empty_hooks(self): - """Given empty hooks, still emits ConventionReady.""" + async def test_no_hooks_is_noop(self): event = _make_event(hooks=[]) feature_service = AsyncMock() - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) feature_service.create_table.assert_not_called() - outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) @pytest.mark.asyncio - async def test_does_not_emit_convention_ready_on_failure(self): - """Feature table creation failure does not emit ConventionReady.""" + async def test_propagates_non_conflict_errors(self): hook = _make_hook_definition() event = _make_event(hooks=[hook]) feature_service = AsyncMock() feature_service.create_table.side_effect = RuntimeError("DDL failed") - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) with pytest.raises(RuntimeError, match="DDL failed"): await handler.handle(event) - outbox.append.assert_not_called() - @pytest.mark.asyncio async def test_skips_existing_tables_on_redelivery(self): - """ConflictError (table already exists) is skipped; ConventionReady still emitted.""" hooks = [_make_hook_definition("hook_a"), _make_hook_definition("hook_b")] event = _make_event(hooks=hooks) feature_service = AsyncMock() feature_service.create_table.side_effect = ConflictError("table already exists") - outbox = AsyncMock() - - handler = CreateFeatureTables( - feature_service=feature_service, - outbox=outbox, - ) + handler = CreateFeatureTables(feature_service=feature_service) await handler.handle(event) assert feature_service.create_table.call_count == 2 - outbox.append.assert_called_once() - emitted = outbox.append.call_args[0][0] - assert isinstance(emitted, ConventionReady) - assert emitted.convention_srn == event.convention_srn diff --git a/server/tests/unit/domain/feature/test_insert_record_features.py b/server/tests/unit/domain/feature/test_insert_record_features.py index 9bb2a52..d6303e7 100644 --- a/server/tests/unit/domain/feature/test_insert_record_features.py +++ b/server/tests/unit/domain/feature/test_insert_record_features.py @@ -13,6 +13,7 @@ from osa.domain.shared.model.srn import ( ConventionSRN, RecordSRN, + SchemaSRN, ) @@ -24,6 +25,10 @@ def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_srn() -> SchemaSRN: + return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") + + def _make_event( expected_features: list[str] 
| None = None, ) -> RecordPublished: @@ -33,6 +38,7 @@ def _make_event( source=DepositionSource(id="urn:osa:localhost:dep:test-dep"), metadata={"title": "Test"}, convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), expected_features=expected_features or [], ) @@ -219,6 +225,7 @@ async def test_ingest_source_uses_source_fields(self): ), metadata={"title": "Ingested"}, convention_srn=_make_conv_srn(), + schema_srn=_make_schema_srn(), expected_features=["pocket_detect"], ) await handler.handle(event) diff --git a/server/tests/unit/domain/index/test_fanout_listener.py b/server/tests/unit/domain/index/test_fanout_listener.py index 329cf08..14ee97c 100644 --- a/server/tests/unit/domain/index/test_fanout_listener.py +++ b/server/tests/unit/domain/index/test_fanout_listener.py @@ -97,6 +97,9 @@ async def test_creates_index_record_per_backend( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaSRN"] + ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata=sample_metadata, ) @@ -137,6 +140,9 @@ async def test_creates_unique_event_ids( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaSRN"] + ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata=sample_metadata, ) @@ -166,6 +172,9 @@ async def test_empty_registry_creates_no_events( record_srn=sample_record_srn, source=DepositionSource(id=str(sample_deposition_srn)), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=__import__( + "osa.domain.shared.model.srn", fromlist=["SchemaSRN"] + ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata=sample_metadata, ) diff --git a/server/tests/unit/domain/record/test_get_record_handler.py b/server/tests/unit/domain/record/test_get_record_handler.py index 8fea888..10c1771 100644 --- a/server/tests/unit/domain/record/test_get_record_handler.py +++ b/server/tests/unit/domain/record/test_get_record_handler.py @@ -8,7 +8,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.error import NotFoundError from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN def _make_record_srn() -> RecordSRN: @@ -24,6 +24,7 @@ def _make_record() -> Record: srn=_make_record_srn(), source=DepositionSource(id="urn:osa:localhost:dep:test-dep"), convention_srn=_make_conv_srn(), + schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata={"title": "Test Protein"}, published_at=datetime.now(UTC), ) diff --git a/server/tests/unit/domain/record/test_record_features.py b/server/tests/unit/domain/record/test_record_features.py index e1143f4..b4227f5 100644 --- a/server/tests/unit/domain/record/test_record_features.py +++ b/server/tests/unit/domain/record/test_record_features.py @@ -9,7 +9,7 @@ from osa.domain.record.query.get_record import GetRecord, GetRecordHandler, RecordDetail from osa.domain.record.service.record import RecordService from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, Domain, RecordSRN +from osa.domain.shared.model.srn import 
ConventionSRN, Domain, RecordSRN, SchemaSRN from osa.infrastructure.persistence.adapter.feature_reader import PostgresFeatureReader @@ -177,6 +177,7 @@ def _make_record() -> Record: srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"), source=DepositionSource(id="urn:osa:localhost:dep:dep1"), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), metadata={"title": "Test"}, published_at=datetime.now(UTC), ) @@ -191,6 +192,7 @@ async def test_get_features_delegates_to_reader(self) -> None: service = RecordService( record_repo=mock_repo, + convention_repo=AsyncMock(), outbox=mock_outbox, node_domain=Domain("localhost"), feature_reader=mock_reader, diff --git a/server/tests/unit/domain/record/test_record_published_enriched.py b/server/tests/unit/domain/record/test_record_published_enriched.py index bee3f0e..a463da8 100644 --- a/server/tests/unit/domain/record/test_record_published_enriched.py +++ b/server/tests/unit/domain/record/test_record_published_enriched.py @@ -8,7 +8,10 @@ from osa.domain.record.event.record_published import RecordPublished from osa.domain.shared.event import EventId from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN + + +SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") class TestRecordPublishedEnriched: @@ -20,6 +23,7 @@ def test_carries_source(self): source=source, metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SCHEMA, expected_features=["pocketeer"], ) assert event.source.type == "deposition" @@ -32,6 +36,7 @@ def test_carries_convention_srn(self): source=DepositionSource(id="urn:osa:localhost:dep:test"), metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SCHEMA, expected_features=[], ) assert event.convention_srn == ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") @@ -43,6 +48,7 @@ def test_carries_expected_features(self): source=DepositionSource(id="urn:osa:localhost:dep:test"), metadata={"title": "Test"}, convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SCHEMA, expected_features=["pocketeer", "qc_check"], ) assert event.expected_features == ["pocketeer", "qc_check"] diff --git a/server/tests/unit/domain/record/test_record_service.py b/server/tests/unit/domain/record/test_record_service.py index e62c2cd..83758aa 100644 --- a/server/tests/unit/domain/record/test_record_service.py +++ b/server/tests/unit/domain/record/test_record_service.py @@ -1,19 +1,26 @@ """Unit tests for RecordService.""" +from datetime import UTC, datetime from unittest.mock import AsyncMock, MagicMock from uuid import uuid4 import pytest +from osa.domain.deposition.model.convention import Convention +from osa.domain.deposition.model.value import FileRequirements +from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.record.event.record_published import RecordPublished from osa.domain.record.model.draft import RecordDraft from osa.domain.record.port.repository import RecordRepository from osa.domain.record.service.record import RecordService -from osa.domain.shared.model.source import ( - DepositionSource, - IngestSource, +from osa.domain.shared.model.source import DepositionSource, IngestSource +from osa.domain.shared.model.srn 
import ( + ConventionSRN, + DepositionSRN, + Domain, + LocalId, + SchemaSRN, ) -from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, LocalId from osa.domain.shared.outbox import Outbox @@ -21,6 +28,22 @@ def _make_conv_srn() -> ConventionSRN: return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") +def _make_schema_srn() -> SchemaSRN: + return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0") + + +def _make_convention() -> Convention: + return Convention( + srn=_make_conv_srn(), + title="Test Convention", + description=None, + schema_srn=_make_schema_srn(), + file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), + hooks=[], + created_at=datetime.now(UTC), + ) + + @pytest.fixture def mock_record_repo() -> RecordRepository: repo = MagicMock(spec=RecordRepository) @@ -28,6 +51,13 @@ def mock_record_repo() -> RecordRepository: return repo +@pytest.fixture +def mock_convention_repo() -> ConventionRepository: + repo = MagicMock(spec=ConventionRepository) + repo.get = AsyncMock(return_value=_make_convention()) + return repo + + @pytest.fixture def mock_outbox() -> Outbox: outbox = MagicMock(spec=Outbox) @@ -51,28 +81,39 @@ def sample_draft(node_domain: Domain) -> RecordDraft: ) +def _make_service( + record_repo: RecordRepository, + convention_repo: ConventionRepository, + outbox: Outbox, + node_domain: Domain, +) -> RecordService: + return RecordService( + record_repo=record_repo, + convention_repo=convention_repo, + outbox=outbox, + node_domain=node_domain, + feature_reader=AsyncMock(), + ) + + class TestRecordService: @pytest.mark.asyncio async def test_publish_record_creates_record( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, sample_draft: RecordDraft, ): - """Service should create and persist a Record from a draft.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) assert record is not None assert record.source == sample_draft.source assert record.convention_srn == sample_draft.convention_srn + assert record.schema_srn == _make_schema_srn() assert record.metadata == sample_draft.metadata mock_record_repo.save.assert_called_once() @@ -80,17 +121,12 @@ async def test_publish_record_creates_record( async def test_publish_record_emits_record_published_event( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, sample_draft: RecordDraft, ): - """Service should emit RecordPublished event with source-agnostic fields.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) @@ -100,6 +136,7 @@ async def test_publish_record_emits_record_published_event( assert event.record_srn == record.srn assert event.source == sample_draft.source assert event.convention_srn == sample_draft.convention_srn + assert event.schema_srn == _make_schema_srn() assert event.expected_features == sample_draft.expected_features assert event.metadata == sample_draft.metadata @@ -107,17 +144,12 @@ async def 
test_publish_record_emits_record_published_event( async def test_publish_record_creates_version_1( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, sample_draft: RecordDraft, ): - """New records should be version 1.""" - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(sample_draft) @@ -125,16 +157,14 @@ async def test_publish_record_creates_version_1( class TestRecordServiceIngestSource: - """US2: Verify ingest-sourced records publish correctly.""" - @pytest.mark.asyncio async def test_publish_with_ingest_source( self, mock_record_repo: RecordRepository, + mock_convention_repo: ConventionRepository, mock_outbox: Outbox, node_domain: Domain, ): - """IngestSource draft produces correct Record + RecordPublished event.""" draft = RecordDraft( source=IngestSource( id="run-123-pdb-456", @@ -146,12 +176,7 @@ async def test_publish_with_ingest_source( expected_features=["pocket_detect"], ) - service = RecordService( - record_repo=mock_record_repo, - outbox=mock_outbox, - node_domain=node_domain, - feature_reader=AsyncMock(), - ) + service = _make_service(mock_record_repo, mock_convention_repo, mock_outbox, node_domain) record = await service.publish_record(draft) diff --git a/server/tests/unit/test_field_ref_resolution.py b/server/tests/unit/test_field_ref_resolution.py new file mode 100644 index 0000000..d647c3a --- /dev/null +++ b/server/tests/unit/test_field_ref_resolution.py @@ -0,0 +1,42 @@ +"""US3 tests: typed field-reference parsing and tree validation.""" + +import pytest + +from osa.domain.discovery.model.refs import ( + FeatureFieldRef, + MetadataFieldRef, + parse_field_ref, +) + + +class TestParseFieldRef: + def test_parses_metadata_ref(self): + ref = parse_field_ref("metadata.species") + assert isinstance(ref, MetadataFieldRef) + assert ref.field == "species" + + def test_parses_feature_ref(self): + ref = parse_field_ref("features.cell_classifier.confidence") + assert isinstance(ref, FeatureFieldRef) + assert ref.hook == "cell_classifier" + assert ref.column == "confidence" + + def test_rejects_unknown_prefix(self): + with pytest.raises(ValueError, match="prefix"): + parse_field_ref("other.foo") + + def test_rejects_malformed_metadata(self): + with pytest.raises(ValueError): + parse_field_ref("metadata.a.b") + + def test_rejects_malformed_feature(self): + with pytest.raises(ValueError): + parse_field_ref("features.hook") + + def test_rejects_invalid_identifier(self): + with pytest.raises(ValueError): + parse_field_ref("metadata.Has-Dash") + + def test_dotted_round_trip(self): + assert parse_field_ref("metadata.species").dotted() == "metadata.species" + assert parse_field_ref("features.hook.col").dotted() == "features.hook.col" diff --git a/server/tests/unit/test_filter_expr_and_compile.py b/server/tests/unit/test_filter_expr_and_compile.py new file mode 100644 index 0000000..7e5b301 --- /dev/null +++ b/server/tests/unit/test_filter_expr_and_compile.py @@ -0,0 +1,197 @@ +"""US1 tests: FilterExpr AND-tree validation via DiscoveryService bounds.""" + +from unittest.mock import AsyncMock + +import pytest + +from osa.config import Config +from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + 
Predicate, + SortOrder, +) +from osa.domain.discovery.service.discovery import DiscoveryService +from osa.domain.semantics.model.value import FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaSRN + + +SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") + + +def _config(overrides: dict | None = None) -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + cfg = Config() # type: ignore[call-arg] + if overrides: + for k, v in overrides.items(): + setattr(cfg, k, v) + return cfg + + +def _svc( + *, + field_map: dict[str, FieldType] | None = None, + max_depth: int | None = None, + max_preds: int | None = None, + max_joins: int | None = None, +) -> DiscoveryService: + read_store = AsyncMock() + read_store.search_records.return_value = [] + read_store.get_feature_catalog.return_value = [] + + reader = AsyncMock() + fm = field_map or { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + } + reader.get_all_field_types.return_value = fm + reader.get_fields_for_schema.return_value = fm + + overrides = {} + if max_depth is not None: + overrides["discovery_max_filter_depth"] = max_depth + if max_preds is not None: + overrides["discovery_max_predicates"] = max_preds + if max_joins is not None: + overrides["discovery_max_cross_domain_joins"] = max_joins + + return DiscoveryService(read_store=read_store, field_reader=reader, config=_config(overrides)) + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +class TestAndOnlyTrees: + async def test_accepts_and_of_predicates(self) -> None: + svc = _svc() + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "x"), + _pred("resolution", FilterOperator.GTE, 3.0), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + +class TestBoundsEnforced: + async def test_depth_exceeded(self) -> None: + svc = _svc(max_depth=3) + leaf = _pred("title", FilterOperator.EQ, "x") + tree = leaf + for _ in range(4): + tree = And(operands=[tree, leaf]) + + with pytest.raises(ValidationError, match="depth"): + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + async def test_predicates_exceeded(self) -> None: + svc = _svc(max_preds=2) + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "x"), + _pred("title", FilterOperator.EQ, "y"), + _pred("resolution", FilterOperator.GTE, 3.0), + ] + ) + with pytest.raises(ValidationError, match="predicate leaves"): + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + async def test_joins_exceeded(self) -> None: + svc = _svc(max_joins=1) + # Simulate catalog advertising multiple hooks with a column named score + svc.read_store.get_feature_catalog.return_value = [ # type: ignore[attr-defined] + type( + "E", + (), + { + "hook_name": "hook_a", + "columns": [ + type("C", (), {"name": "score", "type": "number", "required": True}) + ], + }, + ), + type( + "E", + (), + { + "hook_name": "hook_b", + "columns": [ + type("C", (), {"name": "score", "type": "number", 
"required": True}) + ], + }, + ), + ] + tree = And( + operands=[ + Predicate( + field=FeatureFieldRef(hook="hook_a", column="score"), + op=FilterOperator.GT, + value=0.0, + ), + Predicate( + field=FeatureFieldRef(hook="hook_b", column="score"), + op=FilterOperator.GT, + value=0.0, + ), + ] + ) + with pytest.raises(ValidationError, match="distinct feature hooks"): + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + + +class TestUnknownField: + async def test_unknown_metadata_field_rejected(self) -> None: + svc = _svc() + with pytest.raises(ValidationError, match="Unknown metadata field"): + await svc.search_records( + filter_expr=_pred("bogus", FilterOperator.EQ, "x"), + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) diff --git a/server/tests/unit/test_filter_expr_or_not.py b/server/tests/unit/test_filter_expr_or_not.py new file mode 100644 index 0000000..040dbb8 --- /dev/null +++ b/server/tests/unit/test_filter_expr_or_not.py @@ -0,0 +1,131 @@ +"""US2 tests: FilterExpr accepts OR/NOT trees and validation walks them correctly.""" + +from unittest.mock import AsyncMock + +import pytest + +from osa.config import Config +from osa.domain.discovery.model.refs import MetadataFieldRef +from osa.domain.discovery.model.value import ( + And, + FilterOperator, + Not, + Or, + Predicate, + SortOrder, +) +from osa.domain.discovery.service.discovery import DiscoveryService +from osa.domain.semantics.model.value import FieldType +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaSRN + + +SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") + + +def _config() -> Config: + import os + + os.environ.setdefault("OSA_AUTH__JWT__SECRET", "a" * 64) + os.environ.setdefault("OSA_BASE_URL", "http://localhost:8000") + return Config() # type: ignore[call-arg] + + +def _svc() -> DiscoveryService: + read_store = AsyncMock() + read_store.search_records.return_value = [] + reader = AsyncMock() + fm = { + "title": FieldType.TEXT, + "resolution": FieldType.NUMBER, + } + reader.get_all_field_types.return_value = fm + reader.get_fields_for_schema.return_value = fm + return DiscoveryService(read_store=read_store, field_reader=reader, config=_config()) + + +def _pred(field: str, op: FilterOperator, value: object) -> Predicate: + return Predicate(field=MetadataFieldRef(field=field), op=op, value=value) + + +class TestOrNot: + async def test_or_tree_accepted(self): + svc = _svc() + tree = Or( + operands=[ + _pred("title", FilterOperator.EQ, "A"), + _pred("title", FilterOperator.EQ, "B"), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + async def test_not_tree_accepted(self): + svc = _svc() + tree = Not(operand=_pred("title", FilterOperator.EQ, "X")) + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + async def test_nested_mixed_tree(self): + svc = _svc() + tree = And( + operands=[ + _pred("title", FilterOperator.EQ, "X"), + Or( + operands=[ + _pred("resolution", FilterOperator.GTE, 3.0), + _pred("resolution", FilterOperator.LT, 1.0), + ] + ), + Not(operand=_pred("title", 
FilterOperator.EQ, "Bad")), + ] + ) + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + ) + + +class TestCompoundDisabledFlag: + async def test_or_rejected_when_compound_disabled(self): + svc = _svc() + tree = Or( + operands=[ + _pred("title", FilterOperator.EQ, "A"), + _pred("title", FilterOperator.EQ, "B"), + ] + ) + with pytest.raises(ValidationError, match="compound_disabled|Compound"): + await svc.search_records( + filter_expr=tree, + schema_srn=SCHEMA, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + allow_compound=False, + ) diff --git a/server/tests/unit/test_metadata_column_mapper.py b/server/tests/unit/test_metadata_column_mapper.py new file mode 100644 index 0000000..74e6ff6 --- /dev/null +++ b/server/tests/unit/test_metadata_column_mapper.py @@ -0,0 +1,35 @@ +"""Tests for metadata column mapping — reuses the shared column_mapper.""" + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import JSONB + +from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.column_mapper import map_column + + +class TestScalarTypes: + def test_text(self): + col = map_column(ColumnDef(name="title", json_type="string", required=True)) + assert isinstance(col.type, sa.Text) + assert col.nullable is False + + def test_number(self): + col = map_column(ColumnDef(name="resolution", json_type="number", required=True)) + assert isinstance(col.type, sa.Float) + + def test_integer(self): + col = map_column(ColumnDef(name="count", json_type="integer", required=True)) + assert isinstance(col.type, sa.BigInteger) + + def test_boolean(self): + col = map_column(ColumnDef(name="ok", json_type="boolean", required=False)) + assert isinstance(col.type, sa.Boolean) + assert col.nullable is True + + def test_date(self): + col = map_column(ColumnDef(name="d", json_type="string", format="date", required=False)) + assert isinstance(col.type, sa.Date) + + def test_array_jsonb(self): + col = map_column(ColumnDef(name="tags", json_type="array", required=False)) + assert isinstance(col.type, JSONB) diff --git a/server/tests/unit/test_metadata_service.py b/server/tests/unit/test_metadata_service.py new file mode 100644 index 0000000..5ef2401 --- /dev/null +++ b/server/tests/unit/test_metadata_service.py @@ -0,0 +1,39 @@ +"""MetadataService unit tests — thin delegator over MetadataStore.""" + +from unittest.mock import AsyncMock + +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.shared.model.srn import RecordSRN, SchemaSRN + +SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") +RECORD = RecordSRN.parse("urn:osa:localhost:rec:abc@1") + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ) + ] + + +class TestMetadataService: + async def test_ensure_table_delegates(self): + store = AsyncMock() + svc = MetadataService(metadata_store=store) + await svc.ensure_table(schema_srn=SCHEMA, schema_title="bio_sample", fields=_fields()) + store.ensure_table.assert_called_once() + + async def test_insert_delegates(self): + store = AsyncMock() + svc = MetadataService(metadata_store=store) + await svc.insert( + schema_srn=SCHEMA, + record_srn=RECORD, + 
values={"species": "Homo sapiens"}, + ) + store.insert.assert_called_once() diff --git a/server/tests/unit/test_metadata_slug.py b/server/tests/unit/test_metadata_slug.py new file mode 100644 index 0000000..59e3130 --- /dev/null +++ b/server/tests/unit/test_metadata_slug.py @@ -0,0 +1,40 @@ +"""Tests for schema_slug() — pg-safe slug derivation from Schema title.""" + +import pytest + +from osa.infrastructure.persistence.metadata_table import schema_slug + + +class TestSchemaSlug: + def test_accepts_simple_title(self): + assert schema_slug("bio_sample") == "bio_sample" + + def test_lowercases_camel_case(self): + assert schema_slug("BioSample") == "biosample" + + def test_replaces_spaces_with_underscore(self): + assert schema_slug("bio sample") == "bio_sample" + + def test_replaces_punctuation_with_underscore(self): + assert schema_slug("bio-sample.v2") == "bio_sample_v2" + + def test_collapses_repeated_non_alnum(self): + assert schema_slug("bio---sample") == "bio_sample" + + def test_strips_leading_and_trailing_underscores(self): + assert schema_slug("__bio_sample__") == "bio_sample" + + def test_is_stable_across_invocations(self): + assert schema_slug("BioSample v1") == schema_slug("BioSample v1") + + def test_rejects_empty_title(self): + with pytest.raises(ValueError): + schema_slug("") + + def test_rejects_title_with_only_punctuation(self): + with pytest.raises(ValueError): + schema_slug("!!!") + + def test_rejects_title_starting_with_digit(self): + with pytest.raises(ValueError): + schema_slug("1bio_sample") diff --git a/server/tests/unit/test_record_schema_srn_immutable.py b/server/tests/unit/test_record_schema_srn_immutable.py new file mode 100644 index 0000000..be8bbda --- /dev/null +++ b/server/tests/unit/test_record_schema_srn_immutable.py @@ -0,0 +1,28 @@ +"""FR-008: Record.schema_srn is immutable after construction.""" + +from datetime import UTC, datetime + +import pytest +from pydantic import ValidationError + +from osa.domain.record.model.aggregate import Record +from osa.domain.shared.model.source import DepositionSource +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN + + +def _make_record() -> Record: + return Record( + srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"), + source=DepositionSource(id="urn:osa:localhost:dep:d1"), + convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), + schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), + metadata={"title": "T"}, + published_at=datetime.now(UTC), + ) + + +def test_schema_srn_cannot_be_reassigned(): + record = _make_record() + other = SchemaSRN.parse("urn:osa:localhost:schema:other@1.0.0") + with pytest.raises(ValidationError): + record.schema_srn = other # type: ignore[misc] diff --git a/server/uv.lock b/server/uv.lock index 79388c5..0e39f48 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -277,6 +277,7 @@ dependencies = [ { name = "jmespath", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "s3transfer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] +sdist = { url = "https://files.pythonhosted.org/packages/74/ec/636ab2aa7ad9e6bf6e297240ac2d44dba63cc6611e2d5038db318436d449/boto3-1.42.74.tar.gz", hash = 
"sha256:dbacd808cf2a3dadbf35f3dbd8de97b94dc9f78b1ebd439f38f552e0f9753577", size = 112739, upload-time = "2026-03-23T19:34:09.815Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/16/a264b4da2af99f4a12609b93fea941cce5ec41da14b33ed3fef77a910f0c/boto3-1.42.74-py3-none-any.whl", hash = "sha256:4bf89c044d618fe4435af854ab820f09dd43569c0df15d7beb0398f50b9aa970", size = 140557, upload-time = "2026-03-23T19:34:07.084Z" }, ] From a35677007191c24dc861b1a8079548bf9907ab0e Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Wed, 22 Apr 2026 11:42:01 +0100 Subject: [PATCH 2/9] refactor: replace schema SRN with short-form schema ID throughout system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace URN-based schema references with short-form `(id, version)` pairs to simplify internal operations while reserving full URNs for federation edges. Key changes: - Replace `SchemaSRN` with `SchemaId` in domain models and services - Update database schema: `schemas.srn` → `schemas.id` + `schemas.version` - Update `conventions.schema_srn` → `conventions.schema_id` + `conventions.schema_version` - Update `records.schema_srn` → `records.schema_id` + `records.schema_version` - Add `SchemaIdentifier` type for human-readable schema slugs - Update metadata table catalog to use `schema_id` instead of `schema_identity` - Require pinned schema for metadata filtering (remove JSONB fallback) - Add schema ID validation and coercion in API layer - Update discovery queries to work with short-form schema references refactor: replace SchemaSRN with SchemaId across domain models Replace SchemaSRN with simplified SchemaId that only contains id and version without full URN structure. Update all references in models, services, tests and handlers to use the new SchemaId format. 
--- .../076_add_metadata_schema_and_catalog.py | 10 +- .../versions/076_add_records_schema_srn.py | 37 +++-- .../migrations/versions/076_schemas_to_id.py | 90 ++++++++++++ .../application/api/v1/routes/discovery.py | 33 ++++- .../osa/application/api/v1/routes/schemas.py | 14 +- .../deposition/command/create_convention.py | 15 +- .../deposition/command/upload_spreadsheet.py | 4 +- .../deposition/event/convention_registered.py | 14 +- .../osa/domain/deposition/model/convention.py | 4 +- .../domain/deposition/port/schema_reader.py | 6 +- .../deposition/query/download_template.py | 4 +- .../domain/deposition/query/get_convention.py | 6 +- .../deposition/query/list_conventions.py | 6 +- .../domain/deposition/service/convention.py | 16 +- .../discovery/port/field_definition_reader.py | 4 +- .../osa/domain/discovery/port/read_store.py | 6 +- .../domain/discovery/query/search_features.py | 6 +- .../domain/discovery/query/search_records.py | 6 +- .../osa/domain/discovery/service/discovery.py | 70 +++++---- .../metadata/handler/ensure_metadata_table.py | 11 +- .../metadata/handler/insert_batch_metadata.py | 52 +++++++ .../handler/insert_record_metadata.py | 4 +- .../domain/metadata/port/metadata_store.py | 17 ++- .../osa/domain/metadata/service/metadata.py | 11 +- .../domain/record/event/record_published.py | 9 +- server/osa/domain/record/model/aggregate.py | 4 +- server/osa/domain/record/service/record.py | 24 +-- .../domain/semantics/command/create_schema.py | 8 +- server/osa/domain/semantics/model/schema.py | 4 +- .../semantics/port/schema_repository.py | 6 +- .../osa/domain/semantics/query/get_schema.py | 10 +- .../domain/semantics/query/list_schemas.py | 6 +- server/osa/domain/semantics/service/schema.py | 31 ++-- server/osa/domain/shared/model/srn.py | 77 +++++++++- server/osa/infrastructure/event/di.py | 2 + .../persistence/adapter/discovery.py | 139 +++++------------- .../persistence/adapter/readers.py | 21 ++- .../infrastructure/persistence/api_naming.py | 46 ++++++ .../persistence/feature_store.py | 14 +- .../persistence/feature_table.py | 17 ++- .../persistence/mappers/record.py | 19 ++- .../persistence/metadata_store.py | 87 ++++++----- .../persistence/metadata_table.py | 7 +- .../persistence/repository/convention.py | 10 +- .../persistence/repository/schema.py | 24 ++- .../osa/infrastructure/persistence/tables.py | 19 ++- server/tests/integration/conftest.py | 13 +- .../persistence/test_convention_repo.py | 8 +- .../persistence/test_discovery_pagination.py | 7 +- .../persistence/test_metadata_store.py | 129 ++++++++++++---- .../test_discovery_compound_postgres.py | 21 ++- .../test_discovery_cross_join_postgres.py | 19 ++- .../test_discovery_records_typed_and.py | 94 ++++++------ .../integration/test_ensure_metadata_table.py | 23 ++- .../test_event_batch_processing.py | 4 +- .../test_insert_record_metadata.py | 24 ++- .../test_metadata_additive_evolve_postgres.py | 34 +++-- .../test_non_additive_rejected_postgres.py | 20 +-- .../unit/domain/deposition/test_convention.py | 16 +- .../deposition/test_convention_registered.py | 12 +- .../deposition/test_convention_service.py | 20 ++- .../deposition/test_convention_service_v2.py | 21 ++- .../deposition/test_deposition_service.py | 8 +- .../domain/deposition/test_event_chain.py | 8 +- .../domain/deposition/test_spreadsheet.py | 8 +- .../discovery/test_discovery_service.py | 97 +++++++++--- .../domain/discovery/test_search_features.py | 18 +-- .../domain/discovery/test_search_records.py | 2 +- .../feature/test_create_feature_tables.py | 8 +- 
.../feature/test_insert_record_features.py | 10 +- .../unit/domain/index/test_fanout_listener.py | 18 +-- .../domain/record/test_get_record_handler.py | 4 +- .../domain/record/test_record_features.py | 4 +- .../record/test_record_published_enriched.py | 10 +- .../unit/domain/record/test_record_service.py | 12 +- .../unit/domain/semantics/test_schema.py | 18 +-- .../domain/semantics/test_schema_service.py | 80 ++++++++-- server/tests/unit/domain/shared/test_srn.py | 14 +- .../unit/test_filter_expr_and_compile.py | 14 +- server/tests/unit/test_filter_expr_or_not.py | 12 +- server/tests/unit/test_metadata_service.py | 8 +- .../unit/test_record_schema_srn_immutable.py | 12 +- 82 files changed, 1206 insertions(+), 654 deletions(-) create mode 100644 server/migrations/versions/076_schemas_to_id.py create mode 100644 server/osa/domain/metadata/handler/insert_batch_metadata.py create mode 100644 server/osa/infrastructure/persistence/api_naming.py diff --git a/server/migrations/versions/076_add_metadata_schema_and_catalog.py b/server/migrations/versions/076_add_metadata_schema_and_catalog.py index 1ba5601..e425c95 100644 --- a/server/migrations/versions/076_add_metadata_schema_and_catalog.py +++ b/server/migrations/versions/076_add_metadata_schema_and_catalog.py @@ -2,7 +2,7 @@ Create the ``metadata`` PostgreSQL schema and the ``public.metadata_tables`` catalog table. Dynamic per-schema metadata tables will live inside the -``metadata`` schema; the catalog indexes them by schema identity+major. +``metadata`` schema; the catalog indexes them by short schema id + major. Revision ID: 076_metadata_catalog Revises: add_deliver_after @@ -29,7 +29,7 @@ def upgrade() -> None: op.create_table( "metadata_tables", sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), - sa.Column("schema_identity", sa.Text(), nullable=False), + sa.Column("schema_id", sa.Text(), nullable=False), sa.Column("schema_slug", sa.Text(), nullable=False), sa.Column("schema_major", sa.Integer(), nullable=False), sa.Column("schema_versions", JSONB(), nullable=False), @@ -37,11 +37,7 @@ def upgrade() -> None: sa.Column("metadata_schema", JSONB(), nullable=False), sa.Column("created_at", sa.DateTime(timezone=True), nullable=False), sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False), - sa.UniqueConstraint( - "schema_identity", - "schema_major", - name="uq_metadata_tables_identity_major", - ), + sa.UniqueConstraint("schema_id", "schema_major", name="uq_metadata_tables_id_major"), sa.UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), ) diff --git a/server/migrations/versions/076_add_records_schema_srn.py b/server/migrations/versions/076_add_records_schema_srn.py index ea3e225..6989a4b 100644 --- a/server/migrations/versions/076_add_records_schema_srn.py +++ b/server/migrations/versions/076_add_records_schema_srn.py @@ -1,14 +1,14 @@ -"""076_add_records_schema_srn +"""076_add_records_schema_id -Add a ``records.schema_srn`` column so Record linkage to its typed metadata -shape is first-class (FR-008). Backfill from the linked convention's -``schema_srn`` before tightening to NOT NULL. +Add ``records.schema_id`` + ``records.schema_version`` so a Record's typed +linkage is first-class (FR-008). Backfill from the linked convention's +``schema_id`` / ``schema_version`` columns, then tighten to NOT NULL. -Greenfield deployments with no records will skip the backfill (the UPDATE is -a no-op) and go straight to NOT NULL. 
+Greenfield deployments with no records will skip the backfill and go straight +to NOT NULL. Revision ID: 076_records_schema_srn -Revises: 076_metadata_catalog +Revises: 076_schemas_to_id Create Date: 2026-04-19 """ @@ -20,28 +20,35 @@ # revision identifiers, used by Alembic. revision: str = "076_records_schema_srn" -down_revision: Union[str, Sequence[str], None] = "076_metadata_catalog" +down_revision: Union[str, Sequence[str], None] = "076_schemas_to_id" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - op.add_column("records", sa.Column("schema_srn", sa.Text(), nullable=True)) + op.add_column("records", sa.Column("schema_id", sa.Text(), nullable=True)) + op.add_column("records", sa.Column("schema_version", sa.Text(), nullable=True)) + # Backfill from the owning convention's schema_id/schema_version + # (populated by ``076_schemas_to_id`` which ran just before this). op.execute( """ UPDATE records r - SET schema_srn = c.schema_srn + SET + schema_id = c.schema_id, + schema_version = c.schema_version FROM conventions c WHERE c.srn = r.convention_srn - AND r.schema_srn IS NULL + AND r.schema_id IS NULL """ ) - op.alter_column("records", "schema_srn", nullable=False) - op.create_index("idx_records_schema_srn", "records", ["schema_srn"]) + op.alter_column("records", "schema_id", nullable=False) + op.alter_column("records", "schema_version", nullable=False) + op.create_index("idx_records_schema_id", "records", ["schema_id"]) def downgrade() -> None: - op.drop_index("idx_records_schema_srn", table_name="records") - op.drop_column("records", "schema_srn") + op.drop_index("idx_records_schema_id", table_name="records") + op.drop_column("records", "schema_version") + op.drop_column("records", "schema_id") diff --git a/server/migrations/versions/076_schemas_to_id.py b/server/migrations/versions/076_schemas_to_id.py new file mode 100644 index 0000000..50e681d --- /dev/null +++ b/server/migrations/versions/076_schemas_to_id.py @@ -0,0 +1,90 @@ +"""076_schemas_to_id + +Replace URN-keyed ``schemas`` and ``conventions`` columns with short-form +``(id, version)`` pairs. After this migration, internal code works entirely +in ``SchemaId``; full URNs are reserved for federation edges. + +Changes: +- ``schemas.srn`` → ``schemas.id`` + ``schemas.version``. Composite PK. +- ``conventions.schema_srn`` → ``conventions.schema_id`` + ``conventions.schema_version``. + +Revision ID: 076_schemas_to_id +Revises: 076_metadata_catalog +Create Date: 2026-04-20 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "076_schemas_to_id" +down_revision: Union[str, Sequence[str], None] = "076_metadata_catalog" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # schemas: drop old SRN PK, add id + version, recompose PK. 
+ op.add_column("schemas", sa.Column("id", sa.String(), nullable=True)) + op.add_column("schemas", sa.Column("version", sa.String(), nullable=True)) + op.execute( + """ + UPDATE schemas + SET + id = split_part(split_part(srn, ':', 5), '@', 1), + version = split_part(srn, '@', 2) + """ + ) + op.alter_column("schemas", "id", nullable=False) + op.alter_column("schemas", "version", nullable=False) + op.drop_constraint("schemas_pkey", "schemas", type_="primary") + op.drop_column("schemas", "srn") + op.create_primary_key("schemas_pkey", "schemas", ["id", "version"]) + op.create_index("idx_schemas_id", "schemas", ["id"]) + + # conventions: split schema_srn into schema_id + schema_version. + op.add_column("conventions", sa.Column("schema_id", sa.String(), nullable=True)) + op.add_column("conventions", sa.Column("schema_version", sa.String(), nullable=True)) + op.execute( + """ + UPDATE conventions + SET + schema_id = split_part(split_part(schema_srn, ':', 5), '@', 1), + schema_version = split_part(schema_srn, '@', 2) + """ + ) + op.alter_column("conventions", "schema_id", nullable=False) + op.alter_column("conventions", "schema_version", nullable=False) + op.drop_column("conventions", "schema_srn") + + +def downgrade() -> None: + # conventions back to schema_srn + op.add_column("conventions", sa.Column("schema_srn", sa.String(), nullable=True)) + op.execute( + """ + UPDATE conventions + SET schema_srn = 'urn:osa:localhost:schema:' || schema_id || '@' || schema_version + """ + ) + op.alter_column("conventions", "schema_srn", nullable=False) + op.drop_column("conventions", "schema_version") + op.drop_column("conventions", "schema_id") + + # schemas back to srn + op.drop_index("idx_schemas_id", table_name="schemas") + op.drop_constraint("schemas_pkey", "schemas", type_="primary") + op.add_column("schemas", sa.Column("srn", sa.String(), nullable=True)) + op.execute( + """ + UPDATE schemas + SET srn = 'urn:osa:localhost:schema:' || id || '@' || version + """ + ) + op.alter_column("schemas", "srn", nullable=False) + op.create_primary_key("schemas_pkey", "schemas", ["srn"]) + op.drop_column("schemas", "version") + op.drop_column("schemas", "id") diff --git a/server/osa/application/api/v1/routes/discovery.py b/server/osa/application/api/v1/routes/discovery.py index 129ab9e..fb4295a 100644 --- a/server/osa/application/api/v1/routes/discovery.py +++ b/server/osa/application/api/v1/routes/discovery.py @@ -22,7 +22,8 @@ SearchRecordsHandler, SearchRecordsResult, ) -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import ConventionSRN, SchemaId router = APIRouter( prefix="/discovery", @@ -35,7 +36,9 @@ class RecordSearchRequest(BaseModel): - schema_srn: SchemaSRN | None = None + schema: str | None = None + """Short-form schema identity: ``"@"`` (e.g. ``"pdb-structure@1.0.0"``).""" + convention_srn: ConventionSRN | None = None filter: FilterExpr | None = None q: str | None = None @@ -56,7 +59,9 @@ class FeatureCatalogResponse(BaseModel): class FeatureSearchRequest(BaseModel): - schema_srn: SchemaSRN | None = None + schema: str | None = None + """Short-form schema identity, optional. 
See RecordSearchRequest.schema.""" + filter: FilterExpr | None = None record_srn: str | None = None sort: str = "id" @@ -71,6 +76,24 @@ class FeatureSearchResponse(BaseModel): has_more: bool +def _parse_schema(value: str | None) -> SchemaId | None: + if value is None: + return None + if "@" not in value: + raise ValidationError( + f"Schema {value!r} must be fully qualified as '<id>@<version>' " + "(e.g. 'pdb-structure@1.0.0'). Family-level scoping " + "(id alone, resolving to the latest version across a schema family) " + "is planned but not yet supported.", + field="schema", + code="cross_scope_not_yet_supported", + ) + try: + return SchemaId.parse(value) + except ValueError as exc: + raise ValidationError(str(exc), field="schema") from exc + + # ── Routes ── @@ -83,7 +106,7 @@ async def search_records( result: SearchRecordsResult = await handler.run( SearchRecords( filter_expr=body.filter, - schema_srn=body.schema_srn, + schema_id=_parse_schema(body.schema), convention_srn=body.convention_srn, q=body.q, sort=body.sort, @@ -119,7 +142,7 @@ async def search_features( SearchFeatures( hook_name=hook_name, filter_expr=body.filter, - schema_srn=body.schema_srn, + schema_id=_parse_schema(body.schema), record_srn=body.record_srn, sort=body.sort, order=body.order, diff --git a/server/osa/application/api/v1/routes/schemas.py b/server/osa/application/api/v1/routes/schemas.py index 421a943..6d00ddf 100644 --- a/server/osa/application/api/v1/routes/schemas.py +++ b/server/osa/application/api/v1/routes/schemas.py @@ -18,7 +18,8 @@ ListSchemasHandler, SchemaList, ) -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.srn import SchemaId router = APIRouter(prefix="/schemas", tags=["Schemas"], route_class=DishkaRoute) @@ -31,12 +32,17 @@ async def create_schema( return await handler.run(body) -@router.get("/{srn:path}", response_model=SchemaDetail) +@router.get("/{schema:path}", response_model=SchemaDetail) async def get_schema( - srn: str, + schema: str, handler: FromDishka[GetSchemaHandler], ) -> SchemaDetail: - return await handler.run(GetSchema(srn=SchemaSRN.parse(srn))) + """Fetch a schema by its short id+version, e.g. ``"pdb-structure@1.0.0"``.""" + try: + sid = SchemaId.parse(schema) + except ValueError as exc: + raise ValidationError(str(exc), field="schema") from exc + return await handler.run(GetSchema(schema_id=sid)) @router.get("", response_model=SchemaList) diff --git a/server/osa/domain/deposition/command/create_convention.py b/server/osa/domain/deposition/command/create_convention.py index c50059b..499f68c 100644 --- a/server/osa/domain/deposition/command/create_convention.py +++ b/server/osa/domain/deposition/command/create_convention.py @@ -9,12 +9,20 @@ from osa.domain.shared.command import Command, CommandHandler, Result from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId, SchemaIdentifier class CreateConvention(Command): model_config = ConfigDict(populate_by_name=True) + id: SchemaIdentifier + """Schema slug — becomes the ``<id>`` in ``schema_id = <id>@<version>``. + + A convention is a bundle of (schema + validators + file requirements), and + the caller supplies the slug of the embedded schema here. The convention + itself gets an opaque server-generated SRN.
+ """ + title: str version: str schema_fields: list[FieldDefinition] = Field(alias="schema") @@ -28,7 +36,7 @@ class ConventionCreated(Result): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId created_at: datetime @@ -38,6 +46,7 @@ class CreateConventionHandler(CommandHandler[CreateConvention, ConventionCreated async def run(self, cmd: CreateConvention) -> ConventionCreated: convention = await self.convention_service.create_convention( + id=cmd.id, title=cmd.title, version=cmd.version, schema=cmd.schema_fields, @@ -50,6 +59,6 @@ async def run(self, cmd: CreateConvention) -> ConventionCreated: srn=convention.srn, title=convention.title, description=convention.description, - schema_srn=convention.schema_srn, + schema_id=convention.schema_id, created_at=convention.created_at, ) diff --git a/server/osa/domain/deposition/command/upload_spreadsheet.py b/server/osa/domain/deposition/command/upload_spreadsheet.py index c092dd4..ad29269 100644 --- a/server/osa/domain/deposition/command/upload_spreadsheet.py +++ b/server/osa/domain/deposition/command/upload_spreadsheet.py @@ -34,9 +34,9 @@ async def run(self, cmd: UploadSpreadsheet) -> SpreadsheetUploaded: if convention is None: raise NotFoundError(f"Convention not found: {dep.convention_srn}") - schema = await self.schema_reader.get_schema(convention.schema_srn) + schema = await self.schema_reader.get_schema(convention.schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {convention.schema_srn}") + raise NotFoundError(f"Schema not found: {convention.schema_id}") parse_result = self.spreadsheet.parse_upload(schema, cmd.content) diff --git a/server/osa/domain/deposition/event/convention_registered.py b/server/osa/domain/deposition/event/convention_registered.py index 987b28a..62d2112 100644 --- a/server/osa/domain/deposition/event/convention_registered.py +++ b/server/osa/domain/deposition/event/convention_registered.py @@ -3,22 +3,22 @@ from osa.domain.semantics.model.value import FieldDefinition from osa.domain.shared.event import Event, EventId from osa.domain.shared.model.hook import HookDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId class ConventionRegistered(Event): """Emitted when a convention is created via deploy. - Carries hook definitions so downstream handlers (e.g. CreateFeatureTables) - can create feature tables without querying the convention repository. + Carries hook definitions so ``CreateFeatureTables`` can create feature + tables without querying the convention repository. - Carries ``schema_srn`` and ``schema_fields`` so downstream handlers (e.g. - EnsureMetadataTable) can create and evolve typed metadata tables without - traversing the semantics repository. + Carries ``schema_id`` and ``schema_fields`` so ``EnsureMetadataTable`` can + create and evolve typed metadata tables without traversing the semantics + repository. 
""" id: EventId convention_srn: ConventionSRN - schema_srn: SchemaSRN + schema_id: SchemaId schema_fields: list[FieldDefinition] = [] hooks: list[HookDefinition] = [] diff --git a/server/osa/domain/deposition/model/convention.py b/server/osa/domain/deposition/model/convention.py index 63c1e77..53bc3ac 100644 --- a/server/osa/domain/deposition/model/convention.py +++ b/server/osa/domain/deposition/model/convention.py @@ -4,7 +4,7 @@ from osa.domain.shared.model.aggregate import Aggregate from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId class Convention(Aggregate): @@ -13,7 +13,7 @@ class Convention(Aggregate): srn: ConventionSRN title: str description: str | None = None - schema_srn: SchemaSRN + schema_id: SchemaId file_requirements: FileRequirements hooks: list[HookDefinition] = [] ingester: IngesterDefinition | None = None diff --git a/server/osa/domain/deposition/port/schema_reader.py b/server/osa/domain/deposition/port/schema_reader.py index 36f1e00..fb790c3 100644 --- a/server/osa/domain/deposition/port/schema_reader.py +++ b/server/osa/domain/deposition/port/schema_reader.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import TYPE_CHECKING, Protocol -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.port import Port if TYPE_CHECKING: @@ -12,7 +12,7 @@ class SchemaReader(Port, Protocol): """Read-only cross-domain port for reading schemas from the deposition domain.""" @abstractmethod - async def get_schema(self, srn: SchemaSRN) -> "Schema | None": ... + async def get_schema(self, schema_id: SchemaId) -> "Schema | None": ... @abstractmethod - async def schema_exists(self, srn: SchemaSRN) -> bool: ... + async def schema_exists(self, schema_id: SchemaId) -> bool: ... 
diff --git a/server/osa/domain/deposition/query/download_template.py b/server/osa/domain/deposition/query/download_template.py index eab466a..acaa504 100644 --- a/server/osa/domain/deposition/query/download_template.py +++ b/server/osa/domain/deposition/query/download_template.py @@ -33,9 +33,9 @@ async def run(self, cmd: DownloadTemplate) -> TemplateResult: if convention is None: raise NotFoundError(f"Convention not found: {cmd.convention_srn}") - schema = await self.schema_reader.get_schema(convention.schema_srn) + schema = await self.schema_reader.get_schema(convention.schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {convention.schema_srn}") + raise NotFoundError(f"Schema not found: {convention.schema_id}") # Collect ontology terms for fields that reference ontologies ontology_terms_by_srn: dict[str, list[str]] = {} diff --git a/server/osa/domain/deposition/query/get_convention.py b/server/osa/domain/deposition/query/get_convention.py index b39e467..7bf9d46 100644 --- a/server/osa/domain/deposition/query/get_convention.py +++ b/server/osa/domain/deposition/query/get_convention.py @@ -5,7 +5,7 @@ from osa.domain.shared.authorization.gate import public from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -17,7 +17,7 @@ class ConventionDetail(Result): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId file_requirements: FileRequirements hooks: list[HookDefinition] ingester: IngesterDefinition | None = None @@ -34,7 +34,7 @@ async def run(self, cmd: GetConvention) -> ConventionDetail: srn=conv.srn, title=conv.title, description=conv.description, - schema_srn=conv.schema_srn, + schema_id=conv.schema_id, file_requirements=conv.file_requirements, hooks=conv.hooks, ingester=conv.ingester, diff --git a/server/osa/domain/deposition/query/list_conventions.py b/server/osa/domain/deposition/query/list_conventions.py index f38a07e..0838bd0 100644 --- a/server/osa/domain/deposition/query/list_conventions.py +++ b/server/osa/domain/deposition/query/list_conventions.py @@ -4,7 +4,7 @@ from osa.domain.deposition.service.convention import ConventionService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -16,7 +16,7 @@ class ConventionSummary(BaseModel): srn: ConventionSRN title: str description: str | None - schema_srn: SchemaSRN + schema_id: SchemaId created_at: datetime @@ -36,7 +36,7 @@ async def run(self, cmd: ListConventions) -> ConventionList: srn=c.srn, title=c.title, description=c.description, - schema_srn=c.schema_srn, + schema_id=c.schema_id, created_at=c.created_at, ) for c in conventions diff --git a/server/osa/domain/deposition/service/convention.py b/server/osa/domain/deposition/service/convention.py index b099acd..b01f23a 100644 --- a/server/osa/domain/deposition/service/convention.py +++ b/server/osa/domain/deposition/service/convention.py @@ -11,7 +11,13 @@ from osa.domain.shared.event import EventId from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import 
ConventionSRN, Domain, LocalId, Semver +from osa.domain.shared.model.srn import ( + ConventionSRN, + Domain, + LocalId, + SchemaIdentifier, + Semver, +) from osa.domain.shared.outbox import Outbox from osa.domain.shared.service import Service @@ -24,6 +30,7 @@ class ConventionService(Service): async def create_convention( self, + id: SchemaIdentifier, title: str, version: str, schema: list[FieldDefinition], @@ -35,13 +42,14 @@ async def create_convention( """Create a convention with an inline schema. The schema is created as a separate Schema row internally, - and the convention references it via schema_srn. + and the convention references it via schema_id. Feature table creation is handled asynchronously by the CreateFeatureTables handler reacting to ConventionRegistered. """ # Create Schema row from inline field definitions created_schema = await self.schema_service.create_schema( + id=id, title=title, version=version, fields=schema, @@ -56,7 +64,7 @@ async def create_convention( srn=srn, title=title, description=description, - schema_srn=created_schema.srn, + schema_id=created_schema.id, file_requirements=file_requirements, hooks=hooks or [], ingester=ingester, @@ -68,7 +76,7 @@ async def create_convention( ConventionRegistered( id=EventId(uuid4()), convention_srn=srn, - schema_srn=created_schema.srn, + schema_id=created_schema.id, schema_fields=created_schema.fields, hooks=convention.hooks, ) diff --git a/server/osa/domain/discovery/port/field_definition_reader.py b/server/osa/domain/discovery/port/field_definition_reader.py index 5f4974e..516d51e 100644 --- a/server/osa/domain/discovery/port/field_definition_reader.py +++ b/server/osa/domain/discovery/port/field_definition_reader.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from osa.domain.semantics.model.value import FieldType - from osa.domain.shared.model.srn import SchemaSRN + from osa.domain.shared.model.srn import SchemaId class FieldDefinitionReader(Protocol): @@ -17,7 +17,7 @@ async def get_all_field_types(self) -> dict[str, FieldType]: """ ... - async def get_fields_for_schema(self, schema_srn: "SchemaSRN") -> dict[str, FieldType]: + async def get_fields_for_schema(self, schema_id: "SchemaId") -> dict[str, FieldType]: """Return field_name -> FieldType for a specific schema's current major version. Falls back to an empty dict when the schema is unknown to the node. 
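Downstream, the discovery service (later in this patch) resolves sort fields and free-text columns against the pinned schema's field map alone, with no global fallback. Roughly, with illustrative field names:

    from osa.domain.semantics.model.value import FieldType

    # A possible result of get_fields_for_schema(schema_id):
    schema_field_map = {"title": FieldType.TEXT, "doi": FieldType.URL}

    # Free-text search columns are derived from that map alone:
    text_fields = [
        name
        for name, ft in schema_field_map.items()
        if ft in (FieldType.TEXT, FieldType.URL)
    ]
    assert text_fields == ["title", "doi"]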
diff --git a/server/osa/domain/discovery/port/read_store.py b/server/osa/domain/discovery/port/read_store.py index 34faec6..762364e 100644 --- a/server/osa/domain/discovery/port/read_store.py +++ b/server/osa/domain/discovery/port/read_store.py @@ -13,14 +13,14 @@ SortOrder, ) from osa.domain.semantics.model.value import FieldType - from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN + from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class DiscoveryReadStore(Protocol): async def search_records( self, filter_expr: "FilterExpr | None", - schema_srn: "SchemaSRN | None", + schema_id: "SchemaId | None", convention_srn: "ConventionSRN | None", text_fields: list[str], q: str | None, @@ -41,7 +41,7 @@ async def search_features( self, hook_name: str, filter_expr: "FilterExpr | None", - schema_srn: "SchemaSRN | None", + schema_id: "SchemaId | None", record_srn: "RecordSRN | None", sort: str, order: "SortOrder", diff --git a/server/osa/domain/discovery/query/search_features.py b/server/osa/domain/discovery/query/search_features.py index f5d8a61..42dde9a 100644 --- a/server/osa/domain/discovery/query/search_features.py +++ b/server/osa/domain/discovery/query/search_features.py @@ -8,14 +8,14 @@ from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class SearchFeatures(Query): hook_name: str filter_expr: FilterExpr | None = None - schema_srn: SchemaSRN | None = None + schema_id: SchemaId | None = None record_srn: str | None = None sort: str = "id" order: SortOrder = SortOrder.DESC @@ -43,7 +43,7 @@ async def run(self, cmd: SearchFeatures) -> SearchFeaturesResult: result: FeatureSearchResult = await self.discovery_service.search_features( hook_name=cmd.hook_name, filter_expr=cmd.filter_expr, - schema_srn=cmd.schema_srn, + schema_id=cmd.schema_id, record_srn=record_srn, sort=cmd.sort, order=cmd.order, diff --git a/server/osa/domain/discovery/query/search_records.py b/server/osa/domain/discovery/query/search_records.py index da27009..515d980 100644 --- a/server/osa/domain/discovery/query/search_records.py +++ b/server/osa/domain/discovery/query/search_records.py @@ -9,13 +9,13 @@ ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class SearchRecords(Query): filter_expr: FilterExpr | None = None - schema_srn: SchemaSRN | None = None + schema_id: SchemaId | None = None convention_srn: ConventionSRN | None = None q: str | None = None sort: str = "published_at" @@ -37,7 +37,7 @@ class SearchRecordsHandler(QueryHandler[SearchRecords, SearchRecordsResult]): async def run(self, cmd: SearchRecords) -> SearchRecordsResult: result: RecordSearchResult = await self.discovery_service.search_records( filter_expr=cmd.filter_expr, - schema_srn=cmd.schema_srn, + schema_id=cmd.schema_id, convention_srn=cmd.convention_srn, q=cmd.q, sort=cmd.sort, diff --git a/server/osa/domain/discovery/service/discovery.py b/server/osa/domain/discovery/service/discovery.py index 1e8f850..3f54ea5 100644 --- 
a/server/osa/domain/discovery/service/discovery.py +++ b/server/osa/domain/discovery/service/discovery.py @@ -34,7 +34,7 @@ from osa.domain.discovery.port.read_store import DiscoveryReadStore from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId from osa.domain.shared.service import Service logger = logging.getLogger(__name__) @@ -50,7 +50,7 @@ class DiscoveryService(Service): async def search_records( self, filter_expr: FilterExpr | None, - schema_srn: SchemaSRN | None, + schema_id: SchemaId | None, convention_srn: ConventionSRN | None, q: str | None, sort: str, @@ -69,22 +69,37 @@ async def search_records( if limit < 1 or limit > 100: raise ValidationError("limit must be between 1 and 100", field="limit") - schema_field_map: dict[str, FieldType] = {} - if schema_srn is not None: - schema_field_map = await self.field_reader.get_fields_for_schema(schema_srn) + if sort != "published_at" and schema_id is None: + raise ValidationError( + f"Sorting by '{sort}' requires the request to pin a 'schema' " + "('<id>@<version>'). Plain listings must sort by 'published_at'.", + field="sort", + code="schema_required_for_metadata_sort", + ) + + if q and schema_id is None: + raise ValidationError( + "Free-text search ('q') requires the request to pin a 'schema' " + "('<id>@<version>'). Without a schema, the server cannot resolve " + "which metadata fields are text-indexed.", + field="q", + code="schema_required_for_free_text_search", + ) - global_field_map = await self.field_reader.get_all_field_types() - effective_field_map = schema_field_map or global_field_map + schema_field_map: dict[str, FieldType] = {} + if schema_id is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) if filter_expr is not None: self._validate_tree(filter_expr, allow_compound=allow_compound) - await self._validate_refs(filter_expr, schema_srn, effective_field_map) + await self._validate_refs(filter_expr, schema_id, schema_field_map) - # Sort field validation - if sort != "published_at" and sort not in effective_field_map: + # Sort field validation (against pinned schema) + if sort != "published_at" and sort not in schema_field_map: raise ValidationError( - f"Unknown sort field '{sort}': not defined in registered schema", + f"Unknown sort field '{sort}': not defined in the pinned schema.", field="sort", + code="unknown_sort_field", ) decoded_cursor: dict[str, Any] | None = None @@ -95,19 +110,18 @@ async def search_records( raise ValidationError(str(exc), field="cursor") from exc text_fields = [ - name - for name, ft in effective_field_map.items() - if ft in (FieldType.TEXT, FieldType.URL) + name for name, ft in schema_field_map.items() if ft in (FieldType.TEXT, FieldType.URL) ] if q and not text_fields: raise ValidationError( - "Free-text search is unavailable: no text or URL fields are registered", + "Free-text search is unavailable: the pinned schema defines no text or URL fields.", field="q", + code="no_text_fields_in_schema", ) results = await self.read_store.search_records( filter_expr=filter_expr, - schema_srn=schema_srn, + schema_id=schema_id, convention_srn=convention_srn, text_fields=text_fields, q=q, @@ -115,7 +129,7 @@ async def search_records( order=order, cursor=decoded_cursor, limit=limit + 1, - field_types=effective_field_map, + field_types=schema_field_map, ) has_more = len(results) >
limit @@ -143,7 +157,7 @@ async def search_features( self, hook_name: str, filter_expr: FilterExpr | None, - schema_srn: SchemaSRN | None, + schema_id: SchemaId | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, @@ -163,8 +177,8 @@ async def search_features( col_map["record_srn"] = "string" schema_field_map: dict[str, FieldType] = {} - if schema_srn is not None: - schema_field_map = await self.field_reader.get_fields_for_schema(schema_srn) + if schema_id is not None: + schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) if filter_expr is not None: self._validate_tree(filter_expr, allow_compound=allow_compound) @@ -189,7 +203,7 @@ async def search_features( rows = await self.read_store.search_features( hook_name=hook_name, filter_expr=filter_expr, - schema_srn=schema_srn, + schema_id=schema_id, record_srn=record_srn, sort=sort, order=order, @@ -259,24 +273,26 @@ def _validate_tree(self, expr: FilterExpr, *, allow_compound: bool) -> None: async def _validate_refs( self, expr: FilterExpr, - schema_srn: SchemaSRN | None, + schema_id: SchemaId | None, field_map: dict[str, FieldType], ) -> None: """Resolve each predicate's field and check operator compatibility.""" feature_catalog: dict[str, dict[str, str]] | None = None for p in _iter_predicates(expr): if isinstance(p.field, MetadataFieldRef): - if schema_srn is None and not field_map: + if schema_id is None: raise ValidationError( - f"Unknown metadata field '{p.field.field}': " - "no schema_srn provided and no registered schemas.", + f"Metadata predicate on {p.field.dotted()!r} requires " + "the request to pin a 'schema' ('<id>@<version>'). " + "Unscoped metadata filtering is not supported — the typed " + "metadata table is the only filter path.", field=p.field.dotted(), - code="unknown_field", + code="schema_required_for_metadata_query", ) field_name = p.field.field if field_name not in field_map: raise ValidationError( - f"Unknown metadata field '{field_name}' for the provided schema.", + f"Unknown metadata field '{field_name}' for the pinned schema.", field=p.field.dotted(), code="unknown_field", ) diff --git a/server/osa/domain/metadata/handler/ensure_metadata_table.py b/server/osa/domain/metadata/handler/ensure_metadata_table.py index b8214ba..1a35eb1 100644 --- a/server/osa/domain/metadata/handler/ensure_metadata_table.py +++ b/server/osa/domain/metadata/handler/ensure_metadata_table.py @@ -7,7 +7,6 @@ from osa.domain.deposition.event.convention_registered import ConventionRegistered from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.semantics.port.schema_repository import SchemaRepository from osa.domain.shared.error import DomainError, NotFoundError from osa.domain.shared.event import EventHandler @@ -23,7 +22,6 @@ class EnsureMetadataTable(EventHandler[ConventionRegistered]): """ metadata_service: MetadataService - schema_repo: SchemaRepository convention_repo: ConventionRepository async def handle(self, event: ConventionRegistered) -> None: @@ -31,20 +29,15 @@ async def handle(self, event: ConventionRegistered) -> None: if convention is None: raise NotFoundError(f"Convention not found: {event.convention_srn}") - schema = await self.schema_repo.get(event.schema_srn) - if schema is None: - raise NotFoundError(f"Schema not found: {event.schema_srn}") - try: await self.metadata_service.ensure_table( - schema_srn=event.schema_srn, - schema_title=schema.title, + schema_id=event.schema_id,
fields=event.schema_fields, ) except DomainError: logger.exception( "EnsureMetadataTable failed: convention=%s schema=%s", event.convention_srn, - event.schema_srn, + event.schema_id, ) raise diff --git a/server/osa/domain/metadata/handler/insert_batch_metadata.py b/server/osa/domain/metadata/handler/insert_batch_metadata.py new file mode 100644 index 0000000..e0f1450 --- /dev/null +++ b/server/osa/domain/metadata/handler/insert_batch_metadata.py @@ -0,0 +1,52 @@ +"""InsertBatchMetadata — bulk metadata projection for ingest batches. + +Mirrors :class:`InsertBatchFeatures` — listens to ``IngestBatchPublished`` +rather than per-record ``RecordPublished``, because the bulk ingest pipeline +emits one batch-level event instead of N per-record ones (AD-3). +""" + +from __future__ import annotations + +from osa.domain.ingest.event.events import IngestBatchPublished +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.record.port.repository import RecordRepository +from osa.domain.shared.event import EventHandler +from osa.domain.shared.model.srn import RecordSRN +from osa.infrastructure.logging import get_logger + +log = get_logger(__name__) + + +class InsertBatchMetadata(EventHandler[IngestBatchPublished]): + """Project each newly-published record's metadata into its typed table.""" + + metadata_service: MetadataService + record_repo: RecordRepository + + async def handle(self, event: IngestBatchPublished) -> None: + if not event.published_srns: + return + + inserted = 0 + for srn_str in event.published_srns: + srn = RecordSRN.parse(srn_str) + record = await self.record_repo.get(srn) + if record is None: + # Record was published in this batch but we can't find it — + # would indicate the same UOW is reading stale state. Skip. 
+ continue + await self.metadata_service.insert( + schema_id=record.schema_id, + record_srn=record.srn, + values=record.metadata, + ) + inserted += 1 + + short_id = event.ingest_run_id[:8] + log.info( + "[{short_id}] batch {batch_index}: inserted {inserted} metadata rows", + short_id=short_id, + batch_index=event.batch_index, + inserted=inserted, + ingest_run_id=event.ingest_run_id, + ) diff --git a/server/osa/domain/metadata/handler/insert_record_metadata.py b/server/osa/domain/metadata/handler/insert_record_metadata.py index 4ab1558..a7d09d3 100644 --- a/server/osa/domain/metadata/handler/insert_record_metadata.py +++ b/server/osa/domain/metadata/handler/insert_record_metadata.py @@ -18,12 +18,12 @@ class InsertRecordMetadata(EventHandler[RecordPublished]): async def handle(self, event: RecordPublished) -> None: await self.metadata_service.insert( - schema_srn=event.schema_srn, + schema_id=event.schema_id, record_srn=event.record_srn, values=event.metadata, ) logger.debug( "Inserted metadata row: record=%s schema=%s", event.record_srn, - event.schema_srn, + event.schema_id, ) diff --git a/server/osa/domain/metadata/port/metadata_store.py b/server/osa/domain/metadata/port/metadata_store.py index 64899a2..3661e76 100644 --- a/server/osa/domain/metadata/port/metadata_store.py +++ b/server/osa/domain/metadata/port/metadata_store.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from osa.domain.semantics.model.value import FieldDefinition - from osa.domain.shared.model.srn import RecordSRN, SchemaSRN + from osa.domain.shared.model.srn import RecordSRN, SchemaId class MetadataStore(Protocol): @@ -14,25 +14,28 @@ class MetadataStore(Protocol): Implementations are responsible for: - Creating the ``metadata.<slug>_v<major>`` table on first - registration for a (schema_identity, major) pair. + registration for a ``(schema_id, major)`` pair. - Additively ALTER ADD COLUMN when the schema bumps (minor/patch) with new optional fields. - - Appending SRN lineage into the catalog's ``schema_versions`` list. + - Appending version lineage into the catalog's ``schema_versions`` list. - Idempotent UPSERT of a row keyed on ``record_srn``. """ async def ensure_table( self, - schema_srn: "SchemaSRN", - schema_title: str, + schema_id: "SchemaId", fields: "list[FieldDefinition]", ) -> None: - """Create or additively evolve the typed metadata table for a schema.""" + """Create or additively evolve the typed metadata table for a schema. + + The PG table slug is derived from ``schema_id.id.root`` — the schema's + human-readable slug is the single source of truth for the storage name. + """ ...
async def insert( self, - schema_srn: "SchemaSRN", + schema_id: "SchemaId", record_srn: "RecordSRN", values: dict[str, Any], ) -> None: diff --git a/server/osa/domain/metadata/service/metadata.py b/server/osa/domain/metadata/service/metadata.py index a639c1b..b2b7e07 100644 --- a/server/osa/domain/metadata/service/metadata.py +++ b/server/osa/domain/metadata/service/metadata.py @@ -6,7 +6,7 @@ from osa.domain.metadata.port.metadata_store import MetadataStore from osa.domain.semantics.model.value import FieldDefinition -from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaId from osa.domain.shared.service import Service @@ -17,16 +17,15 @@ class MetadataService(Service): async def ensure_table( self, - schema_srn: SchemaSRN, - schema_title: str, + schema_id: SchemaId, fields: list[FieldDefinition], ) -> None: - await self.metadata_store.ensure_table(schema_srn, schema_title, fields) + await self.metadata_store.ensure_table(schema_id, fields) async def insert( self, - schema_srn: SchemaSRN, + schema_id: SchemaId, record_srn: RecordSRN, values: dict[str, Any], ) -> None: - await self.metadata_store.insert(schema_srn, record_srn, values) + await self.metadata_store.insert(schema_id, record_srn, values) diff --git a/server/osa/domain/record/event/record_published.py b/server/osa/domain/record/event/record_published.py index b2a8f34..6cb1637 100644 --- a/server/osa/domain/record/event/record_published.py +++ b/server/osa/domain/record/event/record_published.py @@ -4,21 +4,20 @@ from osa.domain.shared.event import Event, EventId from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class RecordPublished(Event): """Emitted when a record is published and ready for indexing. - Enriched with source, convention_srn, schema_srn, and expected_features so - downstream consumers (metadata insertion, feature insertion, indexing) can - operate without querying record/convention repositories. + Carries ``schema_id`` so downstream consumers (metadata insertion, + indexing) operate in terms of short-form identity rather than full URNs. 
""" id: EventId record_srn: RecordSRN source: RecordSource convention_srn: ConventionSRN - schema_srn: SchemaSRN + schema_id: SchemaId metadata: dict[str, Any] expected_features: list[str] = [] diff --git a/server/osa/domain/record/model/aggregate.py b/server/osa/domain/record/model/aggregate.py index 0e797d5..8b2f491 100644 --- a/server/osa/domain/record/model/aggregate.py +++ b/server/osa/domain/record/model/aggregate.py @@ -7,7 +7,7 @@ from osa.domain.shared.model.aggregate import Aggregate from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId class Record(Aggregate): @@ -16,6 +16,6 @@ class Record(Aggregate): srn: RecordSRN source: RecordSource convention_srn: ConventionSRN - schema_srn: SchemaSRN = Field(frozen=True) + schema_id: SchemaId = Field(frozen=True) metadata: dict[str, Any] published_at: datetime diff --git a/server/osa/domain/record/service/record.py b/server/osa/domain/record/service/record.py index 7dbbf6c..3e130b1 100644 --- a/server/osa/domain/record/service/record.py +++ b/server/osa/domain/record/service/record.py @@ -20,7 +20,7 @@ LocalId, RecordSRN, RecordVersion, - SchemaSRN, + SchemaId, ) from osa.domain.shared.outbox import Outbox from osa.domain.shared.service import Service @@ -53,12 +53,12 @@ async def get(self, srn: RecordSRN) -> Record: raise NotFoundError(f"Record not found: {srn}") return record - async def _resolve_schema_srn(self, convention_srn: ConventionSRN) -> SchemaSRN: - """Resolve a convention to its schema SRN at publication time.""" + async def _resolve_schema_id(self, convention_srn: ConventionSRN) -> SchemaId: + """Resolve a convention to its schema id at publication time.""" convention = await self.convention_repo.get(convention_srn) if convention is None: raise NotFoundError(f"Convention not found: {convention_srn}") - return convention.schema_srn + return convention.schema_id async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: """Bulk-publish records from an ingest batch. @@ -71,14 +71,14 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: return [] # All drafts in a batch target the same convention (caller contract); - # resolve schema_srn once. - schema_srn_by_conv: dict[str, SchemaSRN] = {} + # resolve schema_id once. 
+ schema_id_by_conv: dict[str, SchemaId] = {} records: list[Record] = [] for draft in drafts: key = str(draft.convention_srn) - if key not in schema_srn_by_conv: - schema_srn_by_conv[key] = await self._resolve_schema_srn(draft.convention_srn) + if key not in schema_id_by_conv: + schema_id_by_conv[key] = await self._resolve_schema_id(draft.convention_srn) record_srn = RecordSRN( domain=self.node_domain, id=LocalId(str(uuid4())), @@ -89,7 +89,7 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, - schema_srn=schema_srn_by_conv[key], + schema_id=schema_id_by_conv[key], metadata=draft.metadata, published_at=datetime.now(UTC), ) @@ -102,7 +102,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: """Create and persist a Record from a draft.""" logger.info(f"Creating record from {draft.source.type} source: {draft.source.id}") - schema_srn = await self._resolve_schema_srn(draft.convention_srn) + schema_id = await self._resolve_schema_id(draft.convention_srn) record_srn = RecordSRN( domain=self.node_domain, @@ -114,7 +114,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, - schema_srn=schema_srn, + schema_id=schema_id, metadata=draft.metadata, published_at=datetime.now(UTC), ) @@ -127,7 +127,7 @@ async def publish_record(self, draft: RecordDraft) -> Record: record_srn=record_srn, source=draft.source, convention_srn=draft.convention_srn, - schema_srn=schema_srn, + schema_id=schema_id, metadata=draft.metadata, expected_features=draft.expected_features, ) diff --git a/server/osa/domain/semantics/command/create_schema.py b/server/osa/domain/semantics/command/create_schema.py index 27c4202..ff6fcf2 100644 --- a/server/osa/domain/semantics/command/create_schema.py +++ b/server/osa/domain/semantics/command/create_schema.py @@ -6,17 +6,18 @@ from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import at_least from osa.domain.shared.command import Command, CommandHandler, Result -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId, SchemaIdentifier class CreateSchema(Command): + id: SchemaIdentifier title: str version: str fields: list[FieldDefinition] class SchemaCreated(Result): - srn: SchemaSRN + id: SchemaId title: str field_count: int created_at: datetime @@ -29,12 +30,13 @@ class CreateSchemaHandler(CommandHandler[CreateSchema, SchemaCreated]): async def run(self, cmd: CreateSchema) -> SchemaCreated: schema = await self.schema_service.create_schema( + id=cmd.id, title=cmd.title, version=cmd.version, fields=cmd.fields, ) return SchemaCreated( - srn=schema.srn, + id=schema.id, title=schema.title, field_count=len(schema.fields), created_at=schema.created_at, diff --git a/server/osa/domain/semantics/model/schema.py b/server/osa/domain/semantics/model/schema.py index 1188dbf..20af362 100644 --- a/server/osa/domain/semantics/model/schema.py +++ b/server/osa/domain/semantics/model/schema.py @@ -3,13 +3,13 @@ from osa.domain.semantics.model.value import FieldDefinition from osa.domain.shared.error import ValidationError from osa.domain.shared.model.aggregate import Aggregate -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId class Schema(Aggregate): """An immutable, versioned definition of metadata structure.""" - srn: SchemaSRN + id: SchemaId title: str fields: 
list[FieldDefinition] created_at: datetime diff --git a/server/osa/domain/semantics/port/schema_repository.py b/server/osa/domain/semantics/port/schema_repository.py index b6849a8..1f49067 100644 --- a/server/osa/domain/semantics/port/schema_repository.py +++ b/server/osa/domain/semantics/port/schema_repository.py @@ -1,7 +1,7 @@ from abc import abstractmethod from typing import TYPE_CHECKING, List, Protocol -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.port import Port if TYPE_CHECKING: @@ -13,7 +13,7 @@ class SchemaRepository(Port, Protocol): async def save(self, schema: "Schema") -> None: ... @abstractmethod - async def get(self, srn: SchemaSRN) -> "Schema | None": ... + async def get(self, schema_id: SchemaId) -> "Schema | None": ... @abstractmethod async def list( @@ -21,4 +21,4 @@ async def list( ) -> "List[Schema]": ... @abstractmethod - async def exists(self, srn: SchemaSRN) -> bool: ... + async def exists(self, schema_id: SchemaId) -> bool: ... diff --git a/server/osa/domain/semantics/query/get_schema.py b/server/osa/domain/semantics/query/get_schema.py index 3c7afd8..c52fb07 100644 --- a/server/osa/domain/semantics/query/get_schema.py +++ b/server/osa/domain/semantics/query/get_schema.py @@ -3,16 +3,16 @@ from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.query import Query, QueryHandler, Result class GetSchema(Query): - srn: SchemaSRN + schema_id: SchemaId class SchemaDetail(Result): - srn: SchemaSRN + id: SchemaId title: str fields: list[FieldDefinition] created_at: datetime @@ -23,9 +23,9 @@ class GetSchemaHandler(QueryHandler[GetSchema, SchemaDetail]): schema_service: SchemaService async def run(self, cmd: GetSchema) -> SchemaDetail: - schema = await self.schema_service.get_schema(cmd.srn) + schema = await self.schema_service.get_schema(cmd.schema_id) return SchemaDetail( - srn=schema.srn, + id=schema.id, title=schema.title, fields=schema.fields, created_at=schema.created_at, diff --git a/server/osa/domain/semantics/query/list_schemas.py b/server/osa/domain/semantics/query/list_schemas.py index b13ba2e..1fe8410 100644 --- a/server/osa/domain/semantics/query/list_schemas.py +++ b/server/osa/domain/semantics/query/list_schemas.py @@ -4,7 +4,7 @@ from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.authorization.gate import public -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import SchemaId from osa.domain.shared.query import Query, QueryHandler, Result @@ -13,7 +13,7 @@ class ListSchemas(Query): class SchemaSummary(BaseModel): - srn: SchemaSRN + id: SchemaId title: str field_count: int created_at: datetime @@ -32,7 +32,7 @@ async def run(self, cmd: ListSchemas) -> SchemaList: return SchemaList( items=[ SchemaSummary( - srn=s.srn, + id=s.id, title=s.title, field_count=len(s.fields), created_at=s.created_at, diff --git a/server/osa/domain/semantics/service/schema.py b/server/osa/domain/semantics/service/schema.py index 3127488..01cb341 100644 --- a/server/osa/domain/semantics/service/schema.py +++ b/server/osa/domain/semantics/service/schema.py @@ -1,12 +1,17 @@ from datetime import UTC, datetime -from uuid import uuid4 from osa.domain.semantics.model.schema import Schema from 
osa.domain.semantics.model.value import FieldDefinition, FieldType, TermConstraints from osa.domain.semantics.port.ontology_repository import OntologyRepository from osa.domain.semantics.port.schema_repository import SchemaRepository -from osa.domain.shared.error import NotFoundError, ValidationError -from osa.domain.shared.model.srn import Domain, LocalId, SchemaSRN, Semver +from osa.domain.shared.error import ConflictError, NotFoundError, ValidationError +from osa.domain.shared.model.srn import ( + Domain, + LocalId, + SchemaId, + SchemaIdentifier, + Semver, +) from osa.domain.shared.service import Service @@ -17,6 +22,7 @@ class SchemaService(Service): async def create_schema( self, + id: SchemaIdentifier, title: str, version: str, fields: list[FieldDefinition], @@ -35,13 +41,18 @@ async def create_schema( f"(referenced by field '{field.name}')" ) - srn = SchemaSRN( - domain=self.node_domain, - id=LocalId(str(uuid4())[:20]), + schema_id = SchemaId( + id=LocalId(id.root), version=Semver.from_string(version), ) + existing = await self.schema_repo.get(schema_id) + if existing is not None: + raise ConflictError( + f"Schema already exists: {schema_id.render()}", + code="schema_already_exists", + ) schema = Schema( - srn=srn, + id=schema_id, title=title, fields=fields, created_at=datetime.now(UTC), @@ -49,10 +60,10 @@ async def create_schema( await self.schema_repo.save(schema) return schema - async def get_schema(self, srn: SchemaSRN) -> Schema: - schema = await self.schema_repo.get(srn) + async def get_schema(self, schema_id: SchemaId) -> Schema: + schema = await self.schema_repo.get(schema_id) if schema is None: - raise NotFoundError(f"Schema not found: {srn}") + raise NotFoundError(f"Schema not found: {schema_id}") return schema async def list_schemas( diff --git a/server/osa/domain/shared/model/srn.py b/server/osa/domain/shared/model/srn.py index 5da69aa..13f97db 100644 --- a/server/osa/domain/shared/model/srn.py +++ b/server/osa/domain/shared/model/srn.py @@ -51,7 +51,30 @@ class LocalId(RootModel[str]): def _validate(cls, v: str) -> str: v = v.strip().lower() if not cls._re.match(v): - raise ValueError("invalid LocalId (20–64 chars, [a-z0-9-])") + raise ValueError("invalid LocalId (3–64 chars, [a-z0-9-])") + return v + + +class SchemaIdentifier(RootModel[str]): + """Human-readable schema slug. Narrower than :class:`LocalId`: + + - must start with a letter (so it can drive a PG table name without + quoting the leading character) + - 3–64 chars total, ``[a-z0-9-]`` + + Validated strictly (no case-folding / whitespace-stripping) — a typo like + ``"PDB-Structure"`` should surface loudly rather than silently normalise. + """ + + _re: ClassVar[re.Pattern] = re.compile(r"^[a-z][a-z0-9\-]{2,63}$") + + @field_validator("root") + @classmethod + def _validate(cls, v: str) -> str: + if not cls._re.match(v): + raise ValueError( + "invalid schema id: must be 3–64 chars of [a-z0-9-] and start with a letter" + ) return v @@ -280,3 +303,55 @@ class SnapshotSRN(SRN): class EventSRN(SRN): type: ResourceType = Field(default=ResourceType.evt, frozen=True) version: None = None + + +# ---------- Schema identity (short form — internal primitive) ---------- + + +class SchemaId(ValueObject): + """Short-form schema identity. The internal primitive for all non- + federation code paths. 
+ + A schema is unambiguously identified by ``(id, version)`` within a single + OSA node — the publishing domain and resource-type segments of the full + :class:`SchemaSRN` URN carry no information at the internal layer (the + domain is always the node's own; the type is always ``schema``). + + Use :class:`SchemaSRN` only at federation edges (exports, snapshot + manifests, inter-node references) where the publishing node's domain + becomes meaningful. + + Wire form: ``"<id>@<version>"`` (e.g., ``"pdb-structure@1.0.0"``). + """ + + id: LocalId + version: Semver + + @property + def major(self) -> int: + """Major version component — the shared typed-table key.""" + return int(self.version.root.split(".")[0]) + + def render(self) -> str: + return f"{self.id.root}@{self.version.root}" + + def __str__(self) -> str: + return self.render() + + @classmethod + def parse(cls, value: str) -> "SchemaId": + """Parse wire form ``"<id>@<version>"``. + + Raises ``ValueError`` on malformed input. + """ + if not isinstance(value, str) or "@" not in value: + raise ValueError(f"SchemaId must be '<id>@<version>', got {value!r}") + id_part, version_part = value.split("@", 1) + return cls(id=LocalId(id_part), version=Semver.from_string(version_part)) + + @classmethod + def from_srn(cls, srn: "SchemaSRN") -> "SchemaId": + return cls(id=srn.id, version=srn.version) + + def to_srn(self, domain: Domain) -> "SchemaSRN": + return SchemaSRN(domain=domain, id=self.id, version=self.version) diff --git a/server/osa/infrastructure/event/di.py b/server/osa/infrastructure/event/di.py index be7142e..682da77 100644 --- a/server/osa/infrastructure/event/di.py +++ b/server/osa/infrastructure/event/di.py @@ -15,6 +15,7 @@ ) from osa.domain.ingest.handler import PublishBatch, RunHooks, RunIngester from osa.domain.metadata.handler.ensure_metadata_table import EnsureMetadataTable +from osa.domain.metadata.handler.insert_batch_metadata import InsertBatchMetadata from osa.domain.metadata.handler.insert_record_metadata import InsertRecordMetadata from osa.domain.record.handler import ConvertDepositionToRecord from osa.domain.shared.event import EventHandler @@ -42,6 +43,7 @@ # Metadata handlers (feature 076) EnsureMetadataTable, InsertRecordMetadata, + InsertBatchMetadata, # Ingest handlers RunIngester, RunHooks, diff --git a/server/osa/infrastructure/persistence/adapter/discovery.py b/server/osa/infrastructure/persistence/adapter/discovery.py index 814504f..0fd452d 100644 --- a/server/osa/infrastructure/persistence/adapter/discovery.py +++ b/server/osa/infrastructure/persistence/adapter/discovery.py @@ -21,7 +21,6 @@ true, union_all, ) -from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.ext.asyncio import AsyncSession from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef @@ -41,7 +40,7 @@ from osa.domain.semantics.model.value import FieldType from osa.domain.shared.error import ValidationError from osa.domain.shared.model.hook import ColumnDef -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId from osa.infrastructure.persistence.feature_table import ( FeatureSchema, build_feature_table, @@ -123,7 +122,7 @@ def __init__(self, session: AsyncSession) -> None: self.session = session async def get_all_field_types(self) -> dict[str, FieldType]: - stmt = select(schemas_table.c.srn, schemas_table.c.fields) + stmt = select(schemas_table.c.fields) result = await self.session.execute(stmt) rows = result.mappings().all() @@ -142,9 +141,11 @@ async
def get_all_field_types(self) -> dict[str, FieldType]: return field_map - async def get_fields_for_schema(self, schema_srn: SchemaSRN) -> dict[str, FieldType]: - rendered = str(schema_srn) - stmt = select(schemas_table.c.fields).where(schemas_table.c.srn == rendered) + async def get_fields_for_schema(self, schema_id: SchemaId) -> dict[str, FieldType]: + stmt = select(schemas_table.c.fields).where( + schemas_table.c.id == schema_id.id.root, + schemas_table.c.version == schema_id.version.root, + ) result = await self.session.execute(stmt) row = result.mappings().first() if row is None: @@ -161,7 +162,7 @@ def __init__(self, session: AsyncSession) -> None: async def search_records( self, filter_expr: FilterExpr | None, - schema_srn: SchemaSRN | None, + schema_id: SchemaId | None, convention_srn: ConventionSRN | None, text_fields: list[str], q: str | None, @@ -176,8 +177,8 @@ async def search_records( metadata_table = None metadata_schema: MetadataSchema | None = None - if schema_srn is not None: - catalog = await self._metadata_catalog_for(schema_srn) + if schema_id is not None: + catalog = await self._metadata_catalog_for(schema_id) if catalog is not None: metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) metadata_table = build_metadata_table(catalog["pg_table"], metadata_schema) @@ -197,7 +198,6 @@ async def search_records( metadata_t=metadata_table, metadata_schema=metadata_schema, feature_joins=feature_joins, - field_types=ft_map, ) ) @@ -300,13 +300,13 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: return [] parsed = [ - (row["hook_name"], FeatureSchema.model_validate(row["feature_schema"]), row["pg_table"]) + (row["hook_name"], FeatureSchema.model_validate(row["feature_schema"])) for row in catalog_rows ] count_parts = [] - for hook_name, schema, pg_table in parsed: - ft = build_feature_table(pg_table, schema) + for hook_name, schema in parsed: + ft = build_feature_table(hook_name, schema) count_parts.append( select( literal(hook_name).label("hook_name"), @@ -322,7 +322,7 @@ async def get_feature_catalog(self) -> list[FeatureCatalogEntry]: columns=_to_column_info(schema.columns), record_count=counts_by_hook.get(hook_name, 0), ) - for hook_name, schema, _pg_table in parsed + for hook_name, schema in parsed ] async def get_feature_table_schema(self, hook_name: str) -> FeatureCatalogEntry | None: @@ -346,7 +346,7 @@ async def search_features( self, hook_name: str, filter_expr: FilterExpr | None, - schema_srn: SchemaSRN | None, + schema_id: SchemaId | None, record_srn: RecordSRN | None, sort: str, order: SortOrder, @@ -354,22 +354,20 @@ async def search_features( limit: int, ) -> list[FeatureRow]: pg_table_stmt = select( - feature_tables_table.c.pg_table, feature_tables_table.c.feature_schema, ).where(feature_tables_table.c.hook_name == hook_name) pg_result = await self.session.execute(pg_table_stmt) pg_row = pg_result.mappings().first() if pg_row is None: return [] - pg_table: str = pg_row["pg_table"] schema = FeatureSchema.model_validate(pg_row["feature_schema"]) - ft = build_feature_table(pg_table, schema) + ft = build_feature_table(hook_name, schema) metadata_table = None metadata_schema: MetadataSchema | None = None - if schema_srn is not None: - catalog = await self._metadata_catalog_for(schema_srn) + if schema_id is not None: + catalog = await self._metadata_catalog_for(schema_id) if catalog is not None: metadata_schema = MetadataSchema.model_validate(catalog["metadata_schema"]) metadata_table = build_metadata_table(catalog["pg_table"], 
metadata_schema) @@ -448,13 +446,11 @@ async def search_features( # ---------------- compilation helpers ---------------- - async def _metadata_catalog_for(self, schema_srn: SchemaSRN) -> dict[str, Any] | None: - """Look up the metadata table catalog row for a Schema SRN.""" - identity = str(schema_srn).split("@", 1)[0] - major = int(schema_srn.version.root.split(".")[0]) + async def _metadata_catalog_for(self, schema_id: SchemaId) -> dict[str, Any] | None: + """Look up the metadata table catalog row for a SchemaId.""" stmt = select(metadata_tables_table).where( - metadata_tables_table.c.schema_identity == identity, - metadata_tables_table.c.schema_major == major, + metadata_tables_table.c.schema_id == schema_id.id.root, + metadata_tables_table.c.schema_major == schema_id.major, ) result = await self.session.execute(stmt) row = result.mappings().first() @@ -472,14 +468,13 @@ async def _collect_feature_joins(self, filter_expr: FilterExpr | None) -> dict[s return {} stmt = select( feature_tables_table.c.hook_name, - feature_tables_table.c.pg_table, feature_tables_table.c.feature_schema, ).where(feature_tables_table.c.hook_name.in_(hooks)) result = await self.session.execute(stmt) joins: dict[str, Any] = {} for row in result.mappings(): schema = FeatureSchema.model_validate(row["feature_schema"]) - joins[row["hook_name"]] = build_feature_table(row["pg_table"], schema) + joins[row["hook_name"]] = build_feature_table(row["hook_name"], schema) missing = hooks - joins.keys() if missing: raise ValidationError( @@ -497,7 +492,6 @@ def _compile_filter_for_records( metadata_t: Any, metadata_schema: MetadataSchema | None, feature_joins: dict[str, Any], - field_types: dict[str, FieldType], ) -> Any: if isinstance(expr, Predicate): return self._compile_predicate( @@ -505,7 +499,6 @@ def _compile_filter_for_records( metadata_t=metadata_t, metadata_schema=metadata_schema, feature_joins=feature_joins, - field_types=field_types, ) if isinstance(expr, And): return and_( @@ -516,7 +509,6 @@ def _compile_filter_for_records( metadata_t=metadata_t, metadata_schema=metadata_schema, feature_joins=feature_joins, - field_types=field_types, ) for op in expr.operands ] @@ -530,7 +522,6 @@ def _compile_filter_for_records( metadata_t=metadata_t, metadata_schema=metadata_schema, feature_joins=feature_joins, - field_types=field_types, ) for op in expr.operands ] @@ -543,7 +534,6 @@ def _compile_filter_for_records( metadata_t=metadata_t, metadata_schema=metadata_schema, feature_joins=feature_joins, - field_types=field_types, ) ) raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") @@ -562,7 +552,7 @@ def _compile_filter_for_features( if isinstance(expr.field, MetadataFieldRef): if metadata_t is None: raise ValidationError( - f"Metadata ref {expr.field.dotted()!r} requires schema_srn to be set.", + f"Metadata ref {expr.field.dotted()!r} requires schema_id to be set.", field=expr.field.dotted(), code="metadata_ref_requires_schema", ) @@ -629,21 +619,19 @@ def _compile_predicate( metadata_t: Any, metadata_schema: MetadataSchema | None, feature_joins: dict[str, Any], - field_types: dict[str, FieldType], ) -> Any: if isinstance(predicate.field, MetadataFieldRef): - # Prefer the typed projection when a schema is pinned. - if metadata_t is not None and metadata_schema is not None: - col = metadata_t.c[predicate.field.field] - return _apply_scalar_op(col, predicate.op, predicate.value) - # Otherwise compile against the canonical records.metadata JSONB. 
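# A minimal sketch of the compile recursion's shape: simplified node types and
# no metadata/feature table context, but the same one-to-one mapping onto
# SQLAlchemy's boolean operators that _compile_filter_for_records uses above.
from dataclasses import dataclass
from typing import Any
from sqlalchemy import and_, column, not_, or_

@dataclass
class Pred:
    field: str
    value: Any

@dataclass
class AllOf:
    operands: list

@dataclass
class AnyOf:
    operands: list

@dataclass
class Negate:
    operand: Any

def compile_expr(expr: Any) -> Any:
    # Leaves become column comparisons; interior nodes recurse into and_/or_/not_.
    if isinstance(expr, Pred):
        return column(expr.field) == expr.value
    if isinstance(expr, AllOf):
        return and_(*(compile_expr(op) for op in expr.operands))
    if isinstance(expr, AnyOf):
        return or_(*(compile_expr(op) for op in expr.operands))
    if isinstance(expr, Negate):
        return not_(compile_expr(expr.operand))
    raise ValueError(f"Unsupported filter node: {type(expr).__name__}")

tree = AllOf([Pred("species", "Homo sapiens"), Negate(Pred("method", "cryo-EM"))])
print(compile_expr(tree))  # species = :species_1 AND method != :method_1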
- return _apply_jsonb_op( - records_table, - field=predicate.field.field, - op=predicate.op, - value=predicate.value, - field_type=field_types.get(predicate.field.field), - ) + if metadata_t is None or metadata_schema is None: + raise ValidationError( + f"Metadata predicate on {predicate.field.dotted()!r} requires " + "the request to pin a 'schema' ('<id>@<version>'). " + "Unscoped metadata filtering is not supported — the typed table " + "is the only filter path.", + field=predicate.field.dotted(), + code="schema_required_for_metadata_query", + ) + col = metadata_t.c[predicate.field.field] + return _apply_scalar_op(col, predicate.op, predicate.value) assert isinstance(predicate.field, FeatureFieldRef) tbl = feature_joins.get(predicate.field.hook) @@ -657,63 +645,6 @@ async def search_features( return _apply_scalar_op(col, predicate.op, predicate.value) -def _apply_jsonb_op( - records_t: Any, - *, - field: str, - op: FilterOperator, - value: Any, - field_type: FieldType | None, -) -> Any: - """Compile a metadata-field predicate against the canonical ``records.metadata`` JSONB. - - Used when no ``schema_srn`` is pinned (cross-schema / unscoped listings). - Equality uses JSONB containment (GIN-indexed); range ops cast the extracted - text to the appropriate type driven by ``field_type`` when known. - """ - meta = records_t.c.metadata - - if op == FilterOperator.EQ: - return meta.op("@>")(cast(func.json_build_object(field, value), JSONB)) - if op == FilterOperator.NEQ: - return not_(meta.op("@>")(cast(func.json_build_object(field, value), JSONB))) - if op == FilterOperator.IS_NULL: - # Absent key OR present-but-null both count as "null". - return or_(not_(meta.has_key(field)), meta[field].astext.is_(None)) - if op == FilterOperator.IN: - if not isinstance(value, list): - raise ValidationError( - "Operator 'in' requires a list value.", - field=field, - code="invalid_value_for_op", - ) - return meta[field].astext.in_([str(v) for v in value]) - if op == FilterOperator.CONTAINS: - return meta[field].astext.ilike(f"%{_escape_like(str(value))}%", escape="\\") - if op in (FilterOperator.GT, FilterOperator.GTE, FilterOperator.LT, FilterOperator.LTE): - if field_type == FieldType.NUMBER: - col_expr = cast(meta[field].astext, Float) - typed_value: Any = float(value) - elif field_type == FieldType.DATE: - col_expr = cast(meta[field].astext, Date) - typed_value = str(value) - else: - col_expr = cast(meta[field].astext, String) - typed_value = str(value) - if op == FilterOperator.GT: - return col_expr > typed_value - if op == FilterOperator.GTE: - return col_expr >= typed_value - if op == FilterOperator.LT: - return col_expr < typed_value - return col_expr <= typed_value - raise ValidationError( - f"Unsupported operator for JSONB fallback: {op}", - field=field, - code="unsupported_operator", - ) - - def _apply_scalar_op(col: Any, op: FilterOperator, value: Any) -> Any: if op == FilterOperator.EQ: return col == value diff --git a/server/osa/infrastructure/persistence/adapter/readers.py b/server/osa/infrastructure/persistence/adapter/readers.py index 36d066f..a871bf0 100644 --- a/server/osa/infrastructure/persistence/adapter/readers.py +++ b/server/osa/infrastructure/persistence/adapter/readers.py @@ -12,7 +12,7 @@ from osa.domain.semantics.model.ontology import Ontology, Term from osa.domain.semantics.model.schema import Schema from osa.domain.semantics.model.value import FieldDefinition -from osa.domain.shared.model.srn import OntologySRN, SchemaSRN +from osa.domain.shared.model.srn import LocalId, OntologySRN, SchemaId,
Semver from osa.infrastructure.persistence.tables import ( ontologies_table, ontology_terms_table, @@ -20,12 +20,18 @@ ) + +def _where_schema(schema_id: SchemaId): + return (schemas_table.c.id == schema_id.id.root) & ( + schemas_table.c.version == schema_id.version.root + ) + + class SchemaReaderAdapter(SchemaReader): def __init__(self, session: AsyncSession) -> None: self.session = session - async def get_schema(self, srn: SchemaSRN) -> Schema | None: - stmt = select(schemas_table).where(schemas_table.c.srn == str(srn)) + async def get_schema(self, schema_id: SchemaId) -> Schema | None: + stmt = select(schemas_table).where(_where_schema(schema_id)) result = await self.session.execute(stmt) row = result.mappings().first() if not row: @@ -33,14 +39,17 @@ async def get_schema(self, srn: SchemaSRN) -> Schema | None: row_dict = dict(row) fields = [FieldDefinition.model_validate(f) for f in row_dict["fields"]] return Schema( - srn=SchemaSRN.parse(row_dict["srn"]), + id=SchemaId( + id=LocalId(row_dict["id"]), + version=Semver.from_string(row_dict["version"]), + ), title=row_dict["title"], fields=fields, created_at=row_dict["created_at"], ) - async def schema_exists(self, srn: SchemaSRN) -> bool: - stmt = select(schemas_table.c.srn).where(schemas_table.c.srn == str(srn)) + async def schema_exists(self, schema_id: SchemaId) -> bool: + stmt = select(schemas_table.c.id).where(_where_schema(schema_id)) result = await self.session.execute(stmt) return result.first() is not None diff --git a/server/osa/infrastructure/persistence/api_naming.py b/server/osa/infrastructure/persistence/api_naming.py new file mode 100644 index 0000000..644f525 --- /dev/null +++ b/server/osa/infrastructure/persistence/api_naming.py @@ -0,0 +1,46 @@ +"""API-to-storage naming translation. + +The API surface and the PG storage layout deliberately share names today — +feature references in the discovery wire format say ``features.<hook>.<column>`` +and that maps cleanly onto ``features."<hook>".<column>`` in PostgreSQL. The +metadata tables in the ``metadata`` PG schema likewise mirror the API's +``metadata.<field>`` prefix. + +This module is the seam between the API and the storage layer. Callers route +through these functions so that if the API naming ever needs to diverge from +the PG layout (API rename; storage consolidation; federation-driven rename), +the translation lives here rather than being sprinkled through adapters and +stores. + +All functions are identity implementations today. The point is to *mark the +boundary* so it is crossable later, not to make the names different now. +""" + +from __future__ import annotations + + +def feature_pg_schema() -> str: + """PG schema name holding dynamic feature tables. + + Mirrors the API's ``features.*`` prefix today. + """ + return "features" + + +def feature_pg_table(api_feature_name: str) -> str: + """PG table name for a feature referenced by its API name. + + The ``<hook>`` segment of the API path ``features.<hook>.<column>`` maps + to this PG table name. Identity today — the API and PG names are + intentionally aligned for readability. Introduce a real mapping here if + the two ever diverge. + """ + return api_feature_name + + +def metadata_pg_schema() -> str: + """PG schema name holding dynamic per-schema metadata tables. + + Mirrors the API's ``metadata.*`` prefix today.
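# A hedged sketch of what a non-identity mapping through this seam could look
# like if the API and PG names ever diverge. The rename table below is
# hypothetical; nothing in this patch defines it.
_FEATURE_RENAMES: dict[str, str] = {"cell-classifier": "cell_classifier"}  # hypothetical

def feature_pg_table_diverged(api_feature_name: str) -> str:
    # Same signature as feature_pg_table above; unknown names fall back to identity.
    return _FEATURE_RENAMES.get(api_feature_name, api_feature_name)

assert feature_pg_table_diverged("cell-classifier") == "cell_classifier"
assert feature_pg_table_diverged("pocket_finder") == "pocket_finder"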
+ """ + return "metadata" diff --git a/server/osa/infrastructure/persistence/feature_store.py b/server/osa/infrastructure/persistence/feature_store.py index b73bfbc..9aa7e0f 100644 --- a/server/osa/infrastructure/persistence/feature_store.py +++ b/server/osa/infrastructure/persistence/feature_store.py @@ -12,8 +12,8 @@ from osa.domain.feature.port.feature_store import FeatureStore from osa.domain.shared.error import ConflictError, ValidationError from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.api_naming import feature_pg_schema, feature_pg_table from osa.infrastructure.persistence.feature_table import ( - FEATURES_SCHEMA, FeatureSchema, build_feature_table, ) @@ -48,7 +48,7 @@ async def create_table(self, hook_name: str, columns: list[ColumnDef]) -> None: async with self._engine.begin() as conn: # Ensure the features schema exists - await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{FEATURES_SCHEMA}"')) + await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{feature_pg_schema()}"')) # Check for existing table in catalog — duplicate is a hard error existing = await conn.execute( @@ -68,7 +68,7 @@ async def create_table(self, hook_name: str, columns: list[ColumnDef]) -> None: await conn.execute( feature_tables_table.insert().values( hook_name=hook_name, - pg_table=hook_name, + pg_table=feature_pg_table(hook_name), feature_schema=schema.model_dump(), schema_version=1, created_at=datetime.now(UTC), @@ -99,11 +99,13 @@ async def insert_features( # Bulk insert in chunks of 1000 chunk_size = 1000 total = 0 + pg_schema = feature_pg_schema() + pg_table = feature_pg_table(hook_name) async with self._engine.begin() as conn: # Reflect the actual table to get correct column types for casts - metadata = sa.MetaData(schema=FEATURES_SCHEMA) - await conn.run_sync(metadata.reflect, only=[hook_name]) - table = metadata.tables[f"{FEATURES_SCHEMA}.{hook_name}"] + metadata = sa.MetaData(schema=pg_schema) + await conn.run_sync(metadata.reflect, only=[pg_table]) + table = metadata.tables[f"{pg_schema}.{pg_table}"] for i in range(0, len(enriched_rows), chunk_size): chunk = enriched_rows[i : i + chunk_size] diff --git a/server/osa/infrastructure/persistence/feature_table.py b/server/osa/infrastructure/persistence/feature_table.py index 32aa6ba..eff08c8 100644 --- a/server/osa/infrastructure/persistence/feature_table.py +++ b/server/osa/infrastructure/persistence/feature_table.py @@ -6,10 +6,13 @@ from osa.domain.shared.model.hook import ColumnDef from osa.domain.shared.model.value import ValueObject +from osa.infrastructure.persistence.api_naming import feature_pg_schema, feature_pg_table from osa.infrastructure.persistence.column_mapper import map_column from osa.infrastructure.persistence.tables import records_table -FEATURES_SCHEMA = "features" +# Back-compat re-export for callers that import the constant directly. +# Prefer ``feature_pg_schema()`` in new code. +FEATURES_SCHEMA = feature_pg_schema() AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) @@ -23,11 +26,15 @@ class FeatureSchema(ValueObject): columns: list[ColumnDef] = [] -def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: +def build_feature_table(api_feature_name: str, schema: FeatureSchema) -> sa.Table: """Build a SQLAlchemy ``Table`` for a dynamic feature table. + *api_feature_name* is the ```` segment from the API's + ``features..`` path. The PG table name is resolved through + :func:`feature_pg_table` — identity today but kept as a seam. 
+ + Returns a ``Table`` with auto columns (``id``, ``record_srn``, ``created_at``) - plus data columns derived from *schema*, in the ``features`` PG schema. + plus data columns derived from *schema*, in the features PG schema. ``record_srn`` carries an ``ON DELETE CASCADE`` FK to ``records.srn`` — the FK target is the ``Column`` object itself (not a string reference), so @@ -38,7 +45,7 @@ def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: metadata = sa.MetaData() return sa.Table( - pg_table, + feature_pg_table(api_feature_name), metadata, sa.Column("id", sa.BigInteger, primary_key=True, autoincrement=True), sa.Column( @@ -55,7 +62,7 @@ def build_feature_table(pg_table: str, schema: FeatureSchema) -> sa.Table: server_default=sa.func.now(), ), *data_columns, - schema=FEATURES_SCHEMA, + schema=feature_pg_schema(), ) diff --git a/server/osa/infrastructure/persistence/mappers/record.py b/server/osa/infrastructure/persistence/mappers/record.py index 8278e2c..aac791a 100644 --- a/server/osa/infrastructure/persistence/mappers/record.py +++ b/server/osa/infrastructure/persistence/mappers/record.py @@ -1,10 +1,9 @@ """Record mapper - converts between domain and persistence. -Feature 076 adds ``schema_srn`` as a first-class linkage and keeps ``metadata`` -as the canonical JSONB store. The typed ``metadata.<slug>_v<major>`` -table is a discovery-optimized projection maintained asynchronously by the -``InsertRecordMetadata`` event handler; it is not the source of truth for -record metadata. +Feature 076 adds ``schema_id`` + ``schema_version`` columns so a Record's +typed linkage is first-class. ``metadata`` remains the canonical JSONB store; +the typed ``metadata.<slug>_v<major>`` table is a discovery-optimized +projection maintained asynchronously by ``InsertRecordMetadata``. """ from datetime import datetime @@ -14,7 +13,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.model.source import RecordSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, LocalId, RecordSRN, SchemaId, Semver _source_adapter = TypeAdapter(RecordSource) @@ -31,7 +30,10 @@ def row_to_record(row: dict[str, Any]) -> Record: srn=RecordSRN.parse(row["srn"]), source=source, convention_srn=ConventionSRN.parse(row["convention_srn"]), - schema_srn=SchemaSRN.parse(row["schema_srn"]), + schema_id=SchemaId( + id=LocalId(row["schema_id"]), + version=Semver.from_string(row["schema_version"]), + ), metadata=row.get("metadata") or {}, published_at=published_at, ) @@ -42,7 +44,8 @@ def record_to_dict(record: Record) -> dict[str, Any]: return { "srn": str(record.srn), "convention_srn": str(record.convention_srn), - "schema_srn": str(record.schema_srn), + "schema_id": record.schema_id.id.root, + "schema_version": record.schema_id.version.root, "source": _source_adapter.dump_python(record.source, mode="json"), "metadata": record.metadata, "published_at": record.published_at, diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py index a18e174..68be0ee 100644 --- a/server/osa/infrastructure/persistence/metadata_store.py +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -1,14 +1,14 @@ """PostgreSQL implementation of MetadataStore. -Schema-keyed DDL lifecycle: one metadata table per (schema_identity, major -version) pair.
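# Why the dynamic-table builders pass the Column object (not the string
# "records.srn") as the FK target, sketched standalone: a string reference is
# resolved against the dynamic table's own MetaData, which does not contain
# records, while a Column object binds across MetaData instances.
import sqlalchemy as sa

records_md = sa.MetaData()
records = sa.Table("records", records_md, sa.Column("srn", sa.String, primary_key=True))

dynamic_md = sa.MetaData()  # separate MetaData, as in build_feature_table
feat = sa.Table(
    "cell_classifier",
    dynamic_md,
    sa.Column(
        "record_srn",
        sa.String,
        sa.ForeignKey(records.c.srn, ondelete="CASCADE"),
        nullable=False,
    ),
    schema="features",
)
assert next(iter(feat.c.record_srn.foreign_keys)).column is records.c.srn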
The catalog row in ``public.metadata_tables`` is updated in -lock-step with ALTER ADD COLUMN operations so reads can reconstruct the -dynamic table shape without reflection. +Schema-keyed DDL lifecycle: one metadata table per ``(schema_id, major)`` +pair. The catalog row in ``public.metadata_tables`` is updated in lock-step +with ALTER ADD COLUMN operations so reads can reconstruct the dynamic table +shape without reflection. """ from __future__ import annotations -from datetime import UTC, datetime +from datetime import UTC, date, datetime from typing import Any, Literal, Sequence import sqlalchemy as sa @@ -20,10 +20,10 @@ from osa.domain.semantics.model.value import FieldDefinition, FieldType from osa.domain.shared.error import ValidationError from osa.domain.shared.model.hook import ColumnDef -from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaId +from osa.infrastructure.persistence.api_naming import metadata_pg_schema from osa.infrastructure.persistence.column_mapper import map_column from osa.infrastructure.persistence.metadata_table import ( - METADATA_SCHEMA, MetadataSchema, build_metadata_table, schema_slug, @@ -61,12 +61,6 @@ def _field_to_column(field: FieldDefinition) -> ColumnDef: ) -def _identity_of(schema_srn: SchemaSRN) -> str: - """Return the version-stripped schema SRN (the schema identity).""" - rendered = str(schema_srn) - return rendered.split("@", 1)[0] - - class PostgresMetadataStore(MetadataStore): """DDL + DML for per-schema typed metadata tables.""" @@ -76,26 +70,25 @@ def __init__(self, engine: AsyncEngine, session: AsyncSession) -> None: async def ensure_table( self, - schema_srn: SchemaSRN, - schema_title: str, + schema_id: SchemaId, fields: list[FieldDefinition], ) -> None: - identity = _identity_of(schema_srn) - major = int(schema_srn.version.root.split(".")[0]) - slug = schema_slug(schema_title) + id_str = schema_id.id.root + major = schema_id.major + slug = schema_slug(id_str) pg_table = f"{slug}_v{major}" columns = [_field_to_column(f) for f in fields] metadata_schema = MetadataSchema(columns=columns) async with self._engine.begin() as conn: - await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{METADATA_SCHEMA}"')) + await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{metadata_pg_schema()}"')) existing = ( ( await conn.execute( select(metadata_tables_table).where( - metadata_tables_table.c.schema_identity == identity, + metadata_tables_table.c.schema_id == id_str, metadata_tables_table.c.schema_major == major, ) ) @@ -110,10 +103,10 @@ async def ensure_table( now = datetime.now(UTC) await conn.execute( metadata_tables_table.insert().values( - schema_identity=identity, + schema_id=id_str, schema_slug=slug, schema_major=major, - schema_versions=[str(schema_srn)], + schema_versions=[schema_id.render()], pg_table=pg_table, metadata_schema=metadata_schema.model_dump(), created_at=now, @@ -132,9 +125,10 @@ async def ensure_table( new_columns = [ c for c in columns if c.name not in {s.name for s in stored_schema.columns} ] + rendered = schema_id.render() if not new_columns: - if str(schema_srn) not in stored_versions: - stored_versions.append(str(schema_srn)) + if rendered not in stored_versions: + stored_versions.append(rendered) await conn.execute( metadata_tables_table.update() .where(metadata_tables_table.c.id == existing["id"]) @@ -150,8 +144,8 @@ async def ensure_table( await conn.execute(text(_alter_add_column_stmt(pg_table, col_def))) merged_columns = stored_schema.columns + 
new_columns - if str(schema_srn) not in stored_versions: - stored_versions.append(str(schema_srn)) + if rendered not in stored_versions: + stored_versions.append(rendered) await conn.execute( metadata_tables_table.update() .where(metadata_tables_table.c.id == existing["id"]) @@ -164,18 +158,18 @@ async def ensure_table( async def insert( self, - schema_srn: SchemaSRN, + schema_id: SchemaId, record_srn: RecordSRN, values: dict[str, Any], ) -> None: - identity = _identity_of(schema_srn) - major = int(schema_srn.version.root.split(".")[0]) + id_str = schema_id.id.root + major = schema_id.major catalog_row = ( ( await self._session.execute( select(metadata_tables_table).where( - metadata_tables_table.c.schema_identity == identity, + metadata_tables_table.c.schema_id == id_str, metadata_tables_table.c.schema_major == major, ) ) @@ -186,18 +180,23 @@ async def insert( if catalog_row is None: raise ValidationError( - f"No metadata table registered for schema {schema_srn} " - f"(identity={identity}, major={major}). " + f"No metadata table registered for schema {schema_id.render()} " + f"(id={id_str}, major={major}). " "Ensure the convention has been registered first.", - field="schema_srn", + field="schema_id", ) schema = MetadataSchema.model_validate(catalog_row["metadata_schema"]) pg_table = catalog_row["pg_table"] table = build_metadata_table(pg_table, schema) - known = {c.name for c in schema.columns} - payload = {k: v for k, v in values.items() if k in known} + col_by_name = {c.name: c for c in schema.columns} + payload: dict[str, Any] = {} + for k, v in values.items(): + col = col_by_name.get(k) + if col is None: + continue + payload[k] = _coerce_value(col, v) payload["record_srn"] = str(record_srn) stmt = insert(table).values(**payload) @@ -251,11 +250,27 @@ def _alter_add_column_stmt(pg_table: str, col_def: ColumnDef) -> str: sql_type = _column_type_sql(map_column(col_def).type) null_sql = "" if not col_def.required else " NOT NULL" return ( - f'ALTER TABLE "{METADATA_SCHEMA}"."{pg_table}" ' + f'ALTER TABLE "{metadata_pg_schema()}"."{pg_table}" ' f'ADD COLUMN IF NOT EXISTS "{col_def.name}" {sql_type}{null_sql}' ) +def _coerce_value(col: ColumnDef, value: Any) -> Any: + """Coerce a JSONB-read value to match its typed PG column. + + ``records.metadata`` is JSONB, so date/datetime fields come back as ISO + strings. asyncpg won't auto-parse those for DATE / TIMESTAMP columns — + we parse here based on the declared column format. 
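# The additive-evolution DDL rendered by _alter_add_column_stmt above, by
# example. Table and column names are taken from the tests further down;
# "text" assumes the TEXT field mapping in column_mapper.
def alter_add_column(pg_table: str, name: str, sql_type: str, required: bool) -> str:
    null_sql = " NOT NULL" if required else ""
    return (
        f'ALTER TABLE "metadata"."{pg_table}" '
        f'ADD COLUMN IF NOT EXISTS "{name}" {sql_type}{null_sql}'
    )

# An optional column added on a 1.0.0 -> 1.1.0 bump stays nullable; a required
# new field is rejected earlier as non-additive, so NOT NULL never lands on a
# table that already holds rows.
assert alter_add_column("bio_sample_v1", "collection_site", "text", required=False) == (
    'ALTER TABLE "metadata"."bio_sample_v1" '
    'ADD COLUMN IF NOT EXISTS "collection_site" text'
)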
+ """ + if value is None: + return None + if col.json_type == "string" and col.format == "date": + return value if isinstance(value, date) else date.fromisoformat(value) + if col.json_type == "string" and col.format == "date-time": + return value if isinstance(value, datetime) else datetime.fromisoformat(value) + return value + + def _column_type_sql(sa_type: Any) -> str: if isinstance(sa_type, sa.Text): return "text" diff --git a/server/osa/infrastructure/persistence/metadata_table.py b/server/osa/infrastructure/persistence/metadata_table.py index 239a3fb..ba9861d 100644 --- a/server/osa/infrastructure/persistence/metadata_table.py +++ b/server/osa/infrastructure/persistence/metadata_table.py @@ -13,10 +13,13 @@ from osa.domain.shared.model.hook import ColumnDef from osa.domain.shared.model.value import ValueObject +from osa.infrastructure.persistence.api_naming import metadata_pg_schema from osa.infrastructure.persistence.column_mapper import map_column from osa.infrastructure.persistence.tables import records_table -METADATA_SCHEMA = "metadata" +# Back-compat re-export for callers that import the constant directly. +# Prefer ``metadata_pg_schema()`` in new code. +METADATA_SCHEMA = metadata_pg_schema() AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) @@ -76,7 +79,7 @@ def build_metadata_table(pg_table: str, schema: MetadataSchema) -> sa.Table: server_default=sa.func.now(), ), *data_columns, - schema=METADATA_SCHEMA, + schema=metadata_pg_schema(), ) diff --git a/server/osa/infrastructure/persistence/repository/convention.py b/server/osa/infrastructure/persistence/repository/convention.py index 626f4cc..c2cd050 100644 --- a/server/osa/infrastructure/persistence/repository/convention.py +++ b/server/osa/infrastructure/persistence/repository/convention.py @@ -8,7 +8,7 @@ from osa.domain.deposition.port.convention_repository import ConventionRepository from osa.domain.shared.model.hook import HookDefinition from osa.domain.shared.model.source import IngesterDefinition -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, LocalId, SchemaId, Semver from osa.infrastructure.persistence.tables import conventions_table @@ -17,7 +17,8 @@ def _convention_to_row(convention: Convention) -> dict[str, Any]: "srn": str(convention.srn), "title": convention.title, "description": convention.description, - "schema_srn": str(convention.schema_srn), + "schema_id": convention.schema_id.id.root, + "schema_version": convention.schema_id.version.root, "file_requirements": convention.file_requirements.model_dump(), "hooks": [h.model_dump() for h in convention.hooks], "source": convention.ingester.model_dump() if convention.ingester else None, @@ -31,7 +32,10 @@ def _row_to_convention(row: dict[str, Any]) -> Convention: srn=ConventionSRN.parse(row["srn"]), title=row["title"], description=row.get("description"), - schema_srn=SchemaSRN.parse(row["schema_srn"]), + schema_id=SchemaId( + id=LocalId(row["schema_id"]), + version=Semver.from_string(row["schema_version"]), + ), file_requirements=FileRequirements.model_validate(row["file_requirements"]), hooks=[HookDefinition.model_validate(h) for h in (row.get("hooks") or [])], ingester=IngesterDefinition.model_validate(source_data) if source_data else None, diff --git a/server/osa/infrastructure/persistence/repository/schema.py b/server/osa/infrastructure/persistence/repository/schema.py index 135fc10..42d911a 100644 --- a/server/osa/infrastructure/persistence/repository/schema.py +++ 
b/server/osa/infrastructure/persistence/repository/schema.py @@ -1,18 +1,19 @@ from typing import Any, List -from sqlalchemy import insert, select +from sqlalchemy import and_, insert, select from sqlalchemy.ext.asyncio import AsyncSession from osa.domain.semantics.model.schema import Schema from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.port.schema_repository import SchemaRepository -from osa.domain.shared.model.srn import SchemaSRN +from osa.domain.shared.model.srn import LocalId, SchemaId, Semver from osa.infrastructure.persistence.tables import schemas_table def _schema_to_row(schema: Schema) -> dict[str, Any]: return { - "srn": str(schema.srn), + "id": schema.id.id.root, + "version": schema.id.version.root, "title": schema.title, "fields": [f.model_dump(mode="json") for f in schema.fields], "created_at": schema.created_at, @@ -22,13 +23,20 @@ def _schema_to_row(schema: Schema) -> dict[str, Any]: def _row_to_schema(row: dict[str, Any]) -> Schema: fields = [FieldDefinition.model_validate(f) for f in row["fields"]] return Schema( - srn=SchemaSRN.parse(row["srn"]), + id=SchemaId(id=LocalId(row["id"]), version=Semver.from_string(row["version"])), title=row["title"], fields=fields, created_at=row["created_at"], ) +def _where_schema_id(schema_id: SchemaId) -> Any: + return and_( + schemas_table.c.id == schema_id.id.root, + schemas_table.c.version == schema_id.version.root, + ) + + class PostgresSemanticsSchemaRepository(SchemaRepository): def __init__(self, session: AsyncSession) -> None: self.session = session @@ -38,8 +46,8 @@ async def save(self, schema: Schema) -> None: await self.session.execute(insert(schemas_table).values(**row)) await self.session.flush() - async def get(self, srn: SchemaSRN) -> Schema | None: - stmt = select(schemas_table).where(schemas_table.c.srn == str(srn)) + async def get(self, schema_id: SchemaId) -> Schema | None: + stmt = select(schemas_table).where(_where_schema_id(schema_id)) result = await self.session.execute(stmt) row = result.mappings().first() return _row_to_schema(dict(row)) if row else None @@ -54,7 +62,7 @@ async def list(self, *, limit: int | None = None, offset: int | None = None) -> result = await self.session.execute(stmt) return [_row_to_schema(dict(r)) for r in result.mappings().all()] - async def exists(self, srn: SchemaSRN) -> bool: - stmt = select(schemas_table.c.srn).where(schemas_table.c.srn == str(srn)) + async def exists(self, schema_id: SchemaId) -> bool: + stmt = select(schemas_table.c.id).where(_where_schema_id(schema_id)) result = await self.session.execute(stmt) return result.first() is not None diff --git a/server/osa/infrastructure/persistence/tables.py b/server/osa/infrastructure/persistence/tables.py index 42dc3f4..b8b8d22 100644 --- a/server/osa/infrastructure/persistence/tables.py +++ b/server/osa/infrastructure/persistence/tables.py @@ -66,14 +66,15 @@ metadata, Column("srn", String, primary_key=True), Column("convention_srn", Text, nullable=False), - Column("schema_srn", Text, nullable=False), + Column("schema_id", Text, nullable=False), + Column("schema_version", Text, nullable=False), Column("source", JSONB, nullable=False), Column("metadata", JSONB, nullable=False), Column("published_at", DateTime(timezone=True), nullable=False), ) Index("idx_records_convention_srn", records_table.c.convention_srn) -Index("idx_records_schema_srn", records_table.c.schema_srn) +Index("idx_records_schema_id", records_table.c.schema_id) Index( "uq_records_source", 
records_table.c.source["type"].as_string(), @@ -257,12 +258,15 @@ schemas_table = Table( "schemas", metadata, - Column("srn", String, primary_key=True), # Versioned SRN string + Column("id", String, primary_key=True, nullable=False), + Column("version", String, primary_key=True, nullable=False), Column("title", String(255), nullable=False), Column("fields", JSON, nullable=False), # List of FieldDefinition dicts Column("created_at", DateTime(timezone=True), nullable=False), ) +Index("idx_schemas_id", schemas_table.c.id) + # ============================================================================ # CONVENTIONS TABLE (Deposition) @@ -270,10 +274,11 @@ conventions_table = Table( "conventions", metadata, - Column("srn", String, primary_key=True), # Versioned SRN string + Column("srn", String, primary_key=True), # Convention SRN stays as-is (published artifact) Column("title", String(255), nullable=False), Column("description", Text, nullable=True), - Column("schema_srn", String, nullable=False), # Reference to schemas.srn + Column("schema_id", String, nullable=False), + Column("schema_version", String, nullable=False), Column("file_requirements", JSON, nullable=False), # FileRequirements as dict Column("hooks", JSON, nullable=False, default=[]), # List of HookDefinition dicts Column("source", JSON, nullable=True), # IngesterDefinition as dict @@ -304,7 +309,7 @@ "metadata_tables", metadata, Column("id", Integer, primary_key=True, autoincrement=True), - Column("schema_identity", Text, nullable=False), + Column("schema_id", Text, nullable=False), Column("schema_slug", Text, nullable=False), Column("schema_major", Integer, nullable=False), Column("schema_versions", JSONB, nullable=False), @@ -312,7 +317,7 @@ Column("metadata_schema", JSONB, nullable=False), Column("created_at", DateTime(timezone=True), nullable=False), Column("updated_at", DateTime(timezone=True), nullable=False), - UniqueConstraint("schema_identity", "schema_major", name="uq_metadata_tables_identity_major"), + UniqueConstraint("schema_id", "schema_major", name="uq_metadata_tables_id_major"), UniqueConstraint("pg_table", name="uq_metadata_tables_pg_table"), ) diff --git a/server/tests/integration/conftest.py b/server/tests/integration/conftest.py index a3c0828..890ccd4 100644 --- a/server/tests/integration/conftest.py +++ b/server/tests/integration/conftest.py @@ -29,7 +29,8 @@ async def seed_record( *, srn: str, convention_srn: str = "urn:osa:localhost:conv:test@1.0.0", - schema_srn: str = "urn:osa:localhost:schema:test@1.0.0", + schema_id: str = "test", + schema_version: str = "1.0.0", source: dict[str, Any] | None = None, metadata: dict[str, Any] | None = None, published_at: datetime | None = None, @@ -44,15 +45,17 @@ async def seed_record( await conn.execute( text( """ - INSERT INTO records (srn, convention_srn, schema_srn, source, metadata, published_at) - VALUES (:srn, :conv, :schema, CAST(:source AS JSONB), - CAST(:meta AS JSONB), :published_at) + INSERT INTO records (srn, convention_srn, schema_id, schema_version, + source, metadata, published_at) + VALUES (:srn, :conv, :schema_id, :schema_version, + CAST(:source AS JSONB), CAST(:meta AS JSONB), :published_at) """ ), { "srn": srn, "conv": convention_srn, - "schema": schema_srn, + "schema_id": schema_id, + "schema_version": schema_version, "source": json.dumps(src), "meta": json.dumps(metadata or {}), "published_at": published_at or datetime.now(UTC), diff --git a/server/tests/integration/persistence/test_convention_repo.py 
b/server/tests/integration/persistence/test_convention_repo.py index 632eb79..0e5e818 100644 --- a/server/tests/integration/persistence/test_convention_repo.py +++ b/server/tests/integration/persistence/test_convention_repo.py @@ -20,7 +20,7 @@ IngesterScheduleConfig, InitialRunConfig, ) -from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, SchemaId from osa.infrastructure.persistence.repository.convention import ( PostgresConventionRepository, ) @@ -30,7 +30,7 @@ def _make_convention( *, srn: str = "urn:osa:localhost:conv:test-convention-001@1.0.0", title: str = "Test Convention", - schema_srn: str = "urn:osa:localhost:schema:test-schema-001@1.0.0", + schema_id: str = "test-schema-001@1.0.0", hooks: list[HookDefinition] | None = None, ingester: IngesterDefinition | None = None, ) -> Convention: @@ -38,7 +38,7 @@ def _make_convention( srn=ConventionSRN.parse(srn), title=title, description="A test convention for integration tests", - schema_srn=SchemaSRN.parse(schema_srn), + schema_id=SchemaId.parse(schema_id), file_requirements=FileRequirements( accepted_types=[".csv", ".h5ad"], min_count=1, @@ -99,7 +99,7 @@ async def test_save_and_get(self, pg_session: AsyncSession): assert str(got.srn) == str(conv.srn) assert got.title == conv.title assert got.description == conv.description - assert str(got.schema_srn) == str(conv.schema_srn) + assert str(got.schema_id) == str(conv.schema_id) assert got.file_requirements == conv.file_requirements assert len(got.hooks) == 1 assert got.hooks[0].runtime.image == hook.runtime.image diff --git a/server/tests/integration/persistence/test_discovery_pagination.py b/server/tests/integration/persistence/test_discovery_pagination.py index a45f4ea..49b2fb8 100644 --- a/server/tests/integration/persistence/test_discovery_pagination.py +++ b/server/tests/integration/persistence/test_discovery_pagination.py @@ -23,7 +23,8 @@ async def _insert_record(session: AsyncSession, srn: str, published_at: datetime records_table.insert().values( srn=srn, convention_srn="urn:osa:localhost:conv:test@1.0.0", - schema_srn="urn:osa:localhost:schema:test@1.0.0", + schema_id="test", + schema_version="1.0.0", source={"type": "test", "id": srn}, metadata={}, published_at=published_at, @@ -45,7 +46,7 @@ async def test_second_page_with_published_at_cursor(self, pg_session: AsyncSessi first_page = await store.search_records( filter_expr=None, - schema_srn=None, + schema_id=None, convention_srn=None, text_fields=[], q=None, @@ -64,7 +65,7 @@ async def test_second_page_with_published_at_cursor(self, pg_session: AsyncSessi second_page = await store.search_records( filter_expr=None, - schema_srn=None, + schema_id=None, convention_srn=None, text_fields=[], q=None, diff --git a/server/tests/integration/persistence/test_metadata_store.py b/server/tests/integration/persistence/test_metadata_store.py index 60d8349..42bb71f 100644 --- a/server/tests/integration/persistence/test_metadata_store.py +++ b/server/tests/integration/persistence/test_metadata_store.py @@ -6,16 +6,16 @@ from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType from osa.domain.shared.error import ValidationError -from osa.domain.shared.model.srn import RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import RecordSRN, SchemaId from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA from tests.integration.conftest import 
seed_record -SCHEMA_IDENTITY = "urn:osa:localhost:schema:bio-sample" -SCHEMA_V1 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.0.0") -SCHEMA_V11 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.1.0") -SCHEMA_V2 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@2.0.0") +SCHEMA_ID = "bio-sample" +SCHEMA_V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0") +SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0") +SCHEMA_V2 = SchemaId.parse(f"{SCHEMA_ID}@2.0.0") def _fields_v1() -> list[FieldDefinition]: @@ -95,7 +95,7 @@ async def test_creates_table_and_catalog_row( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) assert await _table_exists(pg_engine, "bio_sample_v1") cols = await _column_names(pg_engine, "bio_sample_v1") @@ -106,14 +106,14 @@ async def test_creates_table_and_catalog_row( row = ( await conn.execute( text( - "SELECT schema_identity, schema_major, pg_table, schema_versions " - "FROM metadata_tables WHERE schema_identity = :id" + "SELECT schema_id, schema_major, pg_table, schema_versions " + "FROM metadata_tables WHERE schema_id = :id" ), - {"id": SCHEMA_IDENTITY}, + {"id": SCHEMA_ID}, ) ).first() assert row is not None - assert row[0] == SCHEMA_IDENTITY + assert row[0] == SCHEMA_ID assert row[1] == 1 assert row[2] == "bio_sample_v1" assert str(SCHEMA_V1) in row[3] @@ -122,15 +122,15 @@ async def test_idempotent_on_same_version( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) # Second call with same SRN should not raise and should not duplicate catalog rows. 
- await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) async with pg_engine.begin() as conn: count = ( await conn.execute( - text("SELECT COUNT(*) FROM metadata_tables WHERE schema_identity = :id"), - {"id": SCHEMA_IDENTITY}, + text("SELECT COUNT(*) FROM metadata_tables WHERE schema_id = :id"), + {"id": SCHEMA_ID}, ) ).scalar() assert count == 1 @@ -139,7 +139,7 @@ async def test_foreign_key_cascade_on_record_srn( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) async with pg_engine.begin() as conn: constraint = ( @@ -162,10 +162,15 @@ async def test_foreign_key_cascade_on_record_srn( class TestInsert: async def test_insert_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) record_srn = RecordSRN.parse("urn:osa:localhost:rec:abc@1") - await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) await store.insert( SCHEMA_V1, @@ -192,10 +197,15 @@ async def test_insert_is_idempotent_on_duplicate_delivery( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1") - await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) await store.insert(SCHEMA_V1, record_srn, {"species": "Mus musculus", "resolution": 1.0}) @@ -209,14 +219,71 @@ async def test_insert_is_idempotent_on_duplicate_delivery( ).scalar() assert count == 1 + async def test_insert_coerces_iso_date_string_to_date_column( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """JSONB-stored metadata hands date/datetime values back as ISO strings; + asyncpg won't auto-parse those to DATE / TIMESTAMP. The store must + coerce them based on the declared column format.""" + fields = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="collected_on", + type=FieldType.DATE, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + dated_schema = SchemaId.parse("dated-sample@1.0.0") + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(dated_schema, fields) + + record_srn = RecordSRN.parse("urn:osa:localhost:rec:dated@1") + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=dated_schema.id.root, + schema_version=dated_schema.version.root, + ) + + # Value as it would arrive from records.metadata JSONB — a string, + # not a datetime.date. Must not raise. 
+ await store.insert( + dated_schema, + record_srn, + {"species": "Homo sapiens", "collected_on": "2026-04-17"}, + ) + await pg_session.commit() + + async with pg_engine.begin() as conn: + row = ( + await conn.execute( + text(f'SELECT collected_on FROM "{METADATA_SCHEMA}"."dated_sample_v1"') + ) + ).first() + from datetime import date + + assert row is not None + assert row[0] == date(2026, 4, 17) + async def test_cascade_delete_removes_metadata_row( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) record_srn = RecordSRN.parse("urn:osa:localhost:rec:cascade@1") - await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1)) + await seed_record( + pg_engine, + srn=str(record_srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) await store.insert(SCHEMA_V1, record_srn, {"species": "Cascade", "resolution": 0.1}) await pg_session.commit() @@ -238,27 +305,27 @@ async def test_cascade_delete_removes_metadata_row( class TestAdditiveEvolution: async def test_add_column_on_minor_bump(self, pg_engine: AsyncEngine, pg_session: AsyncSession): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) cols_before = await _column_names(pg_engine, "bio_sample_v1") assert "collection_site" not in cols_before - await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11_additive()) + await store.ensure_table(SCHEMA_V11, _fields_v11_additive()) cols_after = await _column_names(pg_engine, "bio_sample_v1") assert "collection_site" in cols_after async def test_catalog_lineage_appended(self, pg_engine: AsyncEngine, pg_session: AsyncSession): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) - await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11_additive()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + await store.ensure_table(SCHEMA_V11, _fields_v11_additive()) async with pg_engine.begin() as conn: versions = ( await conn.execute( text( "SELECT schema_versions FROM metadata_tables " - "WHERE schema_identity = :id AND schema_major = 1" + "WHERE schema_id = :id AND schema_major = 1" ), - {"id": SCHEMA_IDENTITY}, + {"id": SCHEMA_ID}, ) ).scalar() assert str(SCHEMA_V1) in versions @@ -269,16 +336,16 @@ async def test_catalog_lineage_appended(self, pg_engine: AsyncEngine, pg_session class TestNonAdditiveRejection: async def test_rename_raises(self, pg_engine: AsyncEngine, pg_session: AsyncSession): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) with pytest.raises(ValidationError, match="Non-additive"): - await store.ensure_table(SCHEMA_V11, "bio_sample", _fields_rename()) + await store.ensure_table(SCHEMA_V11, _fields_rename()) async def test_required_new_field_raises( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, "bio_sample", _fields_v1()) + await store.ensure_table(SCHEMA_V1, _fields_v1()) bad = _fields_v1() + [ FieldDefinition( @@ -289,4 +356,4 @@ async def test_required_new_field_raises( ) ] with pytest.raises(ValidationError, match="required"): - await 
store.ensure_table(SCHEMA_V11, "bio_sample", bad)
+            await store.ensure_table(SCHEMA_V11, bad)
diff --git a/server/tests/integration/test_discovery_compound_postgres.py b/server/tests/integration/test_discovery_compound_postgres.py
index a34ebca..f752c0f 100644
--- a/server/tests/integration/test_discovery_compound_postgres.py
+++ b/server/tests/integration/test_discovery_compound_postgres.py
@@ -13,13 +13,13 @@
     SortOrder,
 )
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import RecordSRN, SchemaId
 from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore
 from tests.integration.conftest import seed_record

-SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0")

 FIELD_TYPES = {"species": FieldType.TEXT, "resolution": FieldType.NUMBER}

@@ -50,11 +50,11 @@ async def seeded_store(pg_engine: AsyncEngine, pg_session: AsyncSession) -> Post
     )

     store = PostgresMetadataStore(pg_engine, pg_session)
-    await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+    await store.ensure_table(SCHEMA_V1, _fields())

     repo = PostgresSemanticsSchemaRepository(pg_session)
     await repo.save(
-        Schema(srn=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC))
+        Schema(id=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC))
     )

     rows = [
@@ -65,7 +65,12 @@
     ]
     for rid, sp, res in rows:
         srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1")
-        await seed_record(pg_engine, srn=str(srn), schema_srn=str(SCHEMA_V1))
+        await seed_record(
+            pg_engine,
+            srn=str(srn),
+            schema_id=SCHEMA_V1.id.root,
+            schema_version=SCHEMA_V1.version.root,
+        )
         await store.insert(SCHEMA_V1, srn, {"species": sp, "resolution": res})

     await pg_session.commit()
@@ -89,7 +94,7 @@ async def test_or_tree(self, pg_engine: AsyncEngine, pg_session: AsyncSession, s
         )
         results = await read_store.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
@@ -112,7 +117,7 @@
         tree = Not(operand=_pred("species", FilterOperator.EQ, "Homo sapiens"))
         results = await read_store.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
@@ -143,7 +148,7 @@ async def test_nested_and_or(
         )
         results = await read_store.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
diff --git a/server/tests/integration/test_discovery_cross_join_postgres.py b/server/tests/integration/test_discovery_cross_join_postgres.py
index 9e888ab..f979a8f 100644
--- a/server/tests/integration/test_discovery_cross_join_postgres.py
+++ b/server/tests/integration/test_discovery_cross_join_postgres.py
@@ -8,14 +8,14 @@
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
 from osa.domain.shared.error import ValidationError
 from osa.domain.shared.model.hook import ColumnDef
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import RecordSRN, SchemaId
 from osa.infrastructure.persistence.adapter.discovery import PostgresDiscoveryReadStore
 from osa.infrastructure.persistence.feature_store import PostgresFeatureStore
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore
 from tests.integration.conftest import seed_record

-SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0")

 FIELD_TYPES = {"species": FieldType.TEXT}

@@ -46,7 +46,7 @@ async def seeded_both(pg_engine: AsyncEngine, pg_session: AsyncSession):
     )

     mstore = PostgresMetadataStore(pg_engine, pg_session)
-    await mstore.ensure_table(SCHEMA_V1, "bio_sample", _metadata_fields())
+    await mstore.ensure_table(SCHEMA_V1, _metadata_fields())

     fstore = PostgresFeatureStore(pg_engine, pg_session)
     await fstore.create_table("cell_classifier", _feature_columns())
@@ -54,7 +54,7 @@
     repo = PostgresSemanticsSchemaRepository(pg_session)
     await repo.save(
         Schema(
-            srn=SCHEMA_V1,
+            id=SCHEMA_V1,
             title="bio_sample",
             fields=_metadata_fields(),
             created_at=datetime.now(UTC),
@@ -70,7 +70,12 @@
         ("rec-r3", "Mus musculus", 0.95),
     ]:
         srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1")
-        await seed_record(pg_engine, srn=str(srn), schema_srn=str(SCHEMA_V1))
+        await seed_record(
+            pg_engine,
+            srn=str(srn),
+            schema_id=SCHEMA_V1.id.root,
+            schema_version=SCHEMA_V1.version.root,
+        )
         await mstore.insert(SCHEMA_V1, srn, {"species": sp})
         await fstore.insert_features("cell_classifier", str(srn), [{"confidence": conf}])

@@ -100,7 +105,7 @@ async def test_joined_intersection(
         )
         results = await read_store.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
@@ -125,7 +130,7 @@ async def test_unknown_hook_raises(
         with pytest.raises(ValidationError, match="Unknown feature hook"):
             await read_store.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA_V1,
+                schema_id=SCHEMA_V1,
                 convention_srn=None,
                 text_fields=[],
                 q=None,
diff --git a/server/tests/integration/test_discovery_records_typed_and.py b/server/tests/integration/test_discovery_records_typed_and.py
index 4b8ccbf..b9487c5 100644
--- a/server/tests/integration/test_discovery_records_typed_and.py
+++ b/server/tests/integration/test_discovery_records_typed_and.py
@@ -6,7 +6,7 @@
 from osa.domain.discovery.model.refs import MetadataFieldRef
 from osa.domain.discovery.model.value import And, FilterOperator, Predicate, SortOrder
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import RecordSRN, SchemaId
 from osa.infrastructure.persistence.adapter.discovery import (
     PostgresDiscoveryReadStore,
     PostgresFieldDefinitionReader,
@@ -15,7 +15,7 @@
 from tests.integration.conftest import seed_record

-SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0")

 def _fields() -> list[FieldDefinition]:
@@ -52,7 +52,7 @@ async def _seed_schema_row(session: AsyncSession) -> None:
     repo = PostgresSemanticsSchemaRepository(session)
     await repo.save(
-        Schema(srn=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC))
+        Schema(id=SCHEMA_V1, title="bio_sample", fields=_fields(), created_at=datetime.now(UTC))
     )

@@ -68,7 +68,8 @@ async def _publish(
     await seed_record(
         engine,
         srn=str(record_srn),
-        schema_srn=str(SCHEMA_V1),
+        schema_id=SCHEMA_V1.id.root,
+        schema_version=SCHEMA_V1.version.root,
         metadata={"species": species, "resolution": resolution, "method": method},
     )
     await store.insert(
@@ -84,7 +85,7 @@ async def test_and_filter_returns_matching_records(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())
         await _seed_schema_row(pg_session)

         rows = [
@@ -122,7 +123,7 @@ async def test_and_filter_returns_matching_records(

         results = await read_store.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
@@ -145,7 +146,7 @@ async def test_scalar_op_succeeds_on_unindexed_column(
     ):
         """FR-020: scalar ops must NOT be rejected for lack of index."""
         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())
         await _seed_schema_row(pg_session)

         await _publish(
@@ -166,7 +167,7 @@ async def test_scalar_op_succeeds_on_unindexed_column(
                 op=FilterOperator.CONTAINS,
                 value="cryo",
             ),
-            schema_srn=SCHEMA_V1,
+            schema_id=SCHEMA_V1,
             convention_srn=None,
             text_fields=[],
             q=None,
@@ -185,61 +186,58 @@
 @pytest.mark.asyncio
 class TestUnscopedListing:
-    """When no schema_srn is passed, discovery should still return canonical
-    JSONB metadata — the typed table is an optimization, not the sole source."""
+    """Plain listings without a filter return canonical JSONB metadata.
+    Metadata-filtered queries require a pinned schema — the typed table is
+    the only filter path."""

-    async def test_unscoped_predicate_filter_hits_jsonb(
+    async def test_unscoped_predicate_filter_raises_without_schema(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
-        """Filtering by a metadata field without schema_srn must compile
-        against the canonical JSONB column (the Pockets frontend pattern:
-        fetch-by-pdb_id without knowing the schema SRN)."""
+        """Filtering by a metadata field without schema_id must raise —
+        the JSONB fallback compile path was removed."""
         from osa.domain.discovery.model.refs import MetadataFieldRef
         from osa.domain.discovery.model.value import FilterOperator, Predicate
+        from osa.domain.shared.error import ValidationError

         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())
         await _seed_schema_row(pg_session)
-
-        # Two records with distinct pdb-like ids in JSONB; typed table row
-        # written for completeness but not read by this test.
-        for srn_id, species in [("rec-9x1w", "Homo sapiens"), ("rec-8abc", "Mus musculus")]:
-            await _publish(
-                pg_engine,
-                pg_session,
-                store,
-                RecordSRN.parse(f"urn:osa:localhost:rec:{srn_id}@1"),
-                species,
-                3.5,
-                "cryo-EM",
-            )
+        await _publish(
+            pg_engine,
+            pg_session,
+            store,
+            RecordSRN.parse("urn:osa:localhost:rec:rec-9x1w@1"),
+            "Homo sapiens",
+            3.5,
+            "cryo-EM",
+        )
         await pg_session.commit()

         read_store = PostgresDiscoveryReadStore(pg_session)
-        results = await read_store.search_records(
-            filter_expr=Predicate(
-                field=MetadataFieldRef(field="species"),
-                op=FilterOperator.EQ,
-                value="Homo sapiens",
-            ),
-            schema_srn=None,
-            convention_srn=None,
-            text_fields=[],
-            q=None,
-            sort="published_at",
-            order=SortOrder.DESC,
-            cursor=None,
-            limit=10,
-            field_types={"species": FieldType.TEXT},
-        )
-        srns = {str(r.srn) for r in results}
-        assert srns == {"urn:osa:localhost:rec:rec-9x1w@1"}
+        with pytest.raises(ValidationError) as exc:
+            await read_store.search_records(
+                filter_expr=Predicate(
+                    field=MetadataFieldRef(field="species"),
+                    op=FilterOperator.EQ,
+                    value="Homo sapiens",
+                ),
+                schema_id=None,
+                convention_srn=None,
+                text_fields=[],
+                q=None,
+                sort="published_at",
+                order=SortOrder.DESC,
+                cursor=None,
+                limit=10,
+                field_types={"species": FieldType.TEXT},
+            )
+        assert exc.value.code == "schema_required_for_metadata_query"

     async def test_unscoped_listing_returns_jsonb_metadata(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())
         await _seed_schema_row(pg_session)

         await _publish(
@@ -256,7 +254,7 @@ async def test_unscoped_listing_returns_jsonb_metadata(
         read_store = PostgresDiscoveryReadStore(pg_session)
         results = await read_store.search_records(
             filter_expr=None,
-            schema_srn=None,  # deliberately unscoped — exercises the JSONB path
+            schema_id=None,  # deliberately unscoped — exercises the JSONB path
             convention_srn=None,
             text_fields=[],
             q=None,
diff --git a/server/tests/integration/test_ensure_metadata_table.py b/server/tests/integration/test_ensure_metadata_table.py
index 4482c22..86962dd 100644
--- a/server/tests/integration/test_ensure_metadata_table.py
+++ b/server/tests/integration/test_ensure_metadata_table.py
@@ -15,15 +15,15 @@
 from osa.domain.semantics.model.schema import Schema
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
 from osa.domain.shared.event import EventId
-from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, SchemaId
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore
 from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA
 from osa.infrastructure.persistence.repository.convention import PostgresConventionRepository
 from osa.infrastructure.persistence.repository.schema import PostgresSemanticsSchemaRepository

-SCHEMA_IDENTITY = "urn:osa:localhost:schema:bio-sample"
-SCHEMA_V1 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.0.0")
-SCHEMA_V11 = SchemaSRN.parse(f"{SCHEMA_IDENTITY}@1.1.0")
+SCHEMA_ID = "bio-sample"
+SCHEMA_V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0")
+SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0")

 def _fields_v1() -> list[FieldDefinition]:
@@ -49,22 +49,20 @@
 async def _seed_schema(
-    session: AsyncSession, srn: SchemaSRN, fields: list[FieldDefinition], title: str = "bio_sample"
+    session: AsyncSession, srn: SchemaId, fields: list[FieldDefinition], title: str = "bio_sample"
 ) -> None:
     repo = PostgresSemanticsSchemaRepository(session)
-    await repo.save(Schema(srn=srn, title=title, fields=fields, created_at=datetime.now(UTC)))
+    await repo.save(Schema(id=srn, title=title, fields=fields, created_at=datetime.now(UTC)))

-async def _seed_convention(
-    session: AsyncSession, srn: ConventionSRN, schema_srn: SchemaSRN
-) -> None:
+async def _seed_convention(session: AsyncSession, srn: ConventionSRN, schema_id: SchemaId) -> None:
     repo = PostgresConventionRepository(session)
     await repo.save(
         Convention(
             srn=srn,
             title="bio_sample_v1",
             description=None,
-            schema_srn=schema_srn,
+            schema_id=schema_id,
             file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0),
             hooks=[],
             created_at=datetime.now(UTC),
@@ -74,13 +72,13 @@
 def _event(
     convention_srn: ConventionSRN,
-    schema_srn: SchemaSRN,
+    schema_id: SchemaId,
     schema_fields: list[FieldDefinition],
 ) -> ConventionRegistered:
     return ConventionRegistered(
         id=EventId(uuid4()),
         convention_srn=convention_srn,
-        schema_srn=schema_srn,
+        schema_id=schema_id,
         schema_fields=schema_fields,
         hooks=[],
     )
@@ -91,7 +89,6 @@ async def _make_handler(pg_engine: AsyncEngine, pg_session: AsyncSession) -> Ens
     service = MetadataService(metadata_store=store)
     return EnsureMetadataTable(
         metadata_service=service,
-        schema_repo=PostgresSemanticsSchemaRepository(pg_session),
         convention_repo=PostgresConventionRepository(pg_session),
     )
diff --git a/server/tests/integration/test_event_batch_processing.py b/server/tests/integration/test_event_batch_processing.py
index 49c8d7e..e1f6754 100644
--- a/server/tests/integration/test_event_batch_processing.py
+++ b/server/tests/integration/test_event_batch_processing.py
@@ -64,7 +64,7 @@ def make_record_published(
         domain=Domain("test.example.com"),
         id=LocalId(str(uuid4())),
     )
-    from osa.domain.shared.model.srn import SchemaSRN
+    from osa.domain.shared.model.srn import SchemaId

     return RecordPublished(
         id=EventId(uuid4()),
@@ -75,7 +75,7 @@ def make_record_published(
         ),
         source=DepositionSource(id=str(dep_srn)),
         convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-        schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+        schema_id=SchemaId.parse("test@1.0.0"),
         metadata=metadata or {"title": "Test Record"},
     )
diff --git a/server/tests/integration/test_insert_record_metadata.py b/server/tests/integration/test_insert_record_metadata.py
index a677c9d..1145d90 100644
--- a/server/tests/integration/test_insert_record_metadata.py
+++ b/server/tests/integration/test_insert_record_metadata.py
@@ -12,13 +12,13 @@
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
 from osa.domain.shared.event import EventId
 from osa.domain.shared.model.source import DepositionSource
-from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore
 from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA
 from tests.integration.conftest import seed_record

-SCHEMA_V1 = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0")

 CONV_SRN = ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")

@@ -45,7 +45,7 @@ def _event(record_srn: RecordSRN, metadata: dict) -> RecordPublished:
         record_srn=record_srn,
         source=DepositionSource(id="dep-1"),
         convention_srn=CONV_SRN,
-        schema_srn=SCHEMA_V1,
+        schema_id=SCHEMA_V1,
         metadata=metadata,
         expected_features=[],
     )
@@ -55,10 +55,15 @@
 class TestInsertRecordMetadata:
     async def test_insert_creates_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession):
         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())

         record_srn = RecordSRN.parse("urn:osa:localhost:rec:one@1")
-        await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1))
+        await seed_record(
+            pg_engine,
+            srn=str(record_srn),
+            schema_id=SCHEMA_V1.id.root,
+            schema_version=SCHEMA_V1.version.root,
+        )

         handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store))
         await handler.handle(_event(record_srn, {"species": "Homo sapiens", "resolution": 3.5}))
@@ -83,10 +88,15 @@ async def test_duplicate_delivery_is_idempotent(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
         store = PostgresMetadataStore(pg_engine, pg_session)
-        await store.ensure_table(SCHEMA_V1, "bio_sample", _fields())
+        await store.ensure_table(SCHEMA_V1, _fields())

         record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1")
-        await seed_record(pg_engine, srn=str(record_srn), schema_srn=str(SCHEMA_V1))
+        await seed_record(
+            pg_engine,
+            srn=str(record_srn),
+            schema_id=SCHEMA_V1.id.root,
+            schema_version=SCHEMA_V1.version.root,
+        )

         handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store))
         event = _event(record_srn, {"species": "Mus musculus", "resolution": 1.0})
diff --git a/server/tests/integration/test_metadata_additive_evolve_postgres.py b/server/tests/integration/test_metadata_additive_evolve_postgres.py
index d3940b1..9b5f59f 100644
--- a/server/tests/integration/test_metadata_additive_evolve_postgres.py
+++ b/server/tests/integration/test_metadata_additive_evolve_postgres.py
@@ -6,15 +6,15 @@
 from osa.domain.metadata.service.metadata import MetadataService
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import RecordSRN, SchemaId
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore
 from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA
 from tests.integration.conftest import seed_record

-IDENTITY = "urn:osa:localhost:schema:bio-sample"
-SCHEMA_V10 = SchemaSRN.parse(f"{IDENTITY}@1.0.0")
-SCHEMA_V11 = SchemaSRN.parse(f"{IDENTITY}@1.1.0")
+SCHEMA_ID = "bio-sample"
+SCHEMA_V10 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0")
+SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0")

 def _fields_v10() -> list[FieldDefinition]:
@@ -47,16 +47,26 @@ async def test_old_row_null_new_row_typed(
         service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session))

         # Register v1.0.0 and publish a record.
-        await service.ensure_table(SCHEMA_V10, "bio_sample", _fields_v10())
+        await service.ensure_table(SCHEMA_V10, _fields_v10())
         r_old = RecordSRN.parse("urn:osa:localhost:rec:old@1")
-        await seed_record(pg_engine, srn=str(r_old), schema_srn=str(SCHEMA_V10))
+        await seed_record(
+            pg_engine,
+            srn=str(r_old),
+            schema_id=SCHEMA_V10.id.root,
+            schema_version=SCHEMA_V10.version.root,
+        )
         await service.insert(SCHEMA_V10, r_old, {"species": "Mus musculus"})
         await pg_session.commit()

         # Bump to v1.1.0 (additive) and publish another record carrying the new field.
-        await service.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11())
+        await service.ensure_table(SCHEMA_V11, _fields_v11())
         r_new = RecordSRN.parse("urn:osa:localhost:rec:new@1")
-        await seed_record(pg_engine, srn=str(r_new), schema_srn=str(SCHEMA_V11))
+        await seed_record(
+            pg_engine,
+            srn=str(r_new),
+            schema_id=SCHEMA_V11.id.root,
+            schema_version=SCHEMA_V11.version.root,
+        )
         await service.insert(
             SCHEMA_V11, r_new, {"species": "Homo sapiens", "collection_site": "Lab A"}
         )
@@ -89,17 +99,17 @@ async def test_catalog_lineage_has_both_srns(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
         service = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session))
-        await service.ensure_table(SCHEMA_V10, "bio_sample", _fields_v10())
-        await service.ensure_table(SCHEMA_V11, "bio_sample", _fields_v11())
+        await service.ensure_table(SCHEMA_V10, _fields_v10())
+        await service.ensure_table(SCHEMA_V11, _fields_v11())

         async with pg_engine.begin() as conn:
             versions = (
                 await conn.execute(
                     text(
                         "SELECT schema_versions FROM metadata_tables "
-                        "WHERE schema_identity = :id AND schema_major = 1"
+                        "WHERE schema_id = :id AND schema_major = 1"
                     ),
-                    {"id": IDENTITY},
+                    {"id": SCHEMA_ID},
                 )
             ).scalar()
         assert str(SCHEMA_V10) in versions
diff --git a/server/tests/integration/test_non_additive_rejected_postgres.py b/server/tests/integration/test_non_additive_rejected_postgres.py
index 200e571..08eb4a6 100644
--- a/server/tests/integration/test_non_additive_rejected_postgres.py
+++ b/server/tests/integration/test_non_additive_rejected_postgres.py
@@ -6,12 +6,12 @@
 from osa.domain.metadata.service.metadata import MetadataService
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
 from osa.domain.shared.error import ValidationError
-from osa.domain.shared.model.srn import SchemaSRN
+from osa.domain.shared.model.srn import SchemaId
 from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore

-IDENTITY = "urn:osa:localhost:schema:bio-sample"
-V1 = SchemaSRN.parse(f"{IDENTITY}@1.0.0")
-V11 = SchemaSRN.parse(f"{IDENTITY}@1.1.0")
+SCHEMA_ID = "bio-sample"
+V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0")
+V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0")

 def _orig() -> list[FieldDefinition]:
@@ -35,7 +35,7 @@
 class TestNonAdditiveRejected:
     async def test_rename_field_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession):
         svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session))
-        await svc.ensure_table(V1, "bio_sample", _orig())
+        await svc.ensure_table(V1, _orig())

         # New field "organism" is optional so the validator reaches the removal
         # check and reports the dropped "species" field specifically.
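Taken together, the additive-evolution test above and the three rejection tests in this file pin down the rule that ensure_table enforces within a schema major version: new optional columns may be added, and nothing may be removed, retyped, or tightened to required. A minimal sketch of that rule, assuming FieldDefinition exposes name, type, and required attributes (names inferred from the fixtures here; the shipped validator in MetadataService may differ in detail):

    # Hypothetical sketch of the additive-evolution check, not the shipped code.
    from osa.domain.semantics.model.value import FieldDefinition
    from osa.domain.shared.error import ValidationError

    def assert_additive(old: list[FieldDefinition], new: list[FieldDefinition]) -> None:
        old_by_name = {f.name: f for f in old}
        new_by_name = {f.name: f for f in new}
        for name, prev in old_by_name.items():
            nxt = new_by_name.get(name)
            if nxt is None:
                # A rename surfaces as a removal, as test_rename_field_rejected expects.
                raise ValidationError(f"Field '{name}' was removed")
            if nxt.type != prev.type:
                raise ValidationError(f"Field '{name}' changed type")
            if nxt.required and not prev.required:
                raise ValidationError(f"Field '{name}' was tightened to required")
        for name, added in new_by_name.items():
            if name not in old_by_name and added.required:
                # New fields must stay optional so pre-evolution rows can remain NULL.
                raise ValidationError(f"New field '{name}' must be optional")

A passing delta presumably lands as an ALTER TABLE ... ADD COLUMN on the typed metadata table, which is why the pre-evolution row in test_old_row_null_new_row_typed reads back NULL for collection_site.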
@@ -54,13 +54,13 @@ async def test_rename_field_rejected(self, pg_engine: AsyncEngine, pg_session: A
             ),
         ]
         with pytest.raises(ValidationError) as exc:
-            await svc.ensure_table(V11, "bio_sample", renamed)
+            await svc.ensure_table(V11, renamed)
         message = str(exc.value)
         assert "species" in message and "removed" in message

     async def test_type_change_rejected(self, pg_engine: AsyncEngine, pg_session: AsyncSession):
         svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session))
-        await svc.ensure_table(V1, "bio_sample", _orig())
+        await svc.ensure_table(V1, _orig())

         retyped = [
             FieldDefinition(
@@ -78,13 +78,13 @@ async def test_type_change_rejected(self, pg_engine: AsyncEngine, pg_session: As
             ),
         ]
         with pytest.raises(ValidationError, match="resolution"):
-            await svc.ensure_table(V11, "bio_sample", retyped)
+            await svc.ensure_table(V11, retyped)

     async def test_tightening_required_rejected(
         self, pg_engine: AsyncEngine, pg_session: AsyncSession
     ):
         svc = MetadataService(metadata_store=PostgresMetadataStore(pg_engine, pg_session))
-        await svc.ensure_table(V1, "bio_sample", _orig())
+        await svc.ensure_table(V1, _orig())

         tightened = [
             FieldDefinition(
@@ -101,4 +101,4 @@ async def test_tightening_required_rejected(
             ),
         ]
         with pytest.raises(ValidationError, match="resolution"):
-            await svc.ensure_table(V11, "bio_sample", tightened)
+            await svc.ensure_table(V11, tightened)
diff --git a/server/tests/unit/domain/deposition/test_convention.py b/server/tests/unit/domain/deposition/test_convention.py
index b45881d..29bb09b 100644
--- a/server/tests/unit/domain/deposition/test_convention.py
+++ b/server/tests/unit/domain/deposition/test_convention.py
@@ -4,15 +4,15 @@
 from osa.domain.deposition.model.convention import Convention
 from osa.domain.deposition.model.value import FileRequirements
-from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, SchemaId

 def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionSRN:
     return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}")

-def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN:
-    return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}")
+def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId:
+    return SchemaId.parse(f"{id}@{version}")

 def _make_file_reqs() -> FileRequirements:
@@ -29,12 +29,12 @@ def test_create_with_required_fields(self):
         conv = Convention(
             srn=_make_conv_srn(),
             title="scRNA-seq Submission",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             created_at=datetime.now(UTC),
         )
         assert conv.title == "scRNA-seq Submission"
-        assert conv.schema_srn == _make_schema_srn()
+        assert conv.schema_id == _make_schema_id()
         assert conv.file_requirements.max_count == 5

     def test_create_with_description(self):
@@ -42,7 +42,7 @@ def test_create_with_description(self):
             srn=_make_conv_srn(),
             title="Test",
             description="A test convention",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             created_at=datetime.now(UTC),
         )
@@ -52,7 +52,7 @@ def test_create_with_empty_hooks(self):
         conv = Convention(
             srn=_make_conv_srn(),
             title="Test",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             hooks=[],
             created_at=datetime.now(UTC),
@@ -65,7 +65,7 @@ def test_srn_is_versioned(self):
         conv = Convention(
             srn=_make_conv_srn("my-conv", "2.0.0"),
             title="Test",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             created_at=datetime.now(UTC),
         )
diff --git a/server/tests/unit/domain/deposition/test_convention_registered.py b/server/tests/unit/domain/deposition/test_convention_registered.py
index 12cee71..71352e0 100644
--- a/server/tests/unit/domain/deposition/test_convention_registered.py
+++ b/server/tests/unit/domain/deposition/test_convention_registered.py
@@ -14,15 +14,15 @@
     OciConfig,
     TableFeatureSpec,
 )
-from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, SchemaId

 def _make_conv_srn() -> ConventionSRN:
     return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition:
@@ -46,7 +46,7 @@ def test_event_carries_hooks(self):
         event = ConventionRegistered(
             id=EventId(uuid4()),
             convention_srn=_make_conv_srn(),
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             schema_fields=[],
             hooks=hooks,
         )
@@ -60,7 +60,7 @@ def test_event_defaults_to_empty_hooks(self):
         event = ConventionRegistered(
             id=EventId(uuid4()),
             convention_srn=_make_conv_srn(),
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
         )
         assert event.hooks == []

@@ -71,7 +71,7 @@ def test_serialization_with_hooks(self):
         event = ConventionRegistered(
             id=EventId(uuid4()),
             convention_srn=_make_conv_srn(),
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             schema_fields=[],
             hooks=hooks,
         )
diff --git a/server/tests/unit/domain/deposition/test_convention_service.py b/server/tests/unit/domain/deposition/test_convention_service.py
index 7a765d9..d142cd6 100644
--- a/server/tests/unit/domain/deposition/test_convention_service.py
+++ b/server/tests/unit/domain/deposition/test_convention_service.py
@@ -16,15 +16,15 @@
     OciConfig,
     TableFeatureSpec,
 )
-from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaId, SchemaIdentifier

 def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionSRN:
     return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}")

-def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN:
-    return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}")
+def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId:
+    return SchemaId.parse(f"{id}@{version}")

 def _make_field_defs() -> list[FieldDefinition]:
@@ -71,7 +71,8 @@ def _make_service(
     mock_schema_service = schema_service or AsyncMock()
     if not schema_service:
         mock_schema = AsyncMock()
-        mock_schema.srn = _make_schema_srn()
+        mock_schema.id = _make_schema_id()
+        mock_schema.fields = []
         mock_schema_service.create_schema.return_value = mock_schema

     return ConventionService(
@@ -88,11 +89,13 @@ async def test_create_convention_creates_schema(self):
         conv_repo = AsyncMock()
         schema_service = AsyncMock()
         mock_schema = AsyncMock()
-        mock_schema.srn = _make_schema_srn()
+        mock_schema.id = _make_schema_id()
+        mock_schema.fields = []
         schema_service.create_schema.return_value = mock_schema

         service = _make_service(conv_repo, schema_service)
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="Test Convention",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -106,6 +109,7 @@ async def test_create_convention_creates_schema(self):
     async def test_create_convention_generates_srn(self):
         service = _make_service()
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="Test",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -119,6 +123,7 @@ async def test_create_convention_with_hooks_emits_hooks_in_event(self):
         service = _make_service(outbox=outbox)
         hooks = [_make_hook_def()]
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="With Hooks",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -136,6 +141,7 @@ async def test_create_convention_without_hooks_emits_empty_hooks(self):
         outbox = AsyncMock()
         service = _make_service(outbox=outbox)
         await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="No Hooks",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -151,7 +157,7 @@ async def test_get_existing(self):
         conv = Convention(
             srn=_make_conv_srn(),
             title="Test",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             created_at=datetime.now(UTC),
         )
@@ -178,7 +184,7 @@ async def test_list_conventions(self):
         conv = Convention(
             srn=_make_conv_srn(),
             title="Test",
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             file_requirements=_make_file_reqs(),
             created_at=datetime.now(UTC),
         )
diff --git a/server/tests/unit/domain/deposition/test_convention_service_v2.py b/server/tests/unit/domain/deposition/test_convention_service_v2.py
index 9663e22..d9111ad 100644
--- a/server/tests/unit/domain/deposition/test_convention_service_v2.py
+++ b/server/tests/unit/domain/deposition/test_convention_service_v2.py
@@ -15,7 +15,7 @@
     TableFeatureSpec,
 )
 from osa.domain.shared.model.source import IngesterDefinition
-from osa.domain.shared.model.srn import Domain, SchemaSRN
+from osa.domain.shared.model.srn import Domain, SchemaId, SchemaIdentifier

 def _make_field_defs() -> list[FieldDefinition]:
@@ -78,7 +78,7 @@ def _make_service(
-    # Default: create_schema returns a Schema-like obj with .srn
+    # Default: create_schema returns a Schema-like obj with .id
     if not schema_service:
         mock_schema = AsyncMock()
-        mock_schema.srn = SchemaSRN.parse("urn:osa:localhost:schema:testschema12345678@1.0.0")
+        mock_schema.id = SchemaId.parse("testschema12345678@1.0.0")
         mock_schema_service.create_schema.return_value = mock_schema

     return ConventionService(
@@ -94,11 +94,12 @@ class TestCreateConventionWithInlineSchema:
     async def test_creates_schema_from_field_definitions(self):
         schema_service = AsyncMock()
         mock_schema = AsyncMock()
-        mock_schema.srn = SchemaSRN.parse("urn:osa:localhost:schema:testschema12345678@1.0.0")
+        mock_schema.id = SchemaId.parse("testschema12345678@1.0.0")
         schema_service.create_schema.return_value = mock_schema

         service = _make_service(schema_service=schema_service)
         await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="PDB Structures",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -112,27 +113,29 @@ async def test_creates_schema_from_field_definitions(self):
         assert len(call_kwargs[1]["fields"]) == 2

     @pytest.mark.asyncio
-    async def test_convention_references_created_schema_srn(self):
+    async def test_convention_references_created_schema_id(self):
         schema_service = AsyncMock()
-        schema_srn = SchemaSRN.parse("urn:osa:localhost:schema:created123456789@1.0.0")
+        schema_id = SchemaId.parse("created123456789@1.0.0")
         mock_schema = AsyncMock()
-        mock_schema.srn = schema_srn
+        mock_schema.id = schema_id
         schema_service.create_schema.return_value = mock_schema

         service = _make_service(schema_service=schema_service)
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="Test",
             version="1.0.0",
             schema=_make_field_defs(),
             file_requirements=_make_file_reqs(),
         )
-        assert result.schema_srn == schema_srn
+        assert result.schema_id == schema_id

     @pytest.mark.asyncio
     async def test_convention_saves_ingester_definition(self):
         service = _make_service()
         ingester = _make_ingester_def()
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="With Ingester",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -148,6 +151,7 @@ async def test_convention_ingester_defaults_to_none(self):
         service = _make_service()
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="No Ingester",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -161,6 +165,7 @@ async def test_convention_with_hooks_emits_hooks_in_event(self):
         service = _make_service(outbox=outbox)
         hooks = [_make_hook_def()]
         await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="With Hooks",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -178,6 +183,7 @@ async def test_create_convention_emits_convention_registered(self):
         outbox = AsyncMock()
         service = _make_service(outbox=outbox)
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="With Source",
             version="1.0.0",
             schema=_make_field_defs(),
@@ -194,6 +200,7 @@ async def test_create_convention_without_source_still_emits_event(self):
         outbox = AsyncMock()
         service = _make_service(outbox=outbox)
         result = await service.create_convention(
+            id=SchemaIdentifier("test-schema"),
             title="No Source",
             version="1.0.0",
             schema=_make_field_defs(),
diff --git a/server/tests/unit/domain/deposition/test_deposition_service.py b/server/tests/unit/domain/deposition/test_deposition_service.py
index 69a5494..53aeba3 100644
--- a/server/tests/unit/domain/deposition/test_deposition_service.py
+++ b/server/tests/unit/domain/deposition/test_deposition_service.py
@@ -21,7 +21,7 @@
 from osa.domain.deposition.event.submitted import DepositionSubmittedEvent
 from osa.domain.deposition.service.deposition import DepositionService
 from osa.domain.shared.error import NotFoundError, ValidationError
-from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, DepositionSRN, Domain, SchemaId

 def _make_dep_srn(id: str = "test-dep") -> DepositionSRN:
@@ -32,8 +32,8 @@
 def _make_conv_srn(id: str = "test-conv", version: str = "1.0.0") -> ConventionSRN:
     return ConventionSRN.parse(f"urn:osa:localhost:conv:{id}@{version}")

-def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN:
-    return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}")
+def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId:
+    return SchemaId.parse(f"{id}@{version}")

 def _make_file_reqs(**overrides) -> FileRequirements:
@@ -51,7 +51,7 @@ def _make_convention(**overrides) -> Convention:
     defaults = dict(
         srn=_make_conv_srn(),
         title="Test Convention",
-        schema_srn=_make_schema_srn(),
+        schema_id=_make_schema_id(),
         file_requirements=_make_file_reqs(),
         created_at=datetime.now(UTC),
     )
diff --git a/server/tests/unit/domain/deposition/test_event_chain.py b/server/tests/unit/domain/deposition/test_event_chain.py
index 9cd952f..31ac8a1 100644
--- a/server/tests/unit/domain/deposition/test_event_chain.py
+++ b/server/tests/unit/domain/deposition/test_event_chain.py
@@ -26,7 +26,7 @@
     ConventionSRN,
     DepositionSRN,
     RecordSRN,
-    SchemaSRN,
+    SchemaId,
     ValidationRunSRN,
 )
 from osa.domain.validation.event.validation_completed import ValidationCompleted
@@ -34,8 +34,8 @@
 from osa.domain.validation.model import RunStatus

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_dep_srn() -> DepositionSRN:
@@ -268,7 +268,7 @@ async def test_delegates_to_feature_service(self):
             source=DepositionSource(id=str(_make_dep_srn())),
             metadata={"title": "Test"},
             convention_srn=_make_conv_srn(),
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             expected_features=["pocket_detect"],
         )
         await handler.handle(event)
diff --git a/server/tests/unit/domain/deposition/test_spreadsheet.py b/server/tests/unit/domain/deposition/test_spreadsheet.py
index 5a077de..30ace17 100644
--- a/server/tests/unit/domain/deposition/test_spreadsheet.py
+++ b/server/tests/unit/domain/deposition/test_spreadsheet.py
@@ -12,12 +12,12 @@
     FieldType,
     TermConstraints,
 )
-from osa.domain.shared.model.srn import OntologySRN, SchemaSRN
+from osa.domain.shared.model.srn import OntologySRN, SchemaId
 from osa.infrastructure.persistence.adapter.spreadsheet import OpenpyxlSpreadsheetAdapter

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_ontology_srn() -> OntologySRN:
@@ -49,7 +49,7 @@ def _make_term_field(

 def _make_schema(fields: list[FieldDefinition] | None = None) -> Schema:
     return Schema(
-        srn=_make_schema_srn(),
+        id=_make_schema_id(),
         title="Test Schema",
         fields=fields or [_make_text_field()],
         created_at=datetime.now(UTC),
diff --git a/server/tests/unit/domain/discovery/test_discovery_service.py b/server/tests/unit/domain/discovery/test_discovery_service.py
index 3f1cad4..8f9bad7 100644
--- a/server/tests/unit/domain/discovery/test_discovery_service.py
+++ b/server/tests/unit/domain/discovery/test_discovery_service.py
@@ -22,10 +22,10 @@
 from osa.domain.discovery.service.discovery import DiscoveryService
 from osa.domain.semantics.model.value import FieldType
 from osa.domain.shared.error import ValidationError
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import RecordSRN, SchemaId

-SCHEMA_SRN = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA_SRN = SchemaId.parse("bio-sample@1.0.0")

 def _config() -> Config:
@@ -84,7 +84,7 @@ async def test_rejects_unknown_filter_field(self, service: DiscoveryService) ->
         with pytest.raises(ValidationError, match="Unknown metadata field 'bogus'"):
             await service.search_records(
                 filter_expr=_eq("bogus", "x"),
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -101,7 +101,7 @@ async def test_rejects_invalid_operator_for_type(self, service: DiscoveryService
                 op=FilterOperator.CONTAINS,
                 value="x",
             ),
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -114,7 +114,7 @@
         with pytest.raises(ValidationError, match="Unknown sort field"):
             await service.search_records(
                 filter_expr=None,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="nonexistent",
@@ -126,7 +126,7 @@ async def test_rejects_unknown_sort_field(self, service: DiscoveryService) -> No
     async def test_accepts_published_at_sort(self, service: DiscoveryService) -> None:
         result = await service.search_records(
             filter_expr=None,
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -139,7 +139,7 @@ async def test_accepts_metadata_field_sort(self, service: DiscoveryService) -> None:
         result = await service.search_records(
             filter_expr=None,
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="resolution",
@@ -153,7 +153,7 @@ async def test_rejects_limit_too_low(self, service: DiscoveryService) -> None:
         with pytest.raises(ValidationError, match="limit"):
             await service.search_records(
                 filter_expr=None,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -166,7 +166,7 @@ async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None:
         with pytest.raises(ValidationError, match="limit"):
             await service.search_records(
                 filter_expr=None,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -188,7 +188,7 @@ async def test_rejects_q_when_no_text_fields(self, mock_read_store: AsyncMock) -
         with pytest.raises(ValidationError, match="Free-text search is unavailable"):
             await svc.search_records(
                 filter_expr=None,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q="kinase",
                 sort="published_at",
@@ -204,7 +204,7 @@ async def test_delegates_to_read_store(
     ) -> None:
         await service.search_records(
             filter_expr=_eq("method", "X-ray"),
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -226,7 +226,7 @@ async def test_decodes_cursor(
         cursor = encode_cursor("2026-01-01", "urn:osa:localhost:rec:abc@1")
         await service.search_records(
             filter_expr=None,
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -244,7 +244,7 @@ async def test_invalid_cursor_raises(self, service: DiscoveryService) -> None:
         with pytest.raises(ValidationError, match="cursor"):
             await service.search_records(
                 filter_expr=None,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -264,7 +264,7 @@ async def test_encodes_next_cursor_from_results(

         result = await service.search_records(
             filter_expr=None,
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -286,7 +286,7 @@ async def test_no_cursor_when_no_more_results(

         result = await service.search_records(
             filter_expr=None,
-            schema_srn=SCHEMA_SRN,
+            schema_id=SCHEMA_SRN,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -299,6 +299,69 @@
         assert result.has_more is False
+
+
+class TestSchemaRequiredGuards:
+    """With the JSONB filter fallback removed, any query that resolves against
+    metadata fields must pin a schema."""
+
+    async def test_metadata_predicate_without_schema_raises(
+        self, service: DiscoveryService
+    ) -> None:
+        with pytest.raises(ValidationError) as exc:
+            await service.search_records(
+                filter_expr=_eq("title", "x"),
+                schema_id=None,
+                convention_srn=None,
+                q=None,
+                sort="published_at",
+                order=SortOrder.DESC,
+                cursor=None,
+                limit=20,
+            )
+        assert exc.value.code == "schema_required_for_metadata_query"
+
+    async def test_non_default_sort_without_schema_raises(self, service: DiscoveryService) -> None:
+        with pytest.raises(ValidationError) as exc:
+            await service.search_records(
+                filter_expr=None,
+                schema_id=None,
+                convention_srn=None,
+                q=None,
+                sort="resolution",
+                order=SortOrder.DESC,
+                cursor=None,
+                limit=20,
+            )
+        assert exc.value.code == "schema_required_for_metadata_sort"
+
+    async def test_q_without_schema_raises(self, service: DiscoveryService) -> None:
+        with pytest.raises(ValidationError) as exc:
+            await service.search_records(
+                filter_expr=None,
+                schema_id=None,
+                convention_srn=None,
+                q="kinase",
+                sort="published_at",
+                order=SortOrder.DESC,
+                cursor=None,
+                limit=20,
+            )
+        assert exc.value.code == "schema_required_for_free_text_search"
+
+    async def test_plain_listing_without_schema_succeeds(self, service: DiscoveryService) -> None:
+        """No filter, default sort, no q → unscoped listing is allowed."""
+        result = await service.search_records(
+            filter_expr=None,
+            schema_id=None,
+            convention_srn=None,
+            q=None,
+            sort="published_at",
+            order=SortOrder.DESC,
+            cursor=None,
+            limit=20,
+        )
+        assert result.results == []
+
+
 class TestFilterBounds:
     async def test_depth_exceeded_raises(self, service: DiscoveryService) -> None:
         # Build a nest of AND that exceeds the default depth (10)
@@ -310,7 +373,7 @@ async def test_depth_exceeded_raises(self, service: DiscoveryService) -> None:
         with pytest.raises(ValidationError, match="filter_depth_exceeded|depth"):
             await service.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA_SRN,
+                schema_id=SCHEMA_SRN,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -343,7 +406,7 @@ async def test_cursor_encodes_row_id(
         result = await service.search_features(
             hook_name="detect_pockets",
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="score",
             order=SortOrder.DESC,
diff --git a/server/tests/unit/domain/discovery/test_search_features.py b/server/tests/unit/domain/discovery/test_search_features.py
index 7fa4fcb..b89494b 100644
--- a/server/tests/unit/domain/discovery/test_search_features.py
+++ b/server/tests/unit/domain/discovery/test_search_features.py
@@ -126,7 +126,7 @@ async def test_raises_not_found_for_unknown_hook(
             await service.search_features(
                 hook_name="unknown_hook",
                 filter_expr=None,
-                schema_srn=None,
+                schema_id=None,
                 record_srn=None,
                 sort="id",
                 order=SortOrder.DESC,
@@ -139,7 +139,7 @@ async def test_rejects_unknown_column(self, service: DiscoveryService) -> None:
             await service.search_features(
                 hook_name="detect_pockets",
                 filter_expr=_predicate("detect_pockets", "bogus", FilterOperator.EQ, 1),
-                schema_srn=None,
+                schema_id=None,
                 record_srn=None,
                 sort="id",
                 order=SortOrder.DESC,
@@ -152,7 +152,7 @@ async def test_validates_operator_for_number_column(self, service: DiscoveryServ
             await service.search_features(
                 hook_name="detect_pockets",
                 filter_expr=_predicate("detect_pockets", "score", FilterOperator.CONTAINS, "x"),
-                schema_srn=None,
+                schema_id=None,
                 record_srn=None,
                 sort="id",
                 order=SortOrder.DESC,
@@ -164,7 +164,7 @@ async def test_accepts_string_contains_operator(self, service: DiscoveryService)
         await service.search_features(
             hook_name="detect_pockets",
             filter_expr=_predicate("detect_pockets", "label", FilterOperator.CONTAINS, "test"),
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="id",
             order=SortOrder.DESC,
@@ -179,7 +179,7 @@ async def test_passes_record_srn_filter(
         await service.search_features(
             hook_name="detect_pockets",
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             record_srn=srn,
             sort="id",
             order=SortOrder.DESC,
@@ -196,7 +196,7 @@ async def test_delegates_to_read_store(
         await service.search_features(
             hook_name="detect_pockets",
             filter_expr=_predicate("detect_pockets", "score", FilterOperator.GTE, 6.0),
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="score",
             order=SortOrder.DESC,
@@ -222,7 +222,7 @@ async def test_has_more_false_when_exactly_limit_rows(
         result = await service.search_features(
             hook_name="detect_pockets",
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="score",
             order=SortOrder.DESC,
@@ -245,7 +245,7 @@ async def test_has_more_true_when_more_than_limit_rows(
         result = await service.search_features(
             hook_name="detect_pockets",
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="score",
             order=SortOrder.DESC,
@@ -263,7 +263,7 @@ async def test_passes_limit_plus_one_to_read_store(
         await service.search_features(
             hook_name="detect_pockets",
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             record_srn=None,
             sort="score",
             order=SortOrder.DESC,
diff --git a/server/tests/unit/domain/discovery/test_search_records.py b/server/tests/unit/domain/discovery/test_search_records.py
index a558d37..b9a7fbd 100644
--- a/server/tests/unit/domain/discovery/test_search_records.py
+++ b/server/tests/unit/domain/discovery/test_search_records.py
@@ -41,7 +41,7 @@ async def test_delegates_to_service(
         mock_service.search_records.assert_called_once_with(
             filter_expr=None,
-            schema_srn=None,
+            schema_id=None,
             convention_srn=None,
             q=None,
             sort="published_at",
diff --git a/server/tests/unit/domain/feature/test_create_feature_tables.py b/server/tests/unit/domain/feature/test_create_feature_tables.py
index 8abda9e..56466ea 100644
--- a/server/tests/unit/domain/feature/test_create_feature_tables.py
+++ b/server/tests/unit/domain/feature/test_create_feature_tables.py
@@ -15,15 +15,15 @@
     OciConfig,
     TableFeatureSpec,
 )
-from osa.domain.shared.model.srn import ConventionSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, SchemaId

 def _make_conv_srn() -> ConventionSRN:
     return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_hook_definition(name: str = "pocket_detect") -> HookDefinition:
@@ -44,7 +44,7 @@ def _make_event(hooks: list[HookDefinition] | None = None) -> ConventionRegister
     return ConventionRegistered(
         id=EventId(uuid4()),
         convention_srn=_make_conv_srn(),
-        schema_srn=_make_schema_srn(),
+        schema_id=_make_schema_id(),
         schema_fields=[],
         hooks=hooks or [],
     )
diff --git a/server/tests/unit/domain/feature/test_insert_record_features.py b/server/tests/unit/domain/feature/test_insert_record_features.py
index d6303e7..8de7db2 100644
--- a/server/tests/unit/domain/feature/test_insert_record_features.py
+++ b/server/tests/unit/domain/feature/test_insert_record_features.py
@@ -13,7 +13,7 @@
 from osa.domain.shared.model.srn import (
     ConventionSRN,
     RecordSRN,
-    SchemaSRN,
+    SchemaId,
 )

@@ -25,8 +25,8 @@
 def _make_conv_srn() -> ConventionSRN:
     return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_event(
@@ -38,7 +38,7 @@
         source=DepositionSource(id="urn:osa:localhost:dep:test-dep"),
         metadata={"title": "Test"},
         convention_srn=_make_conv_srn(),
-        schema_srn=_make_schema_srn(),
+        schema_id=_make_schema_id(),
         expected_features=expected_features or [],
     )

@@ -225,7 +225,7 @@ async def test_ingest_source_uses_source_fields(self):
             ),
             metadata={"title": "Ingested"},
             convention_srn=_make_conv_srn(),
-            schema_srn=_make_schema_srn(),
+            schema_id=_make_schema_id(),
             expected_features=["pocket_detect"],
         )
         await handler.handle(event)
diff --git a/server/tests/unit/domain/index/test_fanout_listener.py b/server/tests/unit/domain/index/test_fanout_listener.py
index 14ee97c..2aa0988 100644
--- a/server/tests/unit/domain/index/test_fanout_listener.py
+++ b/server/tests/unit/domain/index/test_fanout_listener.py
@@ -97,9 +97,9 @@ async def test_creates_index_record_per_backend(
             record_srn=sample_record_srn,
             source=DepositionSource(id=str(sample_deposition_srn)),
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=__import__(
-                "osa.domain.shared.model.srn", fromlist=["SchemaSRN"]
-            ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+            schema_id=__import__(
+                "osa.domain.shared.model.srn", fromlist=["SchemaId"]
+            ).SchemaId.parse("test@1.0.0"),
             metadata=sample_metadata,
         )

@@ -140,9 +140,9 @@ async def test_creates_unique_event_ids(
             record_srn=sample_record_srn,
             source=DepositionSource(id=str(sample_deposition_srn)),
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=__import__(
-                "osa.domain.shared.model.srn", fromlist=["SchemaSRN"]
-            ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+            schema_id=__import__(
+                "osa.domain.shared.model.srn", fromlist=["SchemaId"]
+            ).SchemaId.parse("test@1.0.0"),
             metadata=sample_metadata,
         )

@@ -172,9 +172,9 @@ async def test_empty_registry_creates_no_events(
             record_srn=sample_record_srn,
             source=DepositionSource(id=str(sample_deposition_srn)),
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=__import__(
-                "osa.domain.shared.model.srn", fromlist=["SchemaSRN"]
-            ).SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+            schema_id=__import__(
+                "osa.domain.shared.model.srn", fromlist=["SchemaId"]
+            ).SchemaId.parse("test@1.0.0"),
             metadata=sample_metadata,
         )
diff --git a/server/tests/unit/domain/record/test_get_record_handler.py b/server/tests/unit/domain/record/test_get_record_handler.py
index 10c1771..206618e 100644
--- a/server/tests/unit/domain/record/test_get_record_handler.py
+++ b/server/tests/unit/domain/record/test_get_record_handler.py
@@ -8,7 +8,7 @@
 from osa.domain.record.model.aggregate import Record
 from osa.domain.shared.error import NotFoundError
 from osa.domain.shared.model.source import DepositionSource
-from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId

 def _make_record_srn() -> RecordSRN:
@@ -24,7 +24,7 @@ def _make_record() -> Record:
         srn=_make_record_srn(),
         source=DepositionSource(id="urn:osa:localhost:dep:test-dep"),
         convention_srn=_make_conv_srn(),
-        schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+        schema_id=SchemaId.parse("test@1.0.0"),
         metadata={"title": "Test Protein"},
         published_at=datetime.now(UTC),
     )
diff --git a/server/tests/unit/domain/record/test_record_features.py b/server/tests/unit/domain/record/test_record_features.py
index b4227f5..4f229e5 100644
--- a/server/tests/unit/domain/record/test_record_features.py
+++ b/server/tests/unit/domain/record/test_record_features.py
@@ -9,7 +9,7 @@
 from osa.domain.record.query.get_record import GetRecord, GetRecordHandler, RecordDetail
 from osa.domain.record.service.record import RecordService
 from osa.domain.shared.model.source import DepositionSource
-from osa.domain.shared.model.srn import ConventionSRN, Domain, RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, Domain, RecordSRN, SchemaId
 from osa.infrastructure.persistence.adapter.feature_reader import PostgresFeatureReader

@@ -177,7 +177,7 @@ def _make_record() -> Record:
         srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"),
         source=DepositionSource(id="urn:osa:localhost:dep:dep1"),
         convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-        schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"),
+        schema_id=SchemaId.parse("test@1.0.0"),
         metadata={"title": "Test"},
         published_at=datetime.now(UTC),
     )
diff --git a/server/tests/unit/domain/record/test_record_published_enriched.py b/server/tests/unit/domain/record/test_record_published_enriched.py
index a463da8..670ed1d 100644
--- a/server/tests/unit/domain/record/test_record_published_enriched.py
+++ b/server/tests/unit/domain/record/test_record_published_enriched.py
@@ -8,10 +8,10 @@
 from osa.domain.record.event.record_published import RecordPublished
 from osa.domain.shared.event import EventId
 from osa.domain.shared.model.source import DepositionSource
-from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN
+from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId

-SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+SCHEMA = SchemaId.parse("test@1.0.0")

 class TestRecordPublishedEnriched:
@@ -23,7 +23,7 @@ def test_carries_source(self):
             source=source,
             metadata={"title": "Test"},
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             expected_features=["pocketeer"],
         )
         assert event.source.type == "deposition"
@@ -36,7 +36,7 @@ def test_carries_convention_srn(self):
             source=DepositionSource(id="urn:osa:localhost:dep:test"),
             metadata={"title": "Test"},
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             expected_features=[],
         )
         assert event.convention_srn == ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")
@@ -48,7 +48,7 @@ def test_carries_expected_features(self):
             source=DepositionSource(id="urn:osa:localhost:dep:test"),
             metadata={"title": "Test"},
             convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"),
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             expected_features=["pocketeer", "qc_check"],
         )
         assert event.expected_features == ["pocketeer", "qc_check"]
diff --git a/server/tests/unit/domain/record/test_record_service.py b/server/tests/unit/domain/record/test_record_service.py
index 83758aa..4158209 100644
--- a/server/tests/unit/domain/record/test_record_service.py
+++ b/server/tests/unit/domain/record/test_record_service.py
@@ -19,7 +19,7 @@
     DepositionSRN,
     Domain,
     LocalId,
-    SchemaSRN,
+    SchemaId,
 )
 from osa.domain.shared.outbox import Outbox

@@ -28,8 +28,8 @@
 def _make_conv_srn() -> ConventionSRN:
     return ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0")

-def _make_schema_srn() -> SchemaSRN:
-    return SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0")
+def _make_schema_id() -> SchemaId:
+    return SchemaId.parse("test@1.0.0")

 def _make_convention() -> Convention:
@@ -37,7 +37,7 @@
         srn=_make_conv_srn(),
         title="Test Convention",
         description=None,
-        schema_srn=_make_schema_srn(),
+        schema_id=_make_schema_id(),
         file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0),
         hooks=[],
         created_at=datetime.now(UTC),
@@ -113,7 +113,7 @@ async def test_publish_record_creates_record(
         assert record is not None
         assert record.source == sample_draft.source
         assert record.convention_srn == sample_draft.convention_srn
-        assert record.schema_srn == _make_schema_srn()
+        assert record.schema_id == _make_schema_id()
         assert record.metadata == sample_draft.metadata

         mock_record_repo.save.assert_called_once()
@@ -136,7 +136,7 @@ async def test_publish_record_emits_record_published_event(
         assert event.record_srn == record.srn
         assert event.source == sample_draft.source
         assert event.convention_srn == sample_draft.convention_srn
-        assert event.schema_srn == _make_schema_srn()
+        assert event.schema_id == _make_schema_id()
         assert event.expected_features == sample_draft.expected_features
         assert event.metadata == sample_draft.metadata
diff --git a/server/tests/unit/domain/semantics/test_schema.py b/server/tests/unit/domain/semantics/test_schema.py
index 0a245ab..c853f3d 100644
--- a/server/tests/unit/domain/semantics/test_schema.py
+++ b/server/tests/unit/domain/semantics/test_schema.py
@@ -14,11 +14,11 @@
     TextConstraints,
 )
 from osa.domain.shared.error import ValidationError
-from osa.domain.shared.model.srn import OntologySRN, SchemaSRN
+from osa.domain.shared.model.srn import OntologySRN, SchemaId

-def _make_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN:
-    return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}")
+def _make_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaId:
+    return SchemaId.parse(f"{id}@{version}")

 def _make_text_field(name: str = "title", required: bool = True) -> FieldDefinition:
@@ -33,7 +33,7 @@ class TestSchemaCreation:
     def test_create_with_single_field(self):
         schema = Schema(
-            srn=_make_srn(),
+            id=_make_srn(),
             title="Test Schema",
             fields=[_make_text_field()],
             created_at=datetime.now(UTC),
@@ -43,7 +43,7 @@ def test_create_with_multiple_fields(self):
         schema = Schema(
-            srn=_make_srn(),
+            id=_make_srn(),
             title="scRNA-seq",
             fields=[
                 _make_text_field("title"),
@@ -62,7 +62,7 @@ def test_create_with_ontology_reference(self):
         onto_srn = OntologySRN.parse("urn:osa:localhost:onto:sex@1.0.0")
         schema = Schema(
-            srn=_make_srn(),
+            id=_make_srn(),
             title="With Ontology",
             fields=[
                 FieldDefinition(
@@ -79,7 +79,7 @@ def test_create_with_text_constraints(self):
         schema = Schema(
-            srn=_make_srn(),
+            id=_make_srn(),
             title="Constrained",
             fields=[
                 FieldDefinition(
@@ -99,7 +99,7 @@ class TestSchemaInvariants:
     def test_rejects_empty_fields(self):
         with pytest.raises(ValidationError, match="at least one field"):
             Schema(
-                srn=_make_srn(),
+                id=_make_srn(),
                 title="Empty",
                 fields=[],
                 created_at=datetime.now(UTC),
@@ -108,7 +108,7 @@ def test_rejects_duplicate_field_names(self):
         with pytest.raises(ValidationError, match="Duplicate field names"):
             Schema(
-                srn=_make_srn(),
+                id=_make_srn(),
                 title="Duplicate",
                 fields=[
                     _make_text_field("title"),
diff --git a/server/tests/unit/domain/semantics/test_schema_service.py b/server/tests/unit/domain/semantics/test_schema_service.py
index 71fbaf4..a87d163 100644
--- a/server/tests/unit/domain/semantics/test_schema_service.py
+++ b/server/tests/unit/domain/semantics/test_schema_service.py
@@ -13,12 +13,12 @@
     TermConstraints,
 )
 from osa.domain.semantics.service.schema import SchemaService
-from osa.domain.shared.error import NotFoundError, ValidationError
-from osa.domain.shared.model.srn import Domain, OntologySRN, SchemaSRN
+from osa.domain.shared.error import ConflictError, NotFoundError, ValidationError
+from osa.domain.shared.model.srn import Domain, OntologySRN, SchemaId, SchemaIdentifier

-def _make_schema_srn(id: str = "test-schema", version: str = "1.0.0") -> SchemaSRN:
-    return SchemaSRN.parse(f"urn:osa:localhost:schema:{id}@{version}")
+def _make_schema_id(id: str = "test-schema", version: str = "1.0.0") -> SchemaId:
+    return SchemaId.parse(f"{id}@{version}")

 def _make_ontology_srn(id: str = "sex", version: str = "1.0.0") -> OntologySRN:
@@ -48,6 +48,7 @@ class TestSchemaServiceCreate:
     @pytest.mark.asyncio
     async def test_create_schema_without_ontology_refs(self):
         schema_repo = AsyncMock()
+        schema_repo.get.return_value = None
         ontology_repo = AsyncMock()

         service = SchemaService(
@@ -56,16 +57,19 @@ async def test_create_schema_without_ontology_refs(self):
             node_domain=Domain("localhost"),
         )
         result = await service.create_schema(
+            id=SchemaIdentifier("simple-schema"),
             title="Simple Schema",
             version="1.0.0",
             fields=[_make_text_field()],
         )
         assert result.title == "Simple Schema"
+        assert result.id.id.root == "simple-schema"
         schema_repo.save.assert_called_once()

     @pytest.mark.asyncio
     async def test_create_schema_with_valid_ontology_ref(self):
         schema_repo = AsyncMock()
+        schema_repo.get.return_value = None
         ontology_repo = AsyncMock()
         ontology_repo.exists.return_value = True

@@ -75,6 +79,7 @@ async def test_create_schema_with_valid_ontology_ref(self):
             node_domain=Domain("localhost"),
         )
         result = await service.create_schema(
+            id=SchemaIdentifier("with-ontology"),
             title="With Ontology",
             version="1.0.0",
             fields=[_make_text_field(), _make_term_field()],
@@ -85,6 +90,7 @@
     @pytest.mark.asyncio
     async def test_create_schema_rejects_invalid_ontology_ref(self):
         schema_repo = AsyncMock()
+        schema_repo.get.return_value = None
         ontology_repo = AsyncMock()
         ontology_repo.exists.return_value = False

@@ -95,14 +101,16 @@
         )
         with pytest.raises(ValidationError, match="Ontology.*not found"):
             await service.create_schema(
+                id=SchemaIdentifier("bad-ref"),
                 title="Bad Ref",
                 version="1.0.0",
                 fields=[_make_term_field()],
             )

     @pytest.mark.asyncio
-    async def test_create_schema_generates_srn(self):
+    async def test_create_schema_uses_supplied_id(self):
         schema_repo = AsyncMock()
+        schema_repo.get.return_value = None
         ontology_repo = AsyncMock()

         service = SchemaService(
@@ -111,19 +119,67 @@
             node_domain=Domain("localhost"),
         )
         result = await service.create_schema(
-            title="Test",
+            id=SchemaIdentifier("pdb-structure"),
+            title="PDB Structures",
             version="1.0.0",
             fields=[_make_text_field()],
         )
-        assert str(result.srn).startswith("urn:osa:localhost:schema:")
-        assert str(result.srn).endswith("@1.0.0")
+        assert str(result.id) == "pdb-structure@1.0.0"
+
+    @pytest.mark.asyncio
+    async def test_duplicate_id_version_raises_conflict(self):
+        schema_repo = AsyncMock()
+        existing_schema = Schema(
+            id=SchemaId.parse("dup@1.0.0"),
+            title="Existing",
+            fields=[_make_text_field()],
+            created_at=datetime.now(UTC),
+        )
+        schema_repo.get.return_value = existing_schema
+        ontology_repo = AsyncMock()
+
+        service = SchemaService(
+            schema_repo=schema_repo,
+            ontology_repo=ontology_repo,
+            node_domain=Domain("localhost"),
+        )
+        with pytest.raises(ConflictError) as exc:
+            await service.create_schema(
+                id=SchemaIdentifier("dup"),
+                title="Dup",
+                version="1.0.0",
+                fields=[_make_text_field()],
+            )
+        assert exc.value.code == "schema_already_exists"
+        schema_repo.save.assert_not_called()
+
+
+class TestSchemaIdentifierValidation:
+    def test_rejects_leading_digit(self):
+        with pytest.raises(ValueError, match="invalid schema id"):
+            SchemaIdentifier("3d-scan")
+
+    def test_rejects_uppercase(self):
+        with pytest.raises(ValueError, match="invalid schema id"):
+            SchemaIdentifier("PDBStructure")
+
+    def test_rejects_too_short(self):
+        with pytest.raises(ValueError, match="invalid schema id"):
+            SchemaIdentifier("ab")
+
+    def test_rejects_underscore(self):
+        with pytest.raises(ValueError, match="invalid schema id"):
+            SchemaIdentifier("pdb_structure")
+
+    def test_accepts_hyphens_and_digits(self):
+        assert SchemaIdentifier("pdb-v2").root == "pdb-v2"

 class TestSchemaServiceGet:
     @pytest.mark.asyncio
     async def test_get_existing(self):
         schema = Schema(
-            srn=_make_schema_srn(),
+            id=_make_schema_id(),
             title="Test",
             fields=[_make_text_field()],
             created_at=datetime.now(UTC),
@@ -137,7 +193,7 @@ async def test_get_existing(self):
             ontology_repo=ontology_repo,
             node_domain=Domain("localhost"),
         )
-        result = await service.get_schema(schema.srn)
+        result = await service.get_schema(schema.id)
         assert result == schema

     @pytest.mark.asyncio
@@ -152,14 +208,14 @@ async def test_get_nonexistent_raises(self):
             node_domain=Domain("localhost"),
         )
         with pytest.raises(NotFoundError):
-            await service.get_schema(_make_schema_srn())
+            await service.get_schema(_make_schema_id())

 class TestSchemaServiceList:
     @pytest.mark.asyncio
     async def test_list_schemas(self):
         schema = Schema(
-            srn=_make_schema_srn(),
+            id=_make_schema_id(),
             title="Test",
             fields=[_make_text_field()],
             created_at=datetime.now(UTC),
diff --git a/server/tests/unit/domain/shared/test_srn.py b/server/tests/unit/domain/shared/test_srn.py
index 669c5ed..df47011 100644
--- a/server/tests/unit/domain/shared/test_srn.py
+++ b/server/tests/unit/domain/shared/test_srn.py
@@ -3,7 +3,7 @@
     SRN,
     RecordSRN,
     DepositionSRN,
-    SchemaSRN,
+    SchemaId,
     ResourceType,
 )

@@ -17,12 +17,12 @@ def test_parse_record_srn(self):
         assert srn.version is not None
         assert srn.version.root == 1

-    def test_parse_schema_srn(self):
-        raw = "urn:osa:node-1:schema:my-schema@1.0.0"
-        srn = SchemaSRN.parse(raw)
-        assert srn.type == ResourceType.schema
-        assert srn.id.root == "my-schema"
-        assert str(srn.version) == "1.0.0"
+    def test_parse_schema_id(self):
+        sid = SchemaId.parse("my-schema@1.0.0")
+        assert sid.id.root == "my-schema"
+        assert str(sid.version) == "1.0.0"
+        assert sid.major == 1
+        assert sid.render() == "my-schema@1.0.0"

     def test_render_srn(self):
         srn = DepositionSRN.parse("urn:osa:node-1:dep:abc-123")
diff --git a/server/tests/unit/test_filter_expr_and_compile.py b/server/tests/unit/test_filter_expr_and_compile.py
index 7e5b301..41a2ba9 100644
--- a/server/tests/unit/test_filter_expr_and_compile.py
+++ b/server/tests/unit/test_filter_expr_and_compile.py
@@ -15,10 +15,10 @@
 from osa.domain.discovery.service.discovery import DiscoveryService
 from osa.domain.semantics.model.value import FieldType
 from osa.domain.shared.error import ValidationError
-from osa.domain.shared.model.srn import SchemaSRN
+from osa.domain.shared.model.srn import SchemaId

-SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA = SchemaId.parse("bio-sample@1.0.0")

 def _config(overrides: dict | None = None) -> Config:
@@ -78,7 +78,7 @@ async def test_accepts_and_of_predicates(self) -> None:
         )
         await svc.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -99,7 +99,7 @@ async def test_depth_exceeded(self) -> None:
         with pytest.raises(ValidationError, match="depth"):
             await svc.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA,
+                schema_id=SCHEMA,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -120,7 +120,7 @@ async def test_predicates_exceeded(self) -> None:
         with pytest.raises(ValidationError, match="predicate leaves"):
             await svc.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA,
+                schema_id=SCHEMA,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -171,7 +171,7 @@ async def test_joins_exceeded(self) -> None:
         with pytest.raises(ValidationError, match="distinct feature hooks"):
             await svc.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA,
+                schema_id=SCHEMA,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
@@ -187,7 +187,7 @@ async def test_unknown_metadata_field_rejected(self) -> None:
         with pytest.raises(ValidationError, match="Unknown metadata field"):
             await svc.search_records(
                 filter_expr=_pred("bogus", FilterOperator.EQ, "x"),
-                schema_srn=SCHEMA,
+                schema_id=SCHEMA,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
diff --git a/server/tests/unit/test_filter_expr_or_not.py b/server/tests/unit/test_filter_expr_or_not.py
index 040dbb8..191bc74 100644
--- a/server/tests/unit/test_filter_expr_or_not.py
+++ b/server/tests/unit/test_filter_expr_or_not.py
@@ -17,10 +17,10 @@
 from osa.domain.discovery.service.discovery import DiscoveryService
 from osa.domain.semantics.model.value import FieldType
 from osa.domain.shared.error import ValidationError
-from osa.domain.shared.model.srn import SchemaSRN
+from osa.domain.shared.model.srn import SchemaId

-SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0")
+SCHEMA = SchemaId.parse("bio-sample@1.0.0")

 def _config() -> Config:
@@ -59,7 +59,7 @@ async def test_or_tree_accepted(self):
         )
         await svc.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -73,7 +73,7 @@ async def test_not_tree_accepted(self):
         tree = Not(operand=_pred("title", FilterOperator.EQ, "X"))
         await svc.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -98,7 +98,7 @@ async def test_nested_mixed_tree(self):
         )
         await svc.search_records(
             filter_expr=tree,
-            schema_srn=SCHEMA,
+            schema_id=SCHEMA,
             convention_srn=None,
             q=None,
             sort="published_at",
@@ -120,7 +120,7 @@ async def test_or_rejected_when_compound_disabled(self):
         with pytest.raises(ValidationError, match="compound_disabled|Compound"):
             await svc.search_records(
                 filter_expr=tree,
-                schema_srn=SCHEMA,
+                schema_id=SCHEMA,
                 convention_srn=None,
                 q=None,
                 sort="published_at",
diff --git a/server/tests/unit/test_metadata_service.py b/server/tests/unit/test_metadata_service.py
index 5ef2401..ebc0ed7 100644
--- a/server/tests/unit/test_metadata_service.py
+++ b/server/tests/unit/test_metadata_service.py
@@ -4,9 +4,9 @@
 from osa.domain.metadata.service.metadata import MetadataService
 from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType
-from osa.domain.shared.model.srn import RecordSRN, SchemaSRN
+from
osa.domain.shared.model.srn import RecordSRN, SchemaId -SCHEMA = SchemaSRN.parse("urn:osa:localhost:schema:bio-sample@1.0.0") +SCHEMA = SchemaId.parse("bio-sample@1.0.0") RECORD = RecordSRN.parse("urn:osa:localhost:rec:abc@1") @@ -25,14 +25,14 @@ class TestMetadataService: async def test_ensure_table_delegates(self): store = AsyncMock() svc = MetadataService(metadata_store=store) - await svc.ensure_table(schema_srn=SCHEMA, schema_title="bio_sample", fields=_fields()) + await svc.ensure_table(schema_id=SCHEMA, fields=_fields()) store.ensure_table.assert_called_once() async def test_insert_delegates(self): store = AsyncMock() svc = MetadataService(metadata_store=store) await svc.insert( - schema_srn=SCHEMA, + schema_id=SCHEMA, record_srn=RECORD, values={"species": "Homo sapiens"}, ) diff --git a/server/tests/unit/test_record_schema_srn_immutable.py b/server/tests/unit/test_record_schema_srn_immutable.py index be8bbda..13e5c7b 100644 --- a/server/tests/unit/test_record_schema_srn_immutable.py +++ b/server/tests/unit/test_record_schema_srn_immutable.py @@ -1,4 +1,4 @@ -"""FR-008: Record.schema_srn is immutable after construction.""" +"""FR-008: Record.schema_id is immutable after construction.""" from datetime import UTC, datetime @@ -7,7 +7,7 @@ from osa.domain.record.model.aggregate import Record from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaSRN +from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId def _make_record() -> Record: @@ -15,14 +15,14 @@ def _make_record() -> Record: srn=RecordSRN.parse("urn:osa:localhost:rec:abc@1"), source=DepositionSource(id="urn:osa:localhost:dep:d1"), convention_srn=ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0"), - schema_srn=SchemaSRN.parse("urn:osa:localhost:schema:test@1.0.0"), + schema_id=SchemaId.parse("test@1.0.0"), metadata={"title": "T"}, published_at=datetime.now(UTC), ) -def test_schema_srn_cannot_be_reassigned(): +def test_schema_id_cannot_be_reassigned(): record = _make_record() - other = SchemaSRN.parse("urn:osa:localhost:schema:other@1.0.0") + other = SchemaId.parse("other@1.0.0") with pytest.raises(ValidationError): - record.schema_srn = other # type: ignore[misc] + record.schema_id = other # type: ignore[misc] From 70416827e58fd99fa3b22f302a09cf12d476659b Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Wed, 22 Apr 2026 12:56:39 +0100 Subject: [PATCH 3/9] refactor: replace async metadata projection with synchronous dual-write Move metadata table creation and row insertion from event handlers to inline operations within ConventionService and RecordService to eliminate race conditions and ensure atomicity between canonical records and typed metadata tables. 
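In sketch form (simplified, illustrative names; the real code paths are in
the diffs below), both writes now happen on the same session, so they commit
or roll back together:

    # Synchronous dual-write inside publish_record (sketch; build_record is
    # a stand-in for the draft-to-aggregate construction in RecordService):
    record = build_record(draft)            # canonical aggregate
    await self.record_repo.save(record)     # row in "records"
    await self.metadata_service.insert(     # row in metadata.<schema>_v<major>
        schema_id=record.schema_id,
        record_srn=record.srn,
        values=record.metadata,
    )
    # One unit of work: if either write fails, neither row lands, so the
    # typed table cannot drift from the canonical records table.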
--- .../domain/deposition/service/convention.py | 10 + .../osa/domain/deposition/util/di/provider.py | 3 + .../metadata/handler/ensure_metadata_table.py | 43 ---- .../metadata/handler/insert_batch_metadata.py | 52 ----- .../handler/insert_record_metadata.py | 29 --- .../domain/metadata/port/metadata_store.py | 12 + .../osa/domain/metadata/service/metadata.py | 7 + server/osa/domain/record/service/record.py | 22 ++ server/osa/infrastructure/event/di.py | 9 +- server/osa/infrastructure/persistence/di.py | 3 + .../persistence/metadata_store.py | 42 +++- .../persistence/test_metadata_store.py | 94 ++++++++ .../test_bulk_publish_dual_write.py | 206 ++++++++++++++++++ .../integration/test_ensure_metadata_table.py | 169 -------------- .../test_insert_record_metadata.py | 114 ---------- .../deposition/test_convention_service.py | 1 + .../deposition/test_convention_service_v2.py | 1 + .../domain/record/test_record_features.py | 1 + .../unit/domain/record/test_record_service.py | 1 + 19 files changed, 395 insertions(+), 424 deletions(-) delete mode 100644 server/osa/domain/metadata/handler/ensure_metadata_table.py delete mode 100644 server/osa/domain/metadata/handler/insert_batch_metadata.py delete mode 100644 server/osa/domain/metadata/handler/insert_record_metadata.py create mode 100644 server/tests/integration/test_bulk_publish_dual_write.py delete mode 100644 server/tests/integration/test_ensure_metadata_table.py delete mode 100644 server/tests/integration/test_insert_record_metadata.py diff --git a/server/osa/domain/deposition/service/convention.py b/server/osa/domain/deposition/service/convention.py index b01f23a..366ffbe 100644 --- a/server/osa/domain/deposition/service/convention.py +++ b/server/osa/domain/deposition/service/convention.py @@ -5,6 +5,7 @@ from osa.domain.deposition.model.convention import Convention from osa.domain.deposition.model.value import FileRequirements from osa.domain.deposition.port.convention_repository import ConventionRepository +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.semantics.model.value import FieldDefinition from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.error import NotFoundError @@ -25,6 +26,7 @@ class ConventionService(Service): convention_repo: ConventionRepository schema_service: SchemaService + metadata_service: MetadataService outbox: Outbox node_domain: Domain @@ -55,6 +57,14 @@ async def create_convention( fields=schema, ) + # Create (or additively evolve) the typed metadata table in the same + # transaction — no async window where records can publish against a + # convention whose typed table doesn't exist yet. 
+ await self.metadata_service.ensure_table( + schema_id=created_schema.id, + fields=created_schema.fields, + ) + srn = ConventionSRN( domain=self.node_domain, id=LocalId(str(uuid4())[:20]), diff --git a/server/osa/domain/deposition/util/di/provider.py b/server/osa/domain/deposition/util/di/provider.py index 66ea290..a38e7a5 100644 --- a/server/osa/domain/deposition/util/di/provider.py +++ b/server/osa/domain/deposition/util/di/provider.py @@ -20,6 +20,7 @@ from osa.domain.deposition.query.list_depositions import ListDepositionsHandler from osa.domain.deposition.service.convention import ConventionService from osa.domain.deposition.service.deposition import DepositionService +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.semantics.service.schema import SchemaService from osa.domain.shared.model.srn import Domain from osa.domain.shared.outbox import Outbox @@ -51,12 +52,14 @@ def get_convention_service( self, convention_repo: ConventionRepository, schema_service: SchemaService, + metadata_service: MetadataService, outbox: Outbox, config: Config, ) -> ConventionService: return ConventionService( convention_repo=convention_repo, schema_service=schema_service, + metadata_service=metadata_service, outbox=outbox, node_domain=Domain(config.domain), ) diff --git a/server/osa/domain/metadata/handler/ensure_metadata_table.py b/server/osa/domain/metadata/handler/ensure_metadata_table.py deleted file mode 100644 index 1a35eb1..0000000 --- a/server/osa/domain/metadata/handler/ensure_metadata_table.py +++ /dev/null @@ -1,43 +0,0 @@ -"""EnsureMetadataTable — creates/evolves the typed metadata table on ConventionRegistered.""" - -from __future__ import annotations - -import logging - -from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.deposition.port.convention_repository import ConventionRepository -from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.shared.error import DomainError, NotFoundError -from osa.domain.shared.event import EventHandler - -logger = logging.getLogger(__name__) - - -class EnsureMetadataTable(EventHandler[ConventionRegistered]): - """Reacts to ConventionRegistered, creates/evolves the schema's metadata table. - - Idempotent and schema-keyed: two conventions against the same - ``(schema_identity, schema_major)`` share one table. Additive minor/patch - bumps trigger ALTER ADD COLUMN. - """ - - metadata_service: MetadataService - convention_repo: ConventionRepository - - async def handle(self, event: ConventionRegistered) -> None: - convention = await self.convention_repo.get(event.convention_srn) - if convention is None: - raise NotFoundError(f"Convention not found: {event.convention_srn}") - - try: - await self.metadata_service.ensure_table( - schema_id=event.schema_id, - fields=event.schema_fields, - ) - except DomainError: - logger.exception( - "EnsureMetadataTable failed: convention=%s schema=%s", - event.convention_srn, - event.schema_id, - ) - raise diff --git a/server/osa/domain/metadata/handler/insert_batch_metadata.py b/server/osa/domain/metadata/handler/insert_batch_metadata.py deleted file mode 100644 index e0f1450..0000000 --- a/server/osa/domain/metadata/handler/insert_batch_metadata.py +++ /dev/null @@ -1,52 +0,0 @@ -"""InsertBatchMetadata — bulk metadata projection for ingest batches. 
- -Mirrors :class:`InsertBatchFeatures` — listens to ``IngestBatchPublished`` -rather than per-record ``RecordPublished``, because the bulk ingest pipeline -emits one batch-level event instead of N per-record ones (AD-3). -""" - -from __future__ import annotations - -from osa.domain.ingest.event.events import IngestBatchPublished -from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.record.port.repository import RecordRepository -from osa.domain.shared.event import EventHandler -from osa.domain.shared.model.srn import RecordSRN -from osa.infrastructure.logging import get_logger - -log = get_logger(__name__) - - -class InsertBatchMetadata(EventHandler[IngestBatchPublished]): - """Project each newly-published record's metadata into its typed table.""" - - metadata_service: MetadataService - record_repo: RecordRepository - - async def handle(self, event: IngestBatchPublished) -> None: - if not event.published_srns: - return - - inserted = 0 - for srn_str in event.published_srns: - srn = RecordSRN.parse(srn_str) - record = await self.record_repo.get(srn) - if record is None: - # Record was published in this batch but we can't find it — - # would indicate the same UOW is reading stale state. Skip. - continue - await self.metadata_service.insert( - schema_id=record.schema_id, - record_srn=record.srn, - values=record.metadata, - ) - inserted += 1 - - short_id = event.ingest_run_id[:8] - log.info( - "[{short_id}] batch {batch_index}: inserted {inserted} metadata rows", - short_id=short_id, - batch_index=event.batch_index, - inserted=inserted, - ingest_run_id=event.ingest_run_id, - ) diff --git a/server/osa/domain/metadata/handler/insert_record_metadata.py b/server/osa/domain/metadata/handler/insert_record_metadata.py deleted file mode 100644 index a7d09d3..0000000 --- a/server/osa/domain/metadata/handler/insert_record_metadata.py +++ /dev/null @@ -1,29 +0,0 @@ -"""InsertRecordMetadata — writes a record's typed metadata row on RecordPublished.""" - -from __future__ import annotations - -import logging - -from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.record.event.record_published import RecordPublished -from osa.domain.shared.event import EventHandler - -logger = logging.getLogger(__name__) - - -class InsertRecordMetadata(EventHandler[RecordPublished]): - """Reacts to RecordPublished, inserts a typed metadata row for the record.""" - - metadata_service: MetadataService - - async def handle(self, event: RecordPublished) -> None: - await self.metadata_service.insert( - schema_id=event.schema_id, - record_srn=event.record_srn, - values=event.metadata, - ) - logger.debug( - "Inserted metadata row: record=%s schema=%s", - event.record_srn, - event.schema_id, - ) diff --git a/server/osa/domain/metadata/port/metadata_store.py b/server/osa/domain/metadata/port/metadata_store.py index 3661e76..cec1d74 100644 --- a/server/osa/domain/metadata/port/metadata_store.py +++ b/server/osa/domain/metadata/port/metadata_store.py @@ -41,3 +41,15 @@ async def insert( ) -> None: """Upsert a record's typed metadata row into the schema's table.""" ... + + async def insert_many( + self, + schema_id: "SchemaId", + rows: "list[tuple[RecordSRN, dict[str, Any]]]", + ) -> None: + """Bulk upsert typed metadata rows — one multi-row SQL statement. + + All rows must belong to the same schema; callers group by schema_id + before calling. Empty ``rows`` is a no-op. + """ + ... 
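A caller-side sketch of the insert_many contract (hypothetical helper; the
RecordService change below performs the same grouping inline):

    from collections import defaultdict

    async def project_metadata(records, metadata_service):
        # Each schema has its own typed table and insert_many requires a
        # single-schema batch, so group by the rendered schema id first.
        grouped = defaultdict(list)
        schema_ids = {}
        for r in records:
            key = r.schema_id.render()
            schema_ids[key] = r.schema_id
            grouped[key].append((r.srn, r.metadata))
        for key, rows in grouped.items():
            await metadata_service.insert_many(schema_ids[key], rows)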
diff --git a/server/osa/domain/metadata/service/metadata.py b/server/osa/domain/metadata/service/metadata.py index b2b7e07..8bff8fa 100644 --- a/server/osa/domain/metadata/service/metadata.py +++ b/server/osa/domain/metadata/service/metadata.py @@ -29,3 +29,10 @@ async def insert( values: dict[str, Any], ) -> None: await self.metadata_store.insert(schema_id, record_srn, values) + + async def insert_many( + self, + schema_id: SchemaId, + rows: list[tuple[RecordSRN, dict[str, Any]]], + ) -> None: + await self.metadata_store.insert_many(schema_id, rows) diff --git a/server/osa/domain/record/service/record.py b/server/osa/domain/record/service/record.py index 3e130b1..e84512b 100644 --- a/server/osa/domain/record/service/record.py +++ b/server/osa/domain/record/service/record.py @@ -8,6 +8,7 @@ from uuid import uuid4 from osa.domain.deposition.port.convention_repository import ConventionRepository +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.record.event.record_published import RecordPublished from osa.domain.record.model.aggregate import Record from osa.domain.record.model.draft import RecordDraft @@ -36,6 +37,7 @@ class RecordService(Service): record_repo: RecordRepository convention_repo: ConventionRepository + metadata_service: MetadataService outbox: Outbox node_domain: Domain feature_reader: FeatureReader @@ -96,6 +98,19 @@ async def bulk_publish(self, drafts: list[RecordDraft]) -> list[Record]: ) published = await self.record_repo.save_many(records) + + # Dual-write typed metadata projection in the same transaction. + # Group by schema_id — each schema has its own typed table. Use the + # rendered string as the dict key because SchemaId holds unhashable + # RootModel fields (LocalId, Semver). + by_schema: dict[str, tuple[SchemaId, list[tuple[RecordSRN, dict[str, Any]]]]] = {} + for r in published: + key = r.schema_id.render() + entry = by_schema.setdefault(key, (r.schema_id, [])) + entry[1].append((r.srn, r.metadata)) + for schema_id, typed_rows in by_schema.values(): + await self.metadata_service.insert_many(schema_id, typed_rows) + return published async def publish_record(self, draft: RecordDraft) -> Record: @@ -122,6 +137,13 @@ async def publish_record(self, draft: RecordDraft) -> Record: await self.record_repo.save(record) logger.info(f"Record persisted: {record_srn}") + # Dual-write typed metadata projection in the same transaction. 
+ await self.metadata_service.insert( + schema_id=schema_id, + record_srn=record_srn, + values=draft.metadata, + ) + published = RecordPublished( id=EventId(uuid4()), record_srn=record_srn, diff --git a/server/osa/infrastructure/event/di.py b/server/osa/infrastructure/event/di.py index 682da77..3b05d40 100644 --- a/server/osa/infrastructure/event/di.py +++ b/server/osa/infrastructure/event/di.py @@ -14,9 +14,6 @@ InsertRecordFeatures, ) from osa.domain.ingest.handler import PublishBatch, RunHooks, RunIngester -from osa.domain.metadata.handler.ensure_metadata_table import EnsureMetadataTable -from osa.domain.metadata.handler.insert_batch_metadata import InsertBatchMetadata -from osa.domain.metadata.handler.insert_record_metadata import InsertRecordMetadata from osa.domain.record.handler import ConvertDepositionToRecord from osa.domain.shared.event import EventHandler from osa.domain.shared.event_log import EventLog @@ -40,10 +37,8 @@ CreateFeatureTables, InsertRecordFeatures, InsertBatchFeatures, - # Metadata handlers (feature 076) - EnsureMetadataTable, - InsertRecordMetadata, - InsertBatchMetadata, + # Metadata projection is now synchronous (dual-write inside RecordService / + # ConventionService) — no event handlers required for it. # Ingest handlers RunIngester, RunHooks, diff --git a/server/osa/infrastructure/persistence/di.py b/server/osa/infrastructure/persistence/di.py index 75908e4..12873e2 100644 --- a/server/osa/infrastructure/persistence/di.py +++ b/server/osa/infrastructure/persistence/di.py @@ -12,6 +12,7 @@ from osa.domain.deposition.port.repository import DepositionRepository from osa.domain.deposition.port.schema_reader import SchemaReader from osa.domain.deposition.port.storage import FileStoragePort +from osa.domain.metadata.service.metadata import MetadataService from osa.domain.record.port.feature_reader import FeatureReader from osa.domain.record.port.repository import RecordRepository from osa.domain.record.query.get_record import GetRecordHandler @@ -154,6 +155,7 @@ def get_record_service( self, record_repo: RecordRepository, convention_repo: ConventionRepository, + metadata_service: MetadataService, outbox: Outbox, config: Config, feature_reader: FeatureReader, @@ -165,6 +167,7 @@ def get_record_service( return RecordService( record_repo=record_repo, convention_repo=convention_repo, + metadata_service=metadata_service, outbox=outbox, node_domain=Domain(config.domain), feature_reader=feature_reader, diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py index 68be0ee..109b365 100644 --- a/server/osa/infrastructure/persistence/metadata_store.py +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -162,6 +162,16 @@ async def insert( record_srn: RecordSRN, values: dict[str, Any], ) -> None: + await self.insert_many(schema_id, [(record_srn, values)]) + + async def insert_many( + self, + schema_id: SchemaId, + rows: list[tuple[RecordSRN, dict[str, Any]]], + ) -> None: + if not rows: + return + id_str = schema_id.id.root major = schema_id.major @@ -191,16 +201,28 @@ async def insert( table = build_metadata_table(pg_table, schema) col_by_name = {c.name: c for c in schema.columns} - payload: dict[str, Any] = {} - for k, v in values.items(): - col = col_by_name.get(k) - if col is None: - continue - payload[k] = _coerce_value(col, v) - payload["record_srn"] = str(record_srn) - - stmt = insert(table).values(**payload) - update_cols = {c: stmt.excluded[c] for c in payload.keys() if c != 
"record_srn"} + known_names = set(col_by_name.keys()) + + payloads: list[dict[str, Any]] = [] + for record_srn, values in rows: + payload: dict[str, Any] = {} + for k, v in values.items(): + col = col_by_name.get(k) + if col is None: + continue + payload[k] = _coerce_value(col, v) + payload["record_srn"] = str(record_srn) + payloads.append(payload) + + # Uniform column set across all rows — asyncpg multi-row insert requires it. + # Fill missing columns with None so every payload has the same keys. + all_keys: set[str] = {"record_srn"} | known_names + for p in payloads: + for k in all_keys: + p.setdefault(k, None) + + stmt = insert(table).values(payloads) + update_cols = {c: stmt.excluded[c] for c in all_keys if c != "record_srn"} if update_cols: stmt = stmt.on_conflict_do_update( index_elements=[table.c.record_srn], diff --git a/server/tests/integration/persistence/test_metadata_store.py b/server/tests/integration/persistence/test_metadata_store.py index 42bb71f..8f07e10 100644 --- a/server/tests/integration/persistence/test_metadata_store.py +++ b/server/tests/integration/persistence/test_metadata_store.py @@ -219,6 +219,100 @@ async def test_insert_is_idempotent_on_duplicate_delivery( ).scalar() assert count == 1 + async def test_insert_many_bulk_upserts_rows( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + + record_srns = [RecordSRN.parse(f"urn:osa:localhost:rec:bulk-{i}@1") for i in range(5)] + for srn in record_srns: + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + + rows = [ + (srn, {"species": f"species-{i}", "resolution": float(i)}) + for i, srn in enumerate(record_srns) + ] + await store.insert_many(SCHEMA_V1, rows) + await pg_session.commit() + + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') + ) + ).scalar() + assert count == 5 + + async def test_insert_many_empty_rows_noop( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(SCHEMA_V1, _fields_v1()) + # Must not raise, must not hit the DB + await store.insert_many(SCHEMA_V1, []) + + async def test_insert_many_coerces_dates_per_row( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """Every row in a batch gets type coercion applied independently.""" + from datetime import date + + fields = [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="collected_on", + type=FieldType.DATE, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + dated_schema = SchemaId.parse("dated-bulk@1.0.0") + store = PostgresMetadataStore(pg_engine, pg_session) + await store.ensure_table(dated_schema, fields) + + srns = [RecordSRN.parse(f"urn:osa:localhost:rec:dated-bulk-{i}@1") for i in range(3)] + for srn in srns: + await seed_record( + pg_engine, + srn=str(srn), + schema_id=dated_schema.id.root, + schema_version=dated_schema.version.root, + ) + + rows = [ + (srns[0], {"species": "A", "collected_on": "2026-01-01"}), + (srns[1], {"species": "B", "collected_on": "2026-02-02"}), + (srns[2], {"species": "C", "collected_on": "2026-03-03"}), + ] + await store.insert_many(dated_schema, rows) + await pg_session.commit() + + async with pg_engine.begin() as 
conn: + result = ( + await conn.execute( + text( + f"SELECT species, collected_on " + f'FROM "{METADATA_SCHEMA}"."dated_bulk_v1" ORDER BY species' + ) + ) + ).all() + assert [(r[0], r[1]) for r in result] == [ + ("A", date(2026, 1, 1)), + ("B", date(2026, 2, 2)), + ("C", date(2026, 3, 3)), + ] + async def test_insert_coerces_iso_date_string_to_date_column( self, pg_engine: AsyncEngine, pg_session: AsyncSession ): diff --git a/server/tests/integration/test_bulk_publish_dual_write.py b/server/tests/integration/test_bulk_publish_dual_write.py new file mode 100644 index 0000000..ab11f35 --- /dev/null +++ b/server/tests/integration/test_bulk_publish_dual_write.py @@ -0,0 +1,206 @@ +"""Integration tests for dual-write of records + typed metadata. + +``RecordService.bulk_publish`` and ``RecordService.publish_record`` now write +both the canonical ``records`` row and the typed ``metadata._v`` +row atomically in one transaction. These tests verify: + +- Both rows land on a successful publish. +- A malformed metadata value rolls back the whole transaction — no partial + state where ``records`` has a row but the typed table doesn't. +- ``ConventionService.create_convention`` creates the typed table inline + (no event-handler race window). +""" + +from __future__ import annotations + +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession + +from osa.domain.deposition.model.value import FileRequirements +from osa.domain.deposition.service.convention import ConventionService +from osa.domain.metadata.service.metadata import MetadataService +from osa.domain.record.model.draft import RecordDraft +from osa.domain.record.service import RecordService +from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType +from osa.domain.semantics.service.schema import SchemaService +from osa.domain.shared.model.source import DepositionSource +from osa.domain.shared.model.srn import ConventionSRN, Domain, SchemaIdentifier +from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore +from osa.infrastructure.persistence.repository.convention import PostgresConventionRepository +from osa.infrastructure.persistence.repository.ontology import PostgresOntologyRepository +from osa.infrastructure.persistence.repository.record import PostgresRecordRepository +from osa.infrastructure.persistence.repository.schema import PostgresSemanticsSchemaRepository + + +def _fields() -> list[FieldDefinition]: + return [ + FieldDefinition( + name="species", + type=FieldType.TEXT, + required=True, + cardinality=Cardinality.EXACTLY_ONE, + ), + FieldDefinition( + name="resolution", + type=FieldType.NUMBER, + required=False, + cardinality=Cardinality.EXACTLY_ONE, + ), + ] + + +async def _register_convention( + pg_engine: AsyncEngine, + pg_session: AsyncSession, + slug: str = "dual-write-sample", +) -> ConventionService: + metadata_store = PostgresMetadataStore(pg_engine, pg_session) + metadata_service = MetadataService(metadata_store=metadata_store) + schema_service = SchemaService( + schema_repo=PostgresSemanticsSchemaRepository(pg_session), + ontology_repo=PostgresOntologyRepository(pg_session), + node_domain=Domain("localhost"), + ) + convention_service = ConventionService( + convention_repo=PostgresConventionRepository(pg_session), + schema_service=schema_service, + metadata_service=metadata_service, + outbox=AsyncMock(), + node_domain=Domain("localhost"), + ) + await 
convention_service.create_convention( + id=SchemaIdentifier(slug), + title="Dual Write Sample", + version="1.0.0", + schema=_fields(), + file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), + ) + await pg_session.commit() + return convention_service + + +@pytest.mark.asyncio +class TestConventionCreatesTypedTableInline: + async def test_typed_table_exists_immediately_after_create_convention( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """No event-handler race window — the table exists in the same txn.""" + await _register_convention(pg_engine, pg_session, slug="inline-create") + + async with pg_engine.begin() as conn: + exists = ( + await conn.execute( + text( + "SELECT EXISTS (SELECT 1 FROM information_schema.tables " + "WHERE table_schema = 'metadata' AND table_name = 'inline_create_v1')" + ) + ) + ).scalar() + assert exists is True + + +@pytest.mark.asyncio +class TestBulkPublishDualWrite: + async def test_bulk_publish_writes_both_tables( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + await _register_convention(pg_engine, pg_session, slug="bulk-dual") + + # Fetch the convention SRN to attach drafts to. + async with pg_engine.begin() as conn: + conv_srn_str = ( + await conn.execute(text("SELECT srn FROM conventions LIMIT 1")) + ).scalar() + assert conv_srn_str is not None + + record_service = RecordService( + record_repo=PostgresRecordRepository(pg_session), + convention_repo=PostgresConventionRepository(pg_session), + metadata_service=MetadataService( + metadata_store=PostgresMetadataStore(pg_engine, pg_session), + ), + outbox=AsyncMock(), + node_domain=Domain("localhost"), + feature_reader=AsyncMock(), + ) + + drafts = [ + RecordDraft( + source=DepositionSource(id=f"dep-{uuid4()}"), + metadata={"species": "Homo sapiens", "resolution": 2.0 + i * 0.1}, + convention_srn=ConventionSRN.parse(conv_srn_str), + ) + for i in range(3) + ] + + published = await record_service.bulk_publish(drafts) + await pg_session.commit() + + assert len(published) == 3 + + async with pg_engine.begin() as conn: + records_count = ( + await conn.execute( + text("SELECT COUNT(*) FROM records WHERE schema_id = 'bulk-dual'") + ) + ).scalar() + typed_count = ( + await conn.execute(text('SELECT COUNT(*) FROM "metadata"."bulk_dual_v1"')) + ).scalar() + assert records_count == 3 + assert typed_count == 3 + + async def test_malformed_metadata_rolls_back_everything( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """A type error in the typed write must fail the whole transaction — + no orphan row left in ``records``.""" + await _register_convention(pg_engine, pg_session, slug="rollback-sample") + + async with pg_engine.begin() as conn: + conv_srn_str = ( + await conn.execute( + text("SELECT srn FROM conventions WHERE schema_id = 'rollback-sample'") + ) + ).scalar() + + record_service = RecordService( + record_repo=PostgresRecordRepository(pg_session), + convention_repo=PostgresConventionRepository(pg_session), + metadata_service=MetadataService( + metadata_store=PostgresMetadataStore(pg_engine, pg_session), + ), + outbox=AsyncMock(), + node_domain=Domain("localhost"), + feature_reader=AsyncMock(), + ) + + # 'resolution' expects a NUMBER; pass a non-coercible string. 
+ drafts = [ + RecordDraft( + source=DepositionSource(id=f"dep-{uuid4()}"), + metadata={"species": "A", "resolution": "not-a-number"}, + convention_srn=ConventionSRN.parse(conv_srn_str), + ) + ] + + with pytest.raises(Exception): # noqa: BLE001 — asyncpg DataError or similar + await record_service.bulk_publish(drafts) + await pg_session.commit() + await pg_session.rollback() + + async with pg_engine.begin() as conn: + records_count = ( + await conn.execute( + text("SELECT COUNT(*) FROM records WHERE schema_id = 'rollback-sample'") + ) + ).scalar() + typed_count = ( + await conn.execute(text('SELECT COUNT(*) FROM "metadata"."rollback_sample_v1"')) + ).scalar() + assert records_count == 0 + assert typed_count == 0 diff --git a/server/tests/integration/test_ensure_metadata_table.py b/server/tests/integration/test_ensure_metadata_table.py deleted file mode 100644 index 86962dd..0000000 --- a/server/tests/integration/test_ensure_metadata_table.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Integration tests for EnsureMetadataTable event handler.""" - -from datetime import UTC, datetime -from uuid import uuid4 - -import pytest -from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession - -from osa.domain.deposition.event.convention_registered import ConventionRegistered -from osa.domain.deposition.model.convention import Convention -from osa.domain.deposition.model.value import FileRequirements -from osa.domain.metadata.handler.ensure_metadata_table import EnsureMetadataTable -from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.semantics.model.schema import Schema -from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType -from osa.domain.shared.event import EventId -from osa.domain.shared.model.srn import ConventionSRN, SchemaId -from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore -from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA -from osa.infrastructure.persistence.repository.convention import PostgresConventionRepository -from osa.infrastructure.persistence.repository.schema import PostgresSemanticsSchemaRepository - -SCHEMA_ID = "bio-sample" -SCHEMA_V1 = SchemaId.parse(f"{SCHEMA_ID}@1.0.0") -SCHEMA_V11 = SchemaId.parse(f"{SCHEMA_ID}@1.1.0") - - -def _fields_v1() -> list[FieldDefinition]: - return [ - FieldDefinition( - name="species", - type=FieldType.TEXT, - required=True, - cardinality=Cardinality.EXACTLY_ONE, - ), - ] - - -def _fields_v11() -> list[FieldDefinition]: - return _fields_v1() + [ - FieldDefinition( - name="collection_site", - type=FieldType.TEXT, - required=False, - cardinality=Cardinality.EXACTLY_ONE, - ), - ] - - -async def _seed_schema( - session: AsyncSession, srn: SchemaId, fields: list[FieldDefinition], title: str = "bio_sample" -) -> None: - repo = PostgresSemanticsSchemaRepository(session) - await repo.save(Schema(id=srn, title=title, fields=fields, created_at=datetime.now(UTC))) - - -async def _seed_convention(session: AsyncSession, srn: ConventionSRN, schema_id: SchemaId) -> None: - repo = PostgresConventionRepository(session) - await repo.save( - Convention( - srn=srn, - title="bio_sample_v1", - description=None, - schema_id=schema_id, - file_requirements=FileRequirements(accepted_types=[], max_count=0, max_file_size=0), - hooks=[], - created_at=datetime.now(UTC), - ) - ) - - -def _event( - convention_srn: ConventionSRN, - schema_id: SchemaId, - schema_fields: list[FieldDefinition], -) -> ConventionRegistered: - return 
ConventionRegistered( - id=EventId(uuid4()), - convention_srn=convention_srn, - schema_id=schema_id, - schema_fields=schema_fields, - hooks=[], - ) - - -async def _make_handler(pg_engine: AsyncEngine, pg_session: AsyncSession) -> EnsureMetadataTable: - store = PostgresMetadataStore(pg_engine, pg_session) - service = MetadataService(metadata_store=store) - return EnsureMetadataTable( - metadata_service=service, - convention_repo=PostgresConventionRepository(pg_session), - ) - - -async def _catalog_row_count(engine: AsyncEngine) -> int: - async with engine.begin() as conn: - return int((await conn.execute(text("SELECT COUNT(*) FROM metadata_tables"))).scalar() or 0) - - -async def _table_columns(engine: AsyncEngine, pg_table: str) -> set[str]: - async with engine.begin() as conn: - result = await conn.execute( - text( - "SELECT column_name FROM information_schema.columns " - "WHERE table_schema = :s AND table_name = :t" - ), - {"s": METADATA_SCHEMA, "t": pg_table}, - ) - return {row[0] for row in result.fetchall()} - - -@pytest.mark.asyncio -class TestEnsureMetadataTable: - async def test_first_event_creates_table_and_catalog_row( - self, pg_engine: AsyncEngine, pg_session: AsyncSession - ): - conv_srn = ConventionSRN.parse("urn:osa:localhost:conv:conv-c1@1.0.0") - await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) - await _seed_convention(pg_session, conv_srn, SCHEMA_V1) - await pg_session.commit() - - handler = await _make_handler(pg_engine, pg_session) - await handler.handle(_event(conv_srn, SCHEMA_V1, _fields_v1())) - await pg_session.commit() - - assert await _catalog_row_count(pg_engine) == 1 - cols = await _table_columns(pg_engine, "bio_sample_v1") - assert "species" in cols - - async def test_second_event_same_schema_is_noop( - self, pg_engine: AsyncEngine, pg_session: AsyncSession - ): - conv_a = ConventionSRN.parse("urn:osa:localhost:conv:conv-a1@1.0.0") - conv_b = ConventionSRN.parse("urn:osa:localhost:conv:conv-b1@1.0.0") - await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) - await _seed_convention(pg_session, conv_a, SCHEMA_V1) - await _seed_convention(pg_session, conv_b, SCHEMA_V1) - await pg_session.commit() - - handler = await _make_handler(pg_engine, pg_session) - await handler.handle(_event(conv_a, SCHEMA_V1, _fields_v1())) - await handler.handle(_event(conv_b, SCHEMA_V1, _fields_v1())) - await pg_session.commit() - - # Still one catalog row, one table. 
- assert await _catalog_row_count(pg_engine) == 1 - - async def test_additive_bump_alters_table( - self, pg_engine: AsyncEngine, pg_session: AsyncSession - ): - conv_a = ConventionSRN.parse("urn:osa:localhost:conv:conv-a1@1.0.0") - conv_b = ConventionSRN.parse("urn:osa:localhost:conv:conv-b1@1.0.0") - await _seed_schema(pg_session, SCHEMA_V1, _fields_v1()) - await _seed_schema(pg_session, SCHEMA_V11, _fields_v11()) - await _seed_convention(pg_session, conv_a, SCHEMA_V1) - await _seed_convention(pg_session, conv_b, SCHEMA_V11) - await pg_session.commit() - - handler = await _make_handler(pg_engine, pg_session) - await handler.handle(_event(conv_a, SCHEMA_V1, _fields_v1())) - cols_before = await _table_columns(pg_engine, "bio_sample_v1") - assert "collection_site" not in cols_before - - await handler.handle(_event(conv_b, SCHEMA_V11, _fields_v11())) - await pg_session.commit() - - cols_after = await _table_columns(pg_engine, "bio_sample_v1") - assert "collection_site" in cols_after diff --git a/server/tests/integration/test_insert_record_metadata.py b/server/tests/integration/test_insert_record_metadata.py deleted file mode 100644 index 1145d90..0000000 --- a/server/tests/integration/test_insert_record_metadata.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Integration tests for InsertRecordMetadata event handler.""" - -from uuid import uuid4 - -import pytest -from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession - -from osa.domain.metadata.handler.insert_record_metadata import InsertRecordMetadata -from osa.domain.metadata.service.metadata import MetadataService -from osa.domain.record.event.record_published import RecordPublished -from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType -from osa.domain.shared.event import EventId -from osa.domain.shared.model.source import DepositionSource -from osa.domain.shared.model.srn import ConventionSRN, RecordSRN, SchemaId -from osa.infrastructure.persistence.metadata_store import PostgresMetadataStore -from osa.infrastructure.persistence.metadata_table import METADATA_SCHEMA - -from tests.integration.conftest import seed_record - -SCHEMA_V1 = SchemaId.parse("bio-sample@1.0.0") -CONV_SRN = ConventionSRN.parse("urn:osa:localhost:conv:test@1.0.0") - - -def _fields() -> list[FieldDefinition]: - return [ - FieldDefinition( - name="species", - type=FieldType.TEXT, - required=True, - cardinality=Cardinality.EXACTLY_ONE, - ), - FieldDefinition( - name="resolution", - type=FieldType.NUMBER, - required=False, - cardinality=Cardinality.EXACTLY_ONE, - ), - ] - - -def _event(record_srn: RecordSRN, metadata: dict) -> RecordPublished: - return RecordPublished( - id=EventId(uuid4()), - record_srn=record_srn, - source=DepositionSource(id="dep-1"), - convention_srn=CONV_SRN, - schema_id=SCHEMA_V1, - metadata=metadata, - expected_features=[], - ) - - -@pytest.mark.asyncio -class TestInsertRecordMetadata: - async def test_insert_creates_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): - store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, _fields()) - - record_srn = RecordSRN.parse("urn:osa:localhost:rec:one@1") - await seed_record( - pg_engine, - srn=str(record_srn), - schema_id=SCHEMA_V1.id.root, - schema_version=SCHEMA_V1.version.root, - ) - - handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store)) - await handler.handle(_event(record_srn, {"species": "Homo sapiens", "resolution": 3.5})) - await pg_session.commit() - - 
async with pg_engine.begin() as conn: - row = ( - await conn.execute( - text( - f"SELECT species, resolution " - f'FROM "{METADATA_SCHEMA}"."bio_sample_v1" ' - f"WHERE record_srn = :srn" - ), - {"srn": str(record_srn)}, - ) - ).first() - assert row is not None - assert row[0] == "Homo sapiens" - assert row[1] == 3.5 - - async def test_duplicate_delivery_is_idempotent( - self, pg_engine: AsyncEngine, pg_session: AsyncSession - ): - store = PostgresMetadataStore(pg_engine, pg_session) - await store.ensure_table(SCHEMA_V1, _fields()) - - record_srn = RecordSRN.parse("urn:osa:localhost:rec:dup@1") - await seed_record( - pg_engine, - srn=str(record_srn), - schema_id=SCHEMA_V1.id.root, - schema_version=SCHEMA_V1.version.root, - ) - - handler = InsertRecordMetadata(metadata_service=MetadataService(metadata_store=store)) - event = _event(record_srn, {"species": "Mus musculus", "resolution": 1.0}) - - await handler.handle(event) - await handler.handle(event) - await pg_session.commit() - - async with pg_engine.begin() as conn: - count = ( - await conn.execute( - text(f'SELECT COUNT(*) FROM "{METADATA_SCHEMA}"."bio_sample_v1"') - ) - ).scalar() - assert count == 1 diff --git a/server/tests/unit/domain/deposition/test_convention_service.py b/server/tests/unit/domain/deposition/test_convention_service.py index d142cd6..59ea3fc 100644 --- a/server/tests/unit/domain/deposition/test_convention_service.py +++ b/server/tests/unit/domain/deposition/test_convention_service.py @@ -78,6 +78,7 @@ def _make_service( return ConventionService( convention_repo=conv_repo or AsyncMock(), schema_service=mock_schema_service, + metadata_service=AsyncMock(), outbox=outbox or AsyncMock(), node_domain=Domain("localhost"), ) diff --git a/server/tests/unit/domain/deposition/test_convention_service_v2.py b/server/tests/unit/domain/deposition/test_convention_service_v2.py index d9111ad..1e8557e 100644 --- a/server/tests/unit/domain/deposition/test_convention_service_v2.py +++ b/server/tests/unit/domain/deposition/test_convention_service_v2.py @@ -84,6 +84,7 @@ def _make_service( return ConventionService( convention_repo=conv_repo or AsyncMock(), schema_service=mock_schema_service, + metadata_service=AsyncMock(), outbox=outbox or AsyncMock(), node_domain=Domain("localhost"), ) diff --git a/server/tests/unit/domain/record/test_record_features.py b/server/tests/unit/domain/record/test_record_features.py index 4f229e5..2fe6a1e 100644 --- a/server/tests/unit/domain/record/test_record_features.py +++ b/server/tests/unit/domain/record/test_record_features.py @@ -193,6 +193,7 @@ async def test_get_features_delegates_to_reader(self) -> None: service = RecordService( record_repo=mock_repo, convention_repo=AsyncMock(), + metadata_service=AsyncMock(), outbox=mock_outbox, node_domain=Domain("localhost"), feature_reader=mock_reader, diff --git a/server/tests/unit/domain/record/test_record_service.py b/server/tests/unit/domain/record/test_record_service.py index 4158209..dbeafc0 100644 --- a/server/tests/unit/domain/record/test_record_service.py +++ b/server/tests/unit/domain/record/test_record_service.py @@ -90,6 +90,7 @@ def _make_service( return RecordService( record_repo=record_repo, convention_repo=convention_repo, + metadata_service=AsyncMock(), outbox=outbox, node_domain=node_domain, feature_reader=AsyncMock(), From 7de6ceaadb6e06c331f76969fd23008c696eae66 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 09:51:49 +0100 Subject: [PATCH 4/9] refactor: remove data backfill from database migrations for greenfield-only 
deployment Remove backfill logic from schema migration scripts to support greenfield deployments only. Migrations now fail with clear constraint errors on populated databases, signaling data predates the new schema structure. feat: add SQL injection protection for metadata store DDL operations Add _safe_ident function to validate PostgreSQL identifiers before interpolation into raw DDL statements to prevent SQL injection attacks through malicious column names. feat: add advisory lock for concurrent metadata table creation Implement pg_advisory_xact_lock to serialize concurrent ensure_table calls for the same schema, preventing race conditions that could cause DuplicateTable errors during simultaneous convention registration. fix: restore metadata schema creation in integration test cleanup Re-create empty metadata and features schemas after DROP CASCADE in test cleanup to maintain production migration invariants. --- .../versions/076_add_records_schema_srn.py | 24 ++------- .../migrations/versions/076_schemas_to_id.py | 32 ++--------- .../persistence/metadata_store.py | 46 ++++++++++++++-- server/tests/integration/conftest.py | 5 ++ .../persistence/test_metadata_store.py | 54 +++++++++++++++++++ 5 files changed, 110 insertions(+), 51 deletions(-) diff --git a/server/migrations/versions/076_add_records_schema_srn.py b/server/migrations/versions/076_add_records_schema_srn.py index 6989a4b..585cb0b 100644 --- a/server/migrations/versions/076_add_records_schema_srn.py +++ b/server/migrations/versions/076_add_records_schema_srn.py @@ -1,11 +1,12 @@ """076_add_records_schema_id Add ``records.schema_id`` + ``records.schema_version`` so a Record's typed -linkage is first-class (FR-008). Backfill from the linked convention's -``schema_id`` / ``schema_version`` columns, then tighten to NOT NULL. +linkage is first-class (FR-008). -Greenfield deployments with no records will skip the backfill and go straight -to NOT NULL. +Greenfield only: no backfill from the linked convention. If this runs +against a populated ``records`` table it fails at ``SET NOT NULL`` with a +clear constraint error, which is the correct signal that the data predates +this schema. Revision ID: 076_records_schema_srn Revises: 076_schemas_to_id @@ -28,21 +29,6 @@ def upgrade() -> None: op.add_column("records", sa.Column("schema_id", sa.Text(), nullable=True)) op.add_column("records", sa.Column("schema_version", sa.Text(), nullable=True)) - - # Backfill from the owning convention's schema_id/schema_version - # (populated by ``076_schemas_to_id`` which ran just before this). - op.execute( - """ - UPDATE records r - SET - schema_id = c.schema_id, - schema_version = c.schema_version - FROM conventions c - WHERE c.srn = r.convention_srn - AND r.schema_id IS NULL - """ - ) - op.alter_column("records", "schema_id", nullable=False) op.alter_column("records", "schema_version", nullable=False) op.create_index("idx_records_schema_id", "records", ["schema_id"]) diff --git a/server/migrations/versions/076_schemas_to_id.py b/server/migrations/versions/076_schemas_to_id.py index 50e681d..c05f5e2 100644 --- a/server/migrations/versions/076_schemas_to_id.py +++ b/server/migrations/versions/076_schemas_to_id.py @@ -8,6 +8,10 @@ - ``schemas.srn`` → ``schemas.id`` + ``schemas.version``. Composite PK. - ``conventions.schema_srn`` → ``conventions.schema_id`` + ``conventions.schema_version``. +Greenfield only: no backfill from the old URN columns. 
If this runs against +a populated DB it fails at ``SET NOT NULL`` with a clear constraint error, +which is the correct signal that the data predates this schema. + Revision ID: 076_schemas_to_id Revises: 076_metadata_catalog Create Date: 2026-04-20 @@ -30,14 +34,6 @@ def upgrade() -> None: # schemas: drop old SRN PK, add id + version, recompose PK. op.add_column("schemas", sa.Column("id", sa.String(), nullable=True)) op.add_column("schemas", sa.Column("version", sa.String(), nullable=True)) - op.execute( - """ - UPDATE schemas - SET - id = split_part(split_part(srn, ':', 5), '@', 1), - version = split_part(srn, '@', 2) - """ - ) op.alter_column("schemas", "id", nullable=False) op.alter_column("schemas", "version", nullable=False) op.drop_constraint("schemas_pkey", "schemas", type_="primary") @@ -48,14 +44,6 @@ def upgrade() -> None: # conventions: split schema_srn into schema_id + schema_version. op.add_column("conventions", sa.Column("schema_id", sa.String(), nullable=True)) op.add_column("conventions", sa.Column("schema_version", sa.String(), nullable=True)) - op.execute( - """ - UPDATE conventions - SET - schema_id = split_part(split_part(schema_srn, ':', 5), '@', 1), - schema_version = split_part(schema_srn, '@', 2) - """ - ) op.alter_column("conventions", "schema_id", nullable=False) op.alter_column("conventions", "schema_version", nullable=False) op.drop_column("conventions", "schema_srn") @@ -64,12 +52,6 @@ def upgrade() -> None: def downgrade() -> None: # conventions back to schema_srn op.add_column("conventions", sa.Column("schema_srn", sa.String(), nullable=True)) - op.execute( - """ - UPDATE conventions - SET schema_srn = 'urn:osa:localhost:schema:' || schema_id || '@' || schema_version - """ - ) op.alter_column("conventions", "schema_srn", nullable=False) op.drop_column("conventions", "schema_version") op.drop_column("conventions", "schema_id") @@ -78,12 +60,6 @@ def downgrade() -> None: op.drop_index("idx_schemas_id", table_name="schemas") op.drop_constraint("schemas_pkey", "schemas", type_="primary") op.add_column("schemas", sa.Column("srn", sa.String(), nullable=True)) - op.execute( - """ - UPDATE schemas - SET srn = 'urn:osa:localhost:schema:' || id || '@' || version - """ - ) op.alter_column("schemas", "srn", nullable=False) op.create_primary_key("schemas_pkey", "schemas", ["srn"]) op.drop_column("schemas", "version") diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py index 109b365..5ebc27d 100644 --- a/server/osa/infrastructure/persistence/metadata_store.py +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -8,6 +8,7 @@ from __future__ import annotations +import re from datetime import UTC, date, datetime from typing import Any, Literal, Sequence @@ -33,6 +34,20 @@ _JsonType = Literal["string", "number", "integer", "boolean", "array", "object"] +# Defense-in-depth: validate any string interpolated into a raw DDL statement. +# ``ColumnDef.name`` is declared as ``PgIdentifier`` at the Pydantic layer but +# we re-check here because a) catalog rows round-trip through JSON and a bad +# actor with write access to metadata_tables could smuggle a malicious name +# through, and b) this function's contract should not rely on upstream +# validators that might be refactored away. 
+_PG_IDENT_RE = re.compile(r"^[a-z][a-z0-9_]{0,62}$") + + +def _safe_ident(name: str) -> str: + if not _PG_IDENT_RE.match(name): + raise ValidationError(f"Refusing to interpolate unsafe PG identifier {name!r} into DDL") + return name + _JSON_TYPE_MAP: dict[FieldType, tuple[_JsonType | None, str | None]] = { FieldType.TEXT: ("string", None), @@ -82,7 +97,22 @@ async def ensure_table( metadata_schema = MetadataSchema(columns=columns) async with self._engine.begin() as conn: - await conn.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{metadata_pg_schema()}"')) + # Note: the ``metadata`` PG schema is created by migration + # ``076_add_metadata_schema_and_catalog`` and is a precondition + # for this store. We don't run ``CREATE SCHEMA IF NOT EXISTS`` + # here because it races on ``pg_namespace`` under concurrency, + # and the migration makes it unnecessary. + + # Serialise concurrent ensure_table() calls for the same + # (schema_id, major) pair. Without this lock, two conventions + # registering simultaneously both pass the "does it exist?" + # check and race on CREATE TABLE, causing the loser to fail + # with DuplicateTable. The advisory lock is released at + # transaction commit. + await conn.execute( + text("SELECT pg_advisory_xact_lock(hashtextextended(:key, 0))"), + {"key": f"{id_str}@v{major}"}, + ) existing = ( ( @@ -268,12 +298,20 @@ def _validate_additive(existing: Sequence[ColumnDef], incoming: Sequence[ColumnD def _alter_add_column_stmt(pg_table: str, col_def: ColumnDef) -> str: - """SQL string to ALTER TABLE ADD COLUMN for a single column definition.""" + """SQL string to ALTER TABLE ADD COLUMN for a single column definition. + + Both ``pg_table`` and ``col_def.name`` are interpolated into raw SQL, so + they are strictly validated against the PG identifier regex first — any + attempt to smuggle a ``"`` through would otherwise break the quoting and + inject arbitrary DDL. + """ sql_type = _column_type_sql(map_column(col_def).type) null_sql = "" if not col_def.required else " NOT NULL" + safe_table = _safe_ident(pg_table) + safe_col = _safe_ident(col_def.name) return ( - f'ALTER TABLE "{metadata_pg_schema()}"."{pg_table}" ' - f'ADD COLUMN IF NOT EXISTS "{col_def.name}" {sql_type}{null_sql}' + f'ALTER TABLE "{metadata_pg_schema()}"."{safe_table}" ' + f'ADD COLUMN IF NOT EXISTS "{safe_col}" {sql_type}{null_sql}' ) diff --git a/server/tests/integration/conftest.py b/server/tests/integration/conftest.py index 890ccd4..2ee9a02 100644 --- a/server/tests/integration/conftest.py +++ b/server/tests/integration/conftest.py @@ -96,6 +96,11 @@ async def pg_session(pg_engine: AsyncEngine): ) await conn.execute(text('DROP SCHEMA IF EXISTS "features" CASCADE')) await conn.execute(text('DROP SCHEMA IF EXISTS "metadata" CASCADE')) + # Re-create empty ``metadata`` schema. Production relies on the + # migration having created it; tests need to restore that + # invariant after the DROP above. 
+ await conn.execute(text('CREATE SCHEMA "metadata"')) + await conn.execute(text('CREATE SCHEMA "features"')) # Re-seed system user after truncate await ensure_system_user(pg_engine) diff --git a/server/tests/integration/persistence/test_metadata_store.py b/server/tests/integration/persistence/test_metadata_store.py index 8f07e10..d416213 100644 --- a/server/tests/integration/persistence/test_metadata_store.py +++ b/server/tests/integration/persistence/test_metadata_store.py @@ -158,6 +158,60 @@ async def test_foreign_key_cascade_on_record_srn( assert constraint == "c" +@pytest.mark.asyncio +class TestDdlInjectionGuard: + """Defense-in-depth: raw DDL interpolation must refuse bad identifiers.""" + + async def test_ddl_injection_in_field_name_rejected( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + """A field name with a quote or injection payload must never reach + the ALTER TABLE SQL. ``_safe_ident`` rejects at the DDL boundary.""" + from osa.infrastructure.persistence.metadata_store import _safe_ident + + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident('species"; DROP TABLE records; --') + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("has-hyphen") + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("1starts_with_digit") + with pytest.raises(ValidationError, match="unsafe PG identifier"): + _safe_ident("") + # Valid identifiers pass through. + assert _safe_ident("species") == "species" + assert _safe_ident("bio_sample_v1") == "bio_sample_v1" + + +@pytest.mark.asyncio +class TestConcurrentEnsureTable: + """TOCTOU defense: two ensure_table calls for the same schema must not + both try to CREATE TABLE. The advisory lock serialises them; one wins, + the other sees the catalog row and no-ops.""" + + async def test_concurrent_ensure_table_does_not_raise( + self, pg_engine: AsyncEngine, pg_session: AsyncSession + ): + import asyncio + + store_a = PostgresMetadataStore(pg_engine, pg_session) + store_b = PostgresMetadataStore(pg_engine, pg_session) + # Run both concurrently. Without the advisory lock, the second + # would either race on SELECT and raise DuplicateTable on CREATE, + # or raise on the catalog INSERT unique violation. + await asyncio.gather( + store_a.ensure_table(SCHEMA_V1, _fields_v1()), + store_b.ensure_table(SCHEMA_V1, _fields_v1()), + ) + async with pg_engine.begin() as conn: + count = ( + await conn.execute( + text("SELECT COUNT(*) FROM metadata_tables WHERE schema_id = :id"), + {"id": SCHEMA_ID}, + ) + ).scalar() + assert count == 1 + + @pytest.mark.asyncio class TestInsert: async def test_insert_typed_row(self, pg_engine: AsyncEngine, pg_session: AsyncSession): From 044fb9d96d5228002a50e766778a3bcce7ce7557 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 11:00:26 +0100 Subject: [PATCH 5/9] fix: add SQL injection protection for PostgreSQL identifiers Add regex validation to prevent SQL injection attacks when interpolating hook names into DDL statements. Validates identifiers match PostgreSQL naming conventions before use. feat: add schema validation for discovery operations Raise NotFoundError when pinned schema is not found instead of silently falling back to unscoped queries that could return cross-schema records. docs: clarify field definition reader behavior Update docstring to explain that callers must handle empty results and raise NotFoundError themselves when treating unknown schemas as error conditions. 
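As a reviewer aid, a minimal sketch of the guard pattern (condensed from the DiscoveryService diff in this patch; ``get_fields_for_schema`` returning an empty map is the port's documented "unknown schema" signal):

    # Sketch only: condensed from search_records / search_features below.
    if schema_id is not None:
        schema_field_map = await self.field_reader.get_fields_for_schema(schema_id)
        if not schema_field_map:
            # Unknown pin: surface a 404 instead of an unscoped query.
            raise NotFoundError(f"Schema not found: {schema_id.render()}")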
--- .../076_add_feature_tables_record_srn_fks.py | 28 +++++++--- .../discovery/port/field_definition_reader.py | 5 +- .../osa/domain/discovery/service/discovery.py | 10 ++++ .../discovery/test_discovery_service.py | 56 ++++++++++++++++++- 4 files changed, 88 insertions(+), 11 deletions(-) diff --git a/server/migrations/versions/076_add_feature_tables_record_srn_fks.py b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py index 3c4cc57..b61ee14 100644 --- a/server/migrations/versions/076_add_feature_tables_record_srn_fks.py +++ b/server/migrations/versions/076_add_feature_tables_record_srn_fks.py @@ -13,6 +13,7 @@ """ +import re from typing import Sequence, Union from alembic import op @@ -26,18 +27,27 @@ FK_NAME_TEMPLATE = "fk_features_{hook}_record_srn" +# Defense-in-depth: hook names read from ``feature_tables`` are interpolated +# into raw DDL below. Application code constrains hooks to this shape at write +# time, but the migration should not trust that invariant — a stray ``"`` in a +# stored name would break out of quoting. Mirrors the ``_safe_ident`` check in +# ``osa.infrastructure.persistence.metadata_store``. +_PG_IDENT_RE = re.compile(r"^[a-z][a-z0-9_]{0,62}$") + + +def _safe_ident(name: str) -> str: + if not _PG_IDENT_RE.match(name): + raise ValueError(f"Refusing to interpolate unsafe PG identifier {name!r} into DDL") + return name + def upgrade() -> None: conn = op.get_bind() - rows = conn.execute( - # text() via op.execute-style select - _select_hooks() - ).fetchall() + rows = conn.execute(_select_hooks()).fetchall() for row in rows: - hook = row[0] - fk_name = FK_NAME_TEMPLATE.format(hook=hook) - # Check if constraint already exists + hook = _safe_ident(row[0]) + fk_name = _safe_ident(FK_NAME_TEMPLATE.format(hook=hook)) exists = conn.execute(_check_constraint(fk_name)).scalar() if exists: continue @@ -49,8 +59,8 @@ def downgrade() -> None: conn = op.get_bind() rows = conn.execute(_select_hooks()).fetchall() for row in rows: - hook = row[0] - fk_name = FK_NAME_TEMPLATE.format(hook=hook) + hook = _safe_ident(row[0]) + fk_name = _safe_ident(FK_NAME_TEMPLATE.format(hook=hook)) exists = conn.execute(_check_constraint(fk_name)).scalar() if not exists: continue diff --git a/server/osa/domain/discovery/port/field_definition_reader.py b/server/osa/domain/discovery/port/field_definition_reader.py index 516d51e..c2a6bfa 100644 --- a/server/osa/domain/discovery/port/field_definition_reader.py +++ b/server/osa/domain/discovery/port/field_definition_reader.py @@ -20,6 +20,9 @@ async def get_all_field_types(self) -> dict[str, FieldType]: async def get_fields_for_schema(self, schema_id: "SchemaId") -> dict[str, FieldType]: """Return field_name -> FieldType for a specific schema's current major version. - Falls back to an empty dict when the schema is unknown to the node. + Returns an empty dict when the schema is unknown to the node. Callers + that treat "unknown schema" as an error condition must check for an + empty map and raise ``NotFoundError`` themselves — the port stays + neutral so that non-user-facing callers can handle absence explicitly. """ ... 
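The neutrality contract above is easiest to see from the caller's side. A sketch of a hypothetical non-user-facing caller that treats absence as a no-op; ``log`` and ``project_fields`` are illustrative names, not APIs from this codebase:

    # Hypothetical internal caller: an unknown schema is expected, not an error.
    fields = await reader.get_fields_for_schema(schema_id)
    if not fields:
        log.debug("schema %s not registered on this node; skipping", schema_id)
        return
    project_fields(fields)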
diff --git a/server/osa/domain/discovery/service/discovery.py b/server/osa/domain/discovery/service/discovery.py index 3f54ea5..cd8db52 100644 --- a/server/osa/domain/discovery/service/discovery.py +++ b/server/osa/domain/discovery/service/discovery.py @@ -89,6 +89,11 @@ async def search_records( schema_field_map: dict[str, FieldType] = {} if schema_id is not None: schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) + if not schema_field_map: + raise NotFoundError( + f"Schema not found: {schema_id.render()}. " + "Pin an '<id>@<version>' that matches a registered schema." + ) if filter_expr is not None: self._validate_tree(filter_expr, allow_compound=allow_compound) @@ -179,6 +184,11 @@ async def search_features( schema_field_map: dict[str, FieldType] = {} if schema_id is not None: schema_field_map = await self.field_reader.get_fields_for_schema(schema_id) + if not schema_field_map: + raise NotFoundError( + f"Schema not found: {schema_id.render()}. " + "Pin an '<id>@<version>' that matches a registered schema." + ) if filter_expr is not None: self._validate_tree(filter_expr, allow_compound=allow_compound) diff --git a/server/tests/unit/domain/discovery/test_discovery_service.py b/server/tests/unit/domain/discovery/test_discovery_service.py index 8f9bad7..b68dbba 100644 --- a/server/tests/unit/domain/discovery/test_discovery_service.py +++ b/server/tests/unit/domain/discovery/test_discovery_service.py @@ -21,7 +21,7 @@ ) from osa.domain.discovery.service.discovery import DiscoveryService from osa.domain.semantics.model.value import FieldType -from osa.domain.shared.error import ValidationError +from osa.domain.shared.error import NotFoundError, ValidationError from osa.domain.shared.model.srn import RecordSRN, SchemaId @@ -175,6 +175,60 @@ async def test_rejects_limit_too_high(self, service: DiscoveryService) -> None: limit=101, ) + async def test_raises_not_found_for_unknown_schema(self, mock_read_store: AsyncMock) -> None: + """Pinning an unregistered schema must raise NotFoundError, not silently + fall through to an unscoped query that returns cross-schema records.""" + empty_reader = AsyncMock() + empty_reader.get_fields_for_schema.return_value = {} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=empty_reader, + config=_config(), + ) + + with pytest.raises(NotFoundError, match="Schema not found"): + await svc.search_records( + filter_expr=None, + schema_id=SCHEMA_SRN, + convention_srn=None, + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + mock_read_store.search_records.assert_not_called() + + async def test_search_features_raises_not_found_for_unknown_schema( + self, mock_read_store: AsyncMock + ) -> None: + """search_features must also guard against unknown schema pins.""" + mock_read_store.get_feature_table_schema.return_value = FeatureCatalogEntry( + hook_name="detect_pockets", + columns=[ColumnInfo(name="score", type="number", required=False)], + record_count=0, + ) + empty_reader = AsyncMock() + empty_reader.get_fields_for_schema.return_value = {} + svc = DiscoveryService( + read_store=mock_read_store, + field_reader=empty_reader, + config=_config(), + ) + + with pytest.raises(NotFoundError, match="Schema not found"): + await svc.search_features( + hook_name="detect_pockets", + filter_expr=None, + schema_id=SCHEMA_SRN, + record_srn=None, + sort="id", + order=SortOrder.DESC, + cursor=None, + limit=20, + ) + mock_read_store.search_features.assert_not_called() + async def test_rejects_q_when_no_text_fields(self,
mock_read_store: AsyncMock) -> None: no_text_reader = AsyncMock() no_text_reader.get_all_field_types.return_value = {"resolution": FieldType.NUMBER} From bb27aa99e50dd23710545d6b68a805faefececbe Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 11:58:55 +0100 Subject: [PATCH 6/9] fix: handle NULL values in SQL filter operations to prevent silent record exclusion Wrap NOT operations with coalesce to treat NULL as FALSE before negation and modify NEQ operator to explicitly include NULL values, ensuring records with missing feature/metadata values are not silently dropped from query results. --- .../persistence/adapter/discovery.py | 48 +++++--- .../persistence/metadata_store.py | 10 +- .../persistence/metadata_table.py | 37 +++++- .../test_discovery_cross_join_postgres.py | 114 +++++++++++++++++- server/tests/unit/test_metadata_slug.py | 25 +++- 5 files changed, 209 insertions(+), 25 deletions(-) diff --git a/server/osa/infrastructure/persistence/adapter/discovery.py b/server/osa/infrastructure/persistence/adapter/discovery.py index 0fd452d..c44220c 100644 --- a/server/osa/infrastructure/persistence/adapter/discovery.py +++ b/server/osa/infrastructure/persistence/adapter/discovery.py @@ -13,6 +13,7 @@ String, and_, cast, + false, func, literal, not_, @@ -527,15 +528,19 @@ def _compile_filter_for_records( ] ) if isinstance(expr, Not): - return not_( - self._compile_filter_for_records( - expr.operand, - records_t=records_t, - metadata_t=metadata_t, - metadata_schema=metadata_schema, - feature_joins=feature_joins, - ) + inner = self._compile_filter_for_records( + expr.operand, + records_t=records_t, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, ) + # Coalesce NULL → FALSE before negating so records with NULL + # feature/metadata values (including rows missing from outer- + # joined feature tables) survive a NOT predicate. Without this, + # ``NOT (score = 5)`` reads NULL for records with no score and + # three-valued logic silently drops them. + return not_(func.coalesce(inner, false())) raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") def _compile_filter_for_features( @@ -600,16 +605,18 @@ def _compile_filter_for_features( ] ) if isinstance(expr, Not): - return not_( - self._compile_filter_for_features( - expr.operand, - this_hook=this_hook, - this_ft=this_ft, - metadata_t=metadata_t, - metadata_schema=metadata_schema, - feature_joins=feature_joins, - ) + inner = self._compile_filter_for_features( + expr.operand, + this_hook=this_hook, + this_ft=this_ft, + metadata_t=metadata_t, + metadata_schema=metadata_schema, + feature_joins=feature_joins, ) + # See ``_compile_filter_for_records`` — NULL → FALSE coalesce so + # NOT over outer-joined feature / optional metadata columns + # includes records with missing values. + return not_(func.coalesce(inner, false())) raise ValidationError(f"Unsupported filter node: {type(expr).__name__}") def _compile_predicate( @@ -649,7 +656,12 @@ def _apply_scalar_op(col: Any, op: FilterOperator, value: Any) -> Any: if op == FilterOperator.EQ: return col == value if op == FilterOperator.NEQ: - return col != value + # Feature tables are outer-joined, so a missing feature row makes + # ``col`` NULL. Plain ``col != value`` yields NULL (falsy) and + # silently excludes those records from the result. Users reading + # ``!= X`` expect "anything except X, including missing", so treat + # NULL as non-equal explicitly. 
+ return or_(col != value, col.is_(None)) if op == FilterOperator.GT: return col > value if op == FilterOperator.GTE: diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py index 5ebc27d..c44f912 100644 --- a/server/osa/infrastructure/persistence/metadata_store.py +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -27,6 +27,7 @@ from osa.infrastructure.persistence.metadata_table import ( MetadataSchema, build_metadata_table, + check_pg_table_name, schema_slug, ) from osa.infrastructure.persistence.tables import metadata_tables_table @@ -90,8 +91,15 @@ async def ensure_table( ) -> None: id_str = schema_id.id.root major = schema_id.major - slug = schema_slug(id_str) + try: + slug = schema_slug(id_str) + except ValueError as exc: + raise ValidationError(str(exc), field="schema_id") from exc pg_table = f"{slug}_v{major}" + try: + check_pg_table_name(pg_table) + except ValueError as exc: + raise ValidationError(str(exc), field="schema_id") from exc columns = [_field_to_column(f) for f in fields] metadata_schema = MetadataSchema(columns=columns) diff --git a/server/osa/infrastructure/persistence/metadata_table.py b/server/osa/infrastructure/persistence/metadata_table.py index ba9861d..5ae0695 100644 --- a/server/osa/infrastructure/persistence/metadata_table.py +++ b/server/osa/infrastructure/persistence/metadata_table.py @@ -23,7 +23,16 @@ AUTO_COLUMN_NAMES = frozenset({"id", "record_srn", "created_at"}) -_SLUG_RE = re.compile(r"^[a-z][a-z0-9_]{0,50}$") +# PG identifier limit under default ``NAMEDATALEN`` (64). Identifiers over +# this are silently truncated by PG, which would cause catalog/table name +# drift — surface the limit as a hard check instead. +PG_IDENT_MAX_LEN = 63 + +# Upper bound for a derived slug — matches :class:`SchemaIdentifier` (3-64). +# The final table name is ``f"{slug}_v{major}"``; that total length is +# checked separately by :func:`check_pg_table_name` at the boundary where +# ``major`` is known. +_SLUG_RE = re.compile(r"^[a-z][a-z0-9_]{2,63}$") class MetadataSchema(ValueObject): @@ -36,18 +45,38 @@ def schema_slug(title: str) -> str: """Derive a pg-safe slug from a Schema title. Lowercases, replaces runs of non-alphanumerics with a single underscore, - strips leading/trailing underscores, then validates against ``^[a-z][a-z0-9_]{0,50}$``. - Raises ``ValueError`` if the derived slug is empty or cannot be validated. + strips leading/trailing underscores, then validates against + ``^[a-z][a-z0-9_]{2,63}$`` (3-64 chars, matching + :class:`SchemaIdentifier`). Raises ``ValueError`` if the derived slug is + empty or cannot be validated. + + Callers that combine the slug with a suffix (e.g. ``_v{major}``) must + separately check the combined length against :data:`PG_IDENT_MAX_LEN`. """ normalised = re.sub(r"[^a-z0-9]+", "_", title.strip().lower()).strip("_") if not normalised or not _SLUG_RE.match(normalised): raise ValueError( f"Cannot derive a valid metadata table slug from title {title!r}. " - f"Expected a string that maps to ^[a-z][a-z0-9_]{{0,50}}$." + "Expected a string that maps to ^[a-z][a-z0-9_]{2,63}$." ) return normalised +def check_pg_table_name(pg_table: str) -> None: + """Raise ``ValueError`` if *pg_table* exceeds the PG identifier limit. + + Without this, PG silently truncates long identifiers at 63 chars, which + would desynchronise the catalog (``metadata_tables.pg_table``) from the + actual table name. 
+ """ + if len(pg_table) > PG_IDENT_MAX_LEN: + raise ValueError( + f"Derived PG table name {pg_table!r} is {len(pg_table)} chars, " + f"exceeds PG's {PG_IDENT_MAX_LEN}-char identifier limit. " + "Use a shorter schema id." + ) + + def build_metadata_table(pg_table: str, schema: MetadataSchema) -> sa.Table: """Build a SQLAlchemy ``Table`` for a dynamic metadata table. diff --git a/server/tests/integration/test_discovery_cross_join_postgres.py b/server/tests/integration/test_discovery_cross_join_postgres.py index f979a8f..c2ed9ec 100644 --- a/server/tests/integration/test_discovery_cross_join_postgres.py +++ b/server/tests/integration/test_discovery_cross_join_postgres.py @@ -4,7 +4,7 @@ from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession from osa.domain.discovery.model.refs import FeatureFieldRef, MetadataFieldRef -from osa.domain.discovery.model.value import And, FilterOperator, Predicate, SortOrder +from osa.domain.discovery.model.value import And, FilterOperator, Not, Predicate, SortOrder from osa.domain.semantics.model.value import Cardinality, FieldDefinition, FieldType from osa.domain.shared.error import ValidationError from osa.domain.shared.model.hook import ColumnDef @@ -140,3 +140,115 @@ async def test_unknown_hook_raises( limit=10, field_types=FIELD_TYPES, ) + + +@pytest.fixture +async def seeded_with_missing_feature_row(pg_engine: AsyncEngine, pg_session: AsyncSession): + """Seed a record with a metadata row but NO feature row, so the outer + join produces NULL feature columns for that record.""" + from datetime import UTC, datetime + + from osa.domain.semantics.model.schema import Schema + from osa.infrastructure.persistence.repository.schema import ( + PostgresSemanticsSchemaRepository, + ) + + mstore = PostgresMetadataStore(pg_engine, pg_session) + await mstore.ensure_table(SCHEMA_V1, _metadata_fields()) + + fstore = PostgresFeatureStore(pg_engine, pg_session) + await fstore.create_table("cell_classifier", _feature_columns()) + + repo = PostgresSemanticsSchemaRepository(pg_session) + await repo.save( + Schema( + id=SCHEMA_V1, + title="bio_sample", + fields=_metadata_fields(), + created_at=datetime.now(UTC), + ) + ) + + # rec-has-feature: has a feature row with confidence 0.95. + # rec-no-feature: no feature row at all (outer join will produce NULLs). 
+ for rid, sp in [("rec-has-feature", "Homo sapiens"), ("rec-no-feature", "Mus musculus")]: + srn = RecordSRN.parse(f"urn:osa:localhost:rec:{rid}@1") + await seed_record( + pg_engine, + srn=str(srn), + schema_id=SCHEMA_V1.id.root, + schema_version=SCHEMA_V1.version.root, + ) + await mstore.insert(SCHEMA_V1, srn, {"species": sp}) + + has_feature_srn = "urn:osa:localhost:rec:rec-has-feature@1" + await fstore.insert_features("cell_classifier", has_feature_srn, [{"confidence": 0.95}]) + + await pg_session.commit() + + +@pytest.mark.asyncio +class TestOuterJoinNullHandling: + """Records without a feature row must not be silently dropped from NEQ/NOT + predicates on feature columns — the outer join produces NULL, and naive + SQL three-valued logic would exclude them.""" + + async def test_neq_on_feature_column_includes_missing_rows( + self, + pg_engine: AsyncEngine, + pg_session: AsyncSession, + seeded_with_missing_feature_row, + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.NEQ, + value=0.95, + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # rec-no-feature has no feature row → confidence is NULL → "!= 0.95" + # should include it. rec-has-feature has confidence 0.95 → excluded. + assert srns == {"urn:osa:localhost:rec:rec-no-feature@1"} + + async def test_not_on_feature_column_includes_missing_rows( + self, + pg_engine: AsyncEngine, + pg_session: AsyncSession, + seeded_with_missing_feature_row, + ): + read_store = PostgresDiscoveryReadStore(pg_session) + tree = Not( + operand=Predicate( + field=FeatureFieldRef(hook="cell_classifier", column="confidence"), + op=FilterOperator.EQ, + value=0.95, + ) + ) + results = await read_store.search_records( + filter_expr=tree, + schema_id=SCHEMA_V1, + convention_srn=None, + text_fields=[], + q=None, + sort="published_at", + order=SortOrder.DESC, + cursor=None, + limit=10, + field_types=FIELD_TYPES, + ) + srns = {str(r.srn) for r in results} + # Same invariant as NEQ: NOT(confidence = 0.95) must surface the + # record with a missing feature row. 
+ assert srns == {"urn:osa:localhost:rec:rec-no-feature@1"} diff --git a/server/tests/unit/test_metadata_slug.py b/server/tests/unit/test_metadata_slug.py index 59e3130..ea0abf1 100644 --- a/server/tests/unit/test_metadata_slug.py +++ b/server/tests/unit/test_metadata_slug.py @@ -2,7 +2,11 @@ import pytest -from osa.infrastructure.persistence.metadata_table import schema_slug +from osa.infrastructure.persistence.metadata_table import ( + PG_IDENT_MAX_LEN, + check_pg_table_name, + schema_slug, +) class TestSchemaSlug: @@ -38,3 +42,22 @@ def test_rejects_title_with_only_punctuation(self): def test_rejects_title_starting_with_digit(self): with pytest.raises(ValueError): schema_slug("1bio_sample") + + def test_accepts_max_length_schema_identifier(self): + """SchemaIdentifier allows 64-char ids; slug must not reject them.""" + long_id = "a" + "b" * 63 # 64 chars, matches SchemaIdentifier upper bound + assert schema_slug(long_id) == long_id + + def test_rejects_over_max_length(self): + with pytest.raises(ValueError): + schema_slug("a" + "b" * 64) # 65 chars + + +class TestCheckPgTableName: + def test_accepts_table_name_at_pg_limit(self): + name = "a" * PG_IDENT_MAX_LEN + check_pg_table_name(name) # no raise + + def test_rejects_table_name_over_pg_limit(self): + with pytest.raises(ValueError, match="exceeds PG's"): + check_pg_table_name("a" * (PG_IDENT_MAX_LEN + 1)) From 3cabcfac1fe8e225185634de66518a8770dbe290 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 12:06:16 +0100 Subject: [PATCH 7/9] fix: replace assert statements with explicit TypeError exceptions Replace assert statements with proper exception handling to provide clearer error messages when field reference types are unexpected, improving debugging and error reporting. docs: update record mapper comment to reflect synchronous metadata Update comment to accurately describe that metadata projection is written synchronously in the same transaction rather than asynchronously. 
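The assert replacement matters under optimised runtimes: ``python -O`` (and PYTHONOPTIMIZE) strips assert statements entirely, so the old guard vanished and a malformed field ref surfaced later as an opaque error deep in SQL compilation. A condensed sketch of the pattern used in the diff below:

    # An assert disappears under python -O; an explicit raise survives
    # and names the offending type at the boundary where it is detected.
    if not isinstance(predicate.field, FeatureFieldRef):
        raise TypeError(f"Unexpected field ref type: {type(predicate.field).__name__}")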
--- server/osa/infrastructure/persistence/adapter/discovery.py | 6 ++++-- server/osa/infrastructure/persistence/mappers/record.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/server/osa/infrastructure/persistence/adapter/discovery.py b/server/osa/infrastructure/persistence/adapter/discovery.py index c44220c..be9d067 100644 --- a/server/osa/infrastructure/persistence/adapter/discovery.py +++ b/server/osa/infrastructure/persistence/adapter/discovery.py @@ -563,7 +563,8 @@ def _compile_filter_for_features( ) col = metadata_t.c[expr.field.field] return _apply_scalar_op(col, expr.op, expr.value) - assert isinstance(expr.field, FeatureFieldRef) + if not isinstance(expr.field, FeatureFieldRef): + raise TypeError(f"Unexpected field ref type: {type(expr.field).__name__}") if expr.field.hook == this_hook: col = this_ft.c[expr.field.column] else: @@ -640,7 +641,8 @@ def _compile_predicate( col = metadata_t.c[predicate.field.field] return _apply_scalar_op(col, predicate.op, predicate.value) - assert isinstance(predicate.field, FeatureFieldRef) + if not isinstance(predicate.field, FeatureFieldRef): + raise TypeError(f"Unexpected field ref type: {type(predicate.field).__name__}") tbl = feature_joins.get(predicate.field.hook) if tbl is None: raise ValidationError( diff --git a/server/osa/infrastructure/persistence/mappers/record.py b/server/osa/infrastructure/persistence/mappers/record.py index aac791a..a916d75 100644 --- a/server/osa/infrastructure/persistence/mappers/record.py +++ b/server/osa/infrastructure/persistence/mappers/record.py @@ -3,7 +3,8 @@ Feature 076 adds ``schema_id`` + ``schema_version`` columns so a Record's typed linkage is first-class. ``metadata`` remains the canonical JSONB store; the typed ``metadata.<slug>_v<major>`` table is a discovery-optimized -projection maintained asynchronously by ``InsertRecordMetadata``. +projection written synchronously alongside ``records`` in the same UoW +transaction by ``RecordService.publish_record`` / ``bulk_publish``. """ from datetime import datetime From f06549118c169814f0d826f1a50ffe3ee9482fbf Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 12:14:29 +0100 Subject: [PATCH 8/9] feat: improve error handling for date/datetime coercion in metadata store Add ValidationError for malformed ISO date/datetime strings to return 400 status instead of 500, and include record SRN context in error messages for better debugging. --- .../persistence/metadata_store.py | 29 +++++++- .../persistence/test_metadata_store_coerce.py | 73 +++++++++++++++++++ 2 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py diff --git a/server/osa/infrastructure/persistence/metadata_store.py b/server/osa/infrastructure/persistence/metadata_store.py index c44f912..b7996d1 100644 --- a/server/osa/infrastructure/persistence/metadata_store.py +++ b/server/osa/infrastructure/persistence/metadata_store.py @@ -248,7 +248,7 @@ async def insert_many( col = col_by_name.get(k) if col is None: continue - payload[k] = _coerce_value(col, v) + payload[k] = _coerce_value(col, v, record_srn=str(record_srn)) payload["record_srn"] = str(record_srn) payloads.append(payload) @@ -323,19 +323,40 @@ def _alter_add_column_stmt(pg_table: str, col_def: ColumnDef) -> str: ) -def _coerce_value(col: ColumnDef, value: Any) -> Any: +def _coerce_value(col: ColumnDef, value: Any, *, record_srn: str | None = None) -> Any: """Coerce a JSONB-read value to match its typed PG column.
``records.metadata`` is JSONB, so date/datetime fields come back as ISO strings. asyncpg won't auto-parse those for DATE / TIMESTAMP columns — we parse here based on the declared column format. + + Malformed ISO strings are re-raised as ``ValidationError`` so the API + surfaces them as 400 with field context, not a bare 500. """ if value is None: return None if col.json_type == "string" and col.format == "date": - return value if isinstance(value, date) else date.fromisoformat(value) + if isinstance(value, date): + return value + try: + return date.fromisoformat(value) + except (TypeError, ValueError) as exc: + raise ValidationError( + f"Field {col.name!r} expects an ISO-8601 date, got {value!r}" + + (f" (record {record_srn})" if record_srn else ""), + field=col.name, + ) from exc if col.json_type == "string" and col.format == "date-time": - return value if isinstance(value, datetime) else datetime.fromisoformat(value) + if isinstance(value, datetime): + return value + try: + return datetime.fromisoformat(value) + except (TypeError, ValueError) as exc: + raise ValidationError( + f"Field {col.name!r} expects an ISO-8601 date-time, got {value!r}" + + (f" (record {record_srn})" if record_srn else ""), + field=col.name, + ) from exc return value diff --git a/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py b/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py new file mode 100644 index 0000000..833e636 --- /dev/null +++ b/server/tests/unit/infrastructure/persistence/test_metadata_store_coerce.py @@ -0,0 +1,73 @@ +"""Unit tests for ``_coerce_value`` — ensures bad JSONB values surface as +``ValidationError`` (→ 400) instead of propagating raw ``ValueError`` (→ 500). +""" + +from datetime import date, datetime + +import pytest + +from osa.domain.shared.error import ValidationError +from osa.domain.shared.model.hook import ColumnDef +from osa.infrastructure.persistence.metadata_store import _coerce_value + + +def _date_col(name: str = "collected_on") -> ColumnDef: + return ColumnDef(name=name, json_type="string", format="date", required=False) + + +def _datetime_col(name: str = "measured_at") -> ColumnDef: + return ColumnDef(name=name, json_type="string", format="date-time", required=False) + + +class TestCoerceValueDate: + def test_parses_iso_date_string(self): + assert _coerce_value(_date_col(), "2026-04-23") == date(2026, 4, 23) + + def test_passes_through_date(self): + d = date(2026, 4, 23) + assert _coerce_value(_date_col(), d) is d + + def test_none_passes_through(self): + assert _coerce_value(_date_col(), None) is None + + def test_malformed_iso_date_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_date_col("collected_on"), "2026-99-99") + assert exc_info.value.field == "collected_on" + assert "ISO-8601 date" in str(exc_info.value) + + def test_non_string_non_date_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_date_col(), 12345) + assert exc_info.value.field == "collected_on" + + def test_includes_record_srn_in_error_when_provided(self): + with pytest.raises(ValidationError, match="record urn:osa:localhost:rec:abc@1"): + _coerce_value(_date_col(), "not-a-date", record_srn="urn:osa:localhost:rec:abc@1") + + +class TestCoerceValueDatetime: + def test_parses_iso_datetime_string(self): + assert _coerce_value(_datetime_col(), "2026-04-23T10:30:00") == datetime( + 2026, 4, 23, 10, 30, 0 + ) + + def test_passes_through_datetime(self): + dt = 
datetime(2026, 4, 23, 10, 30, 0) + assert _coerce_value(_datetime_col(), dt) is dt + + def test_malformed_iso_datetime_raises_validation_error(self): + with pytest.raises(ValidationError) as exc_info: + _coerce_value(_datetime_col("measured_at"), "not-a-datetime") + assert exc_info.value.field == "measured_at" + assert "ISO-8601 date-time" in str(exc_info.value) + + +class TestCoerceValueOther: + def test_text_passthrough(self): + col = ColumnDef(name="species", json_type="string", format=None, required=False) + assert _coerce_value(col, "Homo sapiens") == "Homo sapiens" + + def test_number_passthrough(self): + col = ColumnDef(name="resolution", json_type="number", format=None, required=False) + assert _coerce_value(col, 1.5) == 1.5 From 2ad7d8b7013d1235fe3e3de015fc6d47363e5aa4 Mon Sep 17 00:00:00 2001 From: Rory Byrne Date: Thu, 23 Apr 2026 13:24:56 +0100 Subject: [PATCH 9/9] feat: add HookName type with 40-char limit for PostgreSQL compatibility Introduce HookName type to enforce 40-character limit on hook names, ensuring derived identifiers like fk_features_{name}_record_srn stay within PostgreSQL's 63-character identifier limit. Update HookDefinition to use HookName instead of PgIdentifier while keeping ColumnDef unchanged. --- server/osa/domain/shared/model/hook.py | 10 +++++- .../unit/domain/shared/test_hook_models.py | 35 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/server/osa/domain/shared/model/hook.py b/server/osa/domain/shared/model/hook.py index 49a9ab7..3691dbc 100644 --- a/server/osa/domain/shared/model/hook.py +++ b/server/osa/domain/shared/model/hook.py @@ -16,6 +16,14 @@ # Safe for use as PG identifiers, file path components, and env var values. PgIdentifier = Annotated[str, Field(pattern=r"^[a-z][a-z0-9_]{0,62}$")] +# Hook names compose into PG identifiers alongside fixed prefixes/suffixes — +# notably the per-hook FK constraint ``fk_features_{name}_record_srn`` (23 +# chars of overhead). PG's identifier limit is 63 chars, so cap hook names at +# 40 to keep every derived identifier inside the limit without surprise +# truncation. Column names use plain ``PgIdentifier`` because they don't get +# composed into longer names. 
+HookName = Annotated[str, Field(pattern=r"^[a-z][a-z0-9_]{0,39}$")] + _MEMORY_RE = re.compile(r"^(\d+(?:\.\d+)?)(g|m|k)?i?$") _GIB = 1024 * 1024 * 1024 @@ -115,7 +123,7 @@ class TableFeatureSpec(FeatureSpec): class HookDefinition(ValueObject): """Complete specification for a hook: how it runs + what it produces.""" - name: PgIdentifier + name: HookName runtime: Annotated[OciConfig, Field(discriminator="type")] feature: Annotated[TableFeatureSpec, Field(discriminator="kind")] diff --git a/server/tests/unit/domain/shared/test_hook_models.py b/server/tests/unit/domain/shared/test_hook_models.py index cf669b0..e7fc144 100644 --- a/server/tests/unit/domain/shared/test_hook_models.py +++ b/server/tests/unit/domain/shared/test_hook_models.py @@ -359,3 +359,38 @@ def test_valid_names_accepted(self): col = ColumnDef(name=name, json_type="number", required=True) assert col.name == name + + def test_hook_name_accepts_40_chars(self): + """Hook names must fit in derived identifiers like + ``fk_features_{name}_record_srn`` — 23 chars overhead + up to 40-char + hook = 63-char max, which is PG's identifier limit.""" + from osa.domain.shared.model.hook import HookDefinition, OciConfig, TableFeatureSpec + + forty = "a" + "b" * 39 + hook = HookDefinition( + name=forty, + runtime=OciConfig(image="img:v1", digest="sha256:abc"), + feature=TableFeatureSpec(cardinality="one", columns=[]), + ) + assert hook.name == forty + + def test_hook_name_rejects_over_40_chars(self): + """41+ char names would produce an FK name exceeding PG's 63-char + identifier limit.""" + from osa.domain.shared.model.hook import HookDefinition, OciConfig, TableFeatureSpec + + with pytest.raises(ValidationError): + HookDefinition( + name="a" + "b" * 40, # 41 chars + runtime=OciConfig(image="img:v1", digest="sha256:abc"), + feature=TableFeatureSpec(cardinality="one", columns=[]), + ) + + def test_column_name_still_accepts_63_chars(self): + """ColumnDef uses plain PgIdentifier — columns don't compose into + longer derived identifiers, so the full 63-char PG limit is fine.""" + from osa.domain.shared.model.hook import ColumnDef + + sixty_three = "a" + "b" * 62 + col = ColumnDef(name=sixty_three, json_type="number", required=True) + assert col.name == sixty_three
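A quick standalone check of the arithmetic behind the 40-char cap (sketch only, not part of the patch):

    # "fk_features_" (12 chars) + "_record_srn" (11 chars) leave 23 chars of
    # fixed overhead around the hook name in the derived FK identifier.
    template = "fk_features_{hook}_record_srn"
    overhead = len(template) - len("{hook}")  # 29 - 6 = 23
    assert overhead + 40 == 63  # longest HookName lands exactly on PG's limit
    assert overhead + 41 > 63   # one char more and PG would silently truncate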