From a8f2013cc98900061374503c79f4573e5ee3af24 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Fri, 6 Feb 2026 13:34:02 +0100 Subject: [PATCH 01/17] feat: add queue, nosql, markup, tesseract plugin boilerplates Add empty shell plugins for message queues (Kafka, RabbitMQ, SQS, Redis Streams), NoSQL databases (MongoDB, DynamoDB, Firestore), markup/tabular parsing (HTML, XML, JSON, CSV, TSV), and Tesseract OCR. Register all four with the server engine. Move the packages table from the root README to packages/README.md with white-paper-style descriptions for each plugin category. Co-Authored-By: Claude Opus 4.6 --- README.md | 12 +-- package-lock.json | 81 +++++++++++++++ packages/README.md | 99 ++++++++++--------- packages/nvisy-plugin-markup/package.json | 30 ++++++ packages/nvisy-plugin-markup/src/index.ts | 13 +++ packages/nvisy-plugin-markup/tsconfig.json | 13 +++ packages/nvisy-plugin-markup/tsup.config.ts | 22 +++++ packages/nvisy-plugin-nosql/package.json | 30 ++++++ packages/nvisy-plugin-nosql/src/index.ts | 13 +++ packages/nvisy-plugin-nosql/tsconfig.json | 13 +++ packages/nvisy-plugin-nosql/tsup.config.ts | 22 +++++ packages/nvisy-plugin-queue/package.json | 30 ++++++ packages/nvisy-plugin-queue/src/index.ts | 13 +++ packages/nvisy-plugin-queue/tsconfig.json | 13 +++ packages/nvisy-plugin-queue/tsup.config.ts | 22 +++++ packages/nvisy-plugin-tesseract/package.json | 30 ++++++ packages/nvisy-plugin-tesseract/src/index.ts | 13 +++ packages/nvisy-plugin-tesseract/tsconfig.json | 13 +++ .../nvisy-plugin-tesseract/tsup.config.ts | 22 +++++ packages/nvisy-server/package.json | 4 + .../service/{engine.ts => engine-factory.ts} | 8 ++ packages/nvisy-server/src/service/index.ts | 2 +- 22 files changed, 462 insertions(+), 56 deletions(-) create mode 100644 packages/nvisy-plugin-markup/package.json create mode 100644 packages/nvisy-plugin-markup/src/index.ts create mode 100644 packages/nvisy-plugin-markup/tsconfig.json create mode 100644 packages/nvisy-plugin-markup/tsup.config.ts create mode 100644 packages/nvisy-plugin-nosql/package.json create mode 100644 packages/nvisy-plugin-nosql/src/index.ts create mode 100644 packages/nvisy-plugin-nosql/tsconfig.json create mode 100644 packages/nvisy-plugin-nosql/tsup.config.ts create mode 100644 packages/nvisy-plugin-queue/package.json create mode 100644 packages/nvisy-plugin-queue/src/index.ts create mode 100644 packages/nvisy-plugin-queue/tsconfig.json create mode 100644 packages/nvisy-plugin-queue/tsup.config.ts create mode 100644 packages/nvisy-plugin-tesseract/package.json create mode 100644 packages/nvisy-plugin-tesseract/src/index.ts create mode 100644 packages/nvisy-plugin-tesseract/tsconfig.json create mode 100644 packages/nvisy-plugin-tesseract/tsup.config.ts rename packages/nvisy-server/src/service/{engine.ts => engine-factory.ts} (76%) diff --git a/README.md b/README.md index 1783c89..c4cfb55 100644 --- a/README.md +++ b/README.md @@ -10,17 +10,7 @@ all flow through typed, validated primitives with full lineage tracking. 
## Packages -| Package | Description | -|---------|-------------| -| [`nvisy-core`](packages/nvisy-core/) | Core data types, errors, and utilities | -| [`nvisy-runtime`](packages/nvisy-runtime/) | Graph definition, DAG compiler, execution engine | -| [`nvisy-plugin-ai`](packages/nvisy-plugin-ai/) | AI provider integrations (OpenAI, Anthropic, Google) | -| [`nvisy-plugin-object`](packages/nvisy-plugin-object/) | Object store integrations (S3, GCS, Parquet, JSONL, CSV) | -| [`nvisy-plugin-sql`](packages/nvisy-plugin-sql/) | SQL provider integrations (Postgres, MySQL, MSSQL) | -| [`nvisy-plugin-vector`](packages/nvisy-plugin-vector/) | Vector database integrations (Qdrant, Milvus, Weaviate, Pinecone) | -| [`nvisy-server`](packages/nvisy-server/) | HTTP execution worker | - -See [packages/](packages/README.md) for detailed descriptions. +See [packages/](packages/README.md) for the full package listing and detailed descriptions. ## Quick Start diff --git a/package-lock.json b/package-lock.json index ccab769..bd49562 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2410,6 +2410,14 @@ "resolved": "packages/nvisy-plugin-ai", "link": true }, + "node_modules/@nvisy/plugin-markup": { + "resolved": "packages/nvisy-plugin-markup", + "link": true + }, + "node_modules/@nvisy/plugin-nosql": { + "resolved": "packages/nvisy-plugin-nosql", + "link": true + }, "node_modules/@nvisy/plugin-object": { "resolved": "packages/nvisy-plugin-object", "link": true @@ -2418,10 +2426,18 @@ "resolved": "packages/nvisy-plugin-pandoc", "link": true }, + "node_modules/@nvisy/plugin-queue": { + "resolved": "packages/nvisy-plugin-queue", + "link": true + }, "node_modules/@nvisy/plugin-sql": { "resolved": "packages/nvisy-plugin-sql", "link": true }, + "node_modules/@nvisy/plugin-tesseract": { + "resolved": "packages/nvisy-plugin-tesseract", + "link": true + }, "node_modules/@nvisy/plugin-vector": { "resolved": "packages/nvisy-plugin-vector", "link": true @@ -7409,6 +7425,30 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-markup": { + "name": "@nvisy/plugin-markup", + "version": "0.1.0", + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } + }, + "packages/nvisy-plugin-nosql": { + "name": "@nvisy/plugin-nosql", + "version": "0.1.0", + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } + }, "packages/nvisy-plugin-object": { "name": "@nvisy/plugin-object", "version": "0.1.0", @@ -7424,6 +7464,19 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-ocr": { + "name": "@nvisy/plugin-ocr", + "version": "0.1.0", + "extraneous": true, + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } + }, "packages/nvisy-plugin-pandoc": { "name": "@nvisy/plugin-pandoc", "version": "0.1.0", @@ -7436,6 +7489,18 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-queue": { + "name": "@nvisy/plugin-queue", + "version": "0.1.0", + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } + }, "packages/nvisy-plugin-sql": { "name": "@nvisy/plugin-sql", "version": "0.1.0", @@ -7456,6 +7521,18 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-tesseract": { + "name": "@nvisy/plugin-tesseract", + "version": "0.1.0", + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + 
"engines": { + "node": ">=22.0.0" + } + }, "packages/nvisy-plugin-vector": { "name": "@nvisy/plugin-vector", "version": "0.1.0", @@ -7507,9 +7584,13 @@ "@logtape/redaction": "^2.0.2", "@nvisy/core": "*", "@nvisy/plugin-ai": "*", + "@nvisy/plugin-markup": "*", + "@nvisy/plugin-nosql": "*", "@nvisy/plugin-object": "*", "@nvisy/plugin-pandoc": "*", + "@nvisy/plugin-queue": "*", "@nvisy/plugin-sql": "*", + "@nvisy/plugin-tesseract": "*", "@nvisy/plugin-vector": "*", "@nvisy/runtime": "*", "@scalar/hono-api-reference": "^0.9.40", diff --git a/packages/README.md b/packages/README.md index 0b3b5ba..9719434 100644 --- a/packages/README.md +++ b/packages/README.md @@ -2,47 +2,58 @@ [![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) -The monorepo is organized around a shared core, a runtime engine, a set of -plugin packages, and an HTTP server. `nvisy-core` is the shared foundation -with no internal dependencies. - -## nvisy-core - -Shared primitives, type system, validation, error taxonomy, and base -interfaces for sources, sinks, and actions. Also houses core observability -utilities (structured logging, metrics, tracing). Every other package depends -on this one. - -## nvisy-runtime - -Graph definition, JSON parser, DAG compiler, and execution engine. Parses JSON -graph definitions into an immutable `ExecutionPlan`, then executes it — -walking the DAG in topological order, managing concurrency via Effection -structured concurrency, enforcing per-connector rate limits, retrying failed nodes with -configurable backoff, and emitting runtime metrics and OpenTelemetry traces. - -## nvisy-plugin-sql - -Source and Sink implementations for relational databases. Targets PostgreSQL -and MySQL. Handles connection pooling, query generation, type mapping, and -batch insert/upsert operations. - -## nvisy-plugin-object - -Source and Sink implementations for object stores and file formats. Targets S3, -GCS, Parquet, JSONL, and CSV. Handles multipart uploads, streaming reads, -prefix-based listing, schema inference, and chunked reading. - -## nvisy-plugin-vector - -Source and Sink implementations for vector databases. Targets Pinecone, Qdrant, -Milvus, Weaviate, and pgvector. Handles collection/index management, upsert -with metadata, batch operations, and dimensionality validation. - -## nvisy-server - -HTTP server built on Hono. Exposes a REST API for graph CRUD, run -management, connector health checks, and lineage queries. Includes a cron -scheduler, webhook-based event triggers, and server-level observability -(request logging, health endpoints, metric export). - +## Core infrastructure + +The runtime architecture follows a layered separation of concerns. At the +foundation, a shared core library defines the type system, error taxonomy, and +abstract interfaces that all other packages depend on. Above it, the runtime +engine implements a DAG-based execution model: pipeline definitions are parsed +from declarative JSON graphs, compiled into immutable execution plans, and +evaluated in topological order with structured concurrency, per-node retry +policies, and full lineage tracking across every data item. The server package +exposes this engine over HTTP, providing a REST API for pipeline management, +execution, and observability. 
+
+| Package | Description |
+|---------|-------------|
+| [`nvisy-core`](nvisy-core/) | Core data types, errors, and utilities |
+| [`nvisy-runtime`](nvisy-runtime/) | Graph definition, DAG compiler, execution engine |
+| [`nvisy-server`](nvisy-server/) | HTTP execution worker |
+
+## Provider plugins
+
+Provider plugins supply the I/O boundary of a pipeline. Each plugin implements
+one or more _providers_ (authenticated clients to external systems) and
+_streams_ (source or target adapters that read from or write to those systems
+using the provider's client). This design decouples credential management from
+data flow: a single provider connection can back multiple streams within the
+same pipeline, and streams are reusable across providers that share a common
+client interface. Provider plugins cover the six major categories of external
+systems: relational databases, document stores, object stores, vector
+databases, message queues, and AI model endpoints.
+
+| Package | Description |
+|---------|-------------|
+| [`nvisy-plugin-ai`](nvisy-plugin-ai/) | AI provider integrations (OpenAI, Anthropic, Google) |
+| [`nvisy-plugin-nosql`](nvisy-plugin-nosql/) | NoSQL database integrations (MongoDB, DynamoDB, Firestore) |
+| [`nvisy-plugin-object`](nvisy-plugin-object/) | Object store integrations (S3, GCS, Azure Blob) |
+| [`nvisy-plugin-queue`](nvisy-plugin-queue/) | Message queue integrations (Kafka, RabbitMQ, SQS, Redis Streams) |
+| [`nvisy-plugin-sql`](nvisy-plugin-sql/) | SQL database integrations (Postgres, MySQL, MSSQL) |
+| [`nvisy-plugin-vector`](nvisy-plugin-vector/) | Vector database integrations (Pinecone, Qdrant, Milvus, Weaviate, pgvector) |
+
+## Action plugins
+
+Action plugins operate on data in-flight without requiring external service
+credentials. They implement pure transformations: a function from one typed data
+item to another, executed locally within the pipeline process. This category
+includes format conversion, structured parsing, and content extraction. Because
+actions carry no provider dependency, they compose freely between any source and
+target, and introduce no additional authentication surface. The runtime
+guarantees type safety at the graph edges — an action's input and output types
+must match the adjacent nodes in the DAG.
+ +| Package | Description | +|---------|-------------| +| [`nvisy-plugin-markup`](nvisy-plugin-markup/) | HTML, XML, JSON, CSV, TSV, and plain text parsing | +| [`nvisy-plugin-tesseract`](nvisy-plugin-tesseract/) | Optical character recognition (Tesseract) | +| [`nvisy-plugin-pandoc`](nvisy-plugin-pandoc/) | Document format conversion (Pandoc) | diff --git a/packages/nvisy-plugin-markup/package.json b/packages/nvisy-plugin-markup/package.json new file mode 100644 index 0000000..fc9caca --- /dev/null +++ b/packages/nvisy-plugin-markup/package.json @@ -0,0 +1,30 @@ +{ + "name": "@nvisy/plugin-markup", + "version": "0.1.0", + "description": "Markup, tabular, and text format parsing for the Nvisy platform", + "type": "module", + "exports": { + ".": { + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist" + ], + "scripts": { + "build": "tsup", + "build:watch": "tsup --watch", + "clean": "rimraf dist", + "typecheck": "tsc -b" + }, + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } +} diff --git a/packages/nvisy-plugin-markup/src/index.ts b/packages/nvisy-plugin-markup/src/index.ts new file mode 100644 index 0000000..2020a01 --- /dev/null +++ b/packages/nvisy-plugin-markup/src/index.ts @@ -0,0 +1,13 @@ +/** + * @module @nvisy/plugin-markup + * + * Markup and text format parsing plugin for the Nvisy runtime. + * + * Provides actions for parsing and extracting structured data from + * HTML, XML, JSON, CSV, TSV, and plain text formats. + */ + +import { Plugin } from "@nvisy/core"; + +/** Markup parsing plugin instance. */ +export const markupPlugin = Plugin.define("markup"); diff --git a/packages/nvisy-plugin-markup/tsconfig.json b/packages/nvisy-plugin-markup/tsconfig.json new file mode 100644 index 0000000..c91a2dd --- /dev/null +++ b/packages/nvisy-plugin-markup/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + /* Emit */ + "outDir": "./dist", + "rootDir": "./src", + "composite": true + }, + /* Scope */ + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"], + "references": [{ "path": "../nvisy-core" }] +} diff --git a/packages/nvisy-plugin-markup/tsup.config.ts b/packages/nvisy-plugin-markup/tsup.config.ts new file mode 100644 index 0000000..d68a5db --- /dev/null +++ b/packages/nvisy-plugin-markup/tsup.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + /* Entry */ + entry: ["src/index.ts"], + format: ["esm"], + + /* Output */ + outDir: "dist", + dts: { compilerOptions: { composite: false } }, + sourcemap: true, + clean: true, + + /* Optimization */ + splitting: false, + treeshake: true, + skipNodeModulesBundle: true, + + /* Environment */ + platform: "node", + target: "es2024", +}); diff --git a/packages/nvisy-plugin-nosql/package.json b/packages/nvisy-plugin-nosql/package.json new file mode 100644 index 0000000..f2cbaa7 --- /dev/null +++ b/packages/nvisy-plugin-nosql/package.json @@ -0,0 +1,30 @@ +{ + "name": "@nvisy/plugin-nosql", + "version": "0.1.0", + "description": "NoSQL database integrations for the Nvisy platform", + "type": "module", + "exports": { + ".": { + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist" + ], + "scripts": { + "build": "tsup", + "build:watch": "tsup --watch", + "clean": "rimraf dist", + "typecheck": "tsc -b" + }, + "dependencies": { + 
"@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } +} diff --git a/packages/nvisy-plugin-nosql/src/index.ts b/packages/nvisy-plugin-nosql/src/index.ts new file mode 100644 index 0000000..ec4ee10 --- /dev/null +++ b/packages/nvisy-plugin-nosql/src/index.ts @@ -0,0 +1,13 @@ +/** + * @module @nvisy/plugin-nosql + * + * NoSQL database plugin for the Nvisy runtime. + * + * Provides source and target streams for document databases + * (MongoDB, DynamoDB, Firestore). + */ + +import { Plugin } from "@nvisy/core"; + +/** NoSQL database plugin instance. */ +export const nosqlPlugin = Plugin.define("nosql"); diff --git a/packages/nvisy-plugin-nosql/tsconfig.json b/packages/nvisy-plugin-nosql/tsconfig.json new file mode 100644 index 0000000..c91a2dd --- /dev/null +++ b/packages/nvisy-plugin-nosql/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + /* Emit */ + "outDir": "./dist", + "rootDir": "./src", + "composite": true + }, + /* Scope */ + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"], + "references": [{ "path": "../nvisy-core" }] +} diff --git a/packages/nvisy-plugin-nosql/tsup.config.ts b/packages/nvisy-plugin-nosql/tsup.config.ts new file mode 100644 index 0000000..d68a5db --- /dev/null +++ b/packages/nvisy-plugin-nosql/tsup.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + /* Entry */ + entry: ["src/index.ts"], + format: ["esm"], + + /* Output */ + outDir: "dist", + dts: { compilerOptions: { composite: false } }, + sourcemap: true, + clean: true, + + /* Optimization */ + splitting: false, + treeshake: true, + skipNodeModulesBundle: true, + + /* Environment */ + platform: "node", + target: "es2024", +}); diff --git a/packages/nvisy-plugin-queue/package.json b/packages/nvisy-plugin-queue/package.json new file mode 100644 index 0000000..a481298 --- /dev/null +++ b/packages/nvisy-plugin-queue/package.json @@ -0,0 +1,30 @@ +{ + "name": "@nvisy/plugin-queue", + "version": "0.1.0", + "description": "Message queue integrations for the Nvisy platform", + "type": "module", + "exports": { + ".": { + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist" + ], + "scripts": { + "build": "tsup", + "build:watch": "tsup --watch", + "clean": "rimraf dist", + "typecheck": "tsc -b" + }, + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } +} diff --git a/packages/nvisy-plugin-queue/src/index.ts b/packages/nvisy-plugin-queue/src/index.ts new file mode 100644 index 0000000..d66e2d9 --- /dev/null +++ b/packages/nvisy-plugin-queue/src/index.ts @@ -0,0 +1,13 @@ +/** + * @module @nvisy/plugin-queue + * + * Message queue plugin for the Nvisy runtime. + * + * Provides source and target streams for message queue systems + * (Kafka, RabbitMQ, SQS, Redis Streams). + */ + +import { Plugin } from "@nvisy/core"; + +/** Message queue plugin instance. 
*/ +export const queuePlugin = Plugin.define("queue"); diff --git a/packages/nvisy-plugin-queue/tsconfig.json b/packages/nvisy-plugin-queue/tsconfig.json new file mode 100644 index 0000000..c91a2dd --- /dev/null +++ b/packages/nvisy-plugin-queue/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + /* Emit */ + "outDir": "./dist", + "rootDir": "./src", + "composite": true + }, + /* Scope */ + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"], + "references": [{ "path": "../nvisy-core" }] +} diff --git a/packages/nvisy-plugin-queue/tsup.config.ts b/packages/nvisy-plugin-queue/tsup.config.ts new file mode 100644 index 0000000..d68a5db --- /dev/null +++ b/packages/nvisy-plugin-queue/tsup.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + /* Entry */ + entry: ["src/index.ts"], + format: ["esm"], + + /* Output */ + outDir: "dist", + dts: { compilerOptions: { composite: false } }, + sourcemap: true, + clean: true, + + /* Optimization */ + splitting: false, + treeshake: true, + skipNodeModulesBundle: true, + + /* Environment */ + platform: "node", + target: "es2024", +}); diff --git a/packages/nvisy-plugin-tesseract/package.json b/packages/nvisy-plugin-tesseract/package.json new file mode 100644 index 0000000..aabba40 --- /dev/null +++ b/packages/nvisy-plugin-tesseract/package.json @@ -0,0 +1,30 @@ +{ + "name": "@nvisy/plugin-tesseract", + "version": "0.1.0", + "description": "Optical character recognition for the Nvisy platform", + "type": "module", + "exports": { + ".": { + "source": "./src/index.ts", + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "files": [ + "dist" + ], + "scripts": { + "build": "tsup", + "build:watch": "tsup --watch", + "clean": "rimraf dist", + "typecheck": "tsc -b" + }, + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } +} diff --git a/packages/nvisy-plugin-tesseract/src/index.ts b/packages/nvisy-plugin-tesseract/src/index.ts new file mode 100644 index 0000000..8660d15 --- /dev/null +++ b/packages/nvisy-plugin-tesseract/src/index.ts @@ -0,0 +1,13 @@ +/** + * @module @nvisy/plugin-tesseract + * + * Optical character recognition plugin for the Nvisy runtime. + * + * Provides actions for extracting text from images and scanned + * documents using Tesseract. + */ + +import { Plugin } from "@nvisy/core"; + +/** Tesseract OCR plugin instance. 
*/ +export const tesseractPlugin = Plugin.define("tesseract"); diff --git a/packages/nvisy-plugin-tesseract/tsconfig.json b/packages/nvisy-plugin-tesseract/tsconfig.json new file mode 100644 index 0000000..c91a2dd --- /dev/null +++ b/packages/nvisy-plugin-tesseract/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + /* Emit */ + "outDir": "./dist", + "rootDir": "./src", + "composite": true + }, + /* Scope */ + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"], + "references": [{ "path": "../nvisy-core" }] +} diff --git a/packages/nvisy-plugin-tesseract/tsup.config.ts b/packages/nvisy-plugin-tesseract/tsup.config.ts new file mode 100644 index 0000000..d68a5db --- /dev/null +++ b/packages/nvisy-plugin-tesseract/tsup.config.ts @@ -0,0 +1,22 @@ +import { defineConfig } from "tsup"; + +export default defineConfig({ + /* Entry */ + entry: ["src/index.ts"], + format: ["esm"], + + /* Output */ + outDir: "dist", + dts: { compilerOptions: { composite: false } }, + sourcemap: true, + clean: true, + + /* Optimization */ + splitting: false, + treeshake: true, + skipNodeModulesBundle: true, + + /* Environment */ + platform: "node", + target: "es2024", +}); diff --git a/packages/nvisy-server/package.json b/packages/nvisy-server/package.json index 5d5ee2a..eb8ed30 100644 --- a/packages/nvisy-server/package.json +++ b/packages/nvisy-server/package.json @@ -25,8 +25,12 @@ "@logtape/redaction": "^2.0.2", "@nvisy/core": "*", "@nvisy/plugin-ai": "*", + "@nvisy/plugin-markup": "*", + "@nvisy/plugin-nosql": "*", "@nvisy/plugin-object": "*", + "@nvisy/plugin-tesseract": "*", "@nvisy/plugin-pandoc": "*", + "@nvisy/plugin-queue": "*", "@nvisy/plugin-sql": "*", "@nvisy/plugin-vector": "*", "@nvisy/runtime": "*", diff --git a/packages/nvisy-server/src/service/engine.ts b/packages/nvisy-server/src/service/engine-factory.ts similarity index 76% rename from packages/nvisy-server/src/service/engine.ts rename to packages/nvisy-server/src/service/engine-factory.ts index fd7b4ce..d26204c 100644 --- a/packages/nvisy-server/src/service/engine.ts +++ b/packages/nvisy-server/src/service/engine-factory.ts @@ -1,7 +1,11 @@ import { getLogger } from "@logtape/logtape"; import { aiPlugin } from "@nvisy/plugin-ai"; +import { markupPlugin } from "@nvisy/plugin-markup"; +import { nosqlPlugin } from "@nvisy/plugin-nosql"; import { objectPlugin } from "@nvisy/plugin-object"; +import { tesseractPlugin } from "@nvisy/plugin-tesseract"; import { pandocPlugin } from "@nvisy/plugin-pandoc"; +import { queuePlugin } from "@nvisy/plugin-queue"; import { sqlPlugin } from "@nvisy/plugin-sql"; import { vectorPlugin } from "@nvisy/plugin-vector"; import { Engine } from "@nvisy/runtime"; @@ -15,8 +19,12 @@ export function createEngine(): Engine { try { const engine = new Engine() .register(aiPlugin) + .register(markupPlugin) + .register(nosqlPlugin) .register(objectPlugin) + .register(tesseractPlugin) .register(pandocPlugin) + .register(queuePlugin) .register(sqlPlugin) .register(vectorPlugin); diff --git a/packages/nvisy-server/src/service/index.ts b/packages/nvisy-server/src/service/index.ts index 3dcb0a4..e013301 100644 --- a/packages/nvisy-server/src/service/index.ts +++ b/packages/nvisy-server/src/service/index.ts @@ -1 +1 @@ -export { createEngine } from "./engine.js"; +export { createEngine } from "./engine-factory.js"; From ecb569ac5e5786d41e10397159d8b819e599e39c Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sat, 7 Feb 2026 09:00:15 +0100 Subject: [PATCH 02/17] refactor(core): 
redesign Element hierarchy with typed subclasses and move shared types to types.ts Extract type-specific fields from the flat Element base class into dedicated subclasses (TableElement, FormElement, EmailElement, CompositeElement), bundle OCR/extraction provenance into ElementProvenance, add sourceTag/textAsHtml/ pageName for source fidelity, add FormKeyValuePair for structured form data, introduce EmailType ontology category, and move JsonValue/Metadata to types.ts. Co-Authored-By: Claude Opus 4.6 --- packages/nvisy-core/src/datatypes/data.ts | 23 +- .../nvisy-core/src/datatypes/document.test.ts | 334 ++++++++-------- packages/nvisy-core/src/datatypes/document.ts | 148 +++---- .../src/datatypes/embedding.test.ts | 50 +++ packages/nvisy-core/src/datatypes/index.ts | 27 +- .../src/documents/coordinates.test.ts | 148 +++++++ .../nvisy-core/src/documents/coordinates.ts | 174 ++++++++ .../nvisy-core/src/documents/elements.test.ts | 310 +++++++++++++++ packages/nvisy-core/src/documents/elements.ts | 374 ++++++++++++++++++ packages/nvisy-core/src/documents/index.ts | 69 ++++ .../nvisy-core/src/documents/ontology.test.ts | 137 +++++++ packages/nvisy-core/src/documents/ontology.ts | 125 ++++++ packages/nvisy-core/src/errors/index.ts | 2 + packages/nvisy-core/src/errors/timeout.ts | 15 + packages/nvisy-core/src/index.ts | 71 ++-- .../nvisy-core/src/{loaders => }/loader.ts | 4 +- packages/nvisy-core/src/plugin.ts | 2 +- packages/nvisy-core/src/types.ts | 23 ++ packages/nvisy-plugin-ai/src/actions/chunk.ts | 99 ++--- .../nvisy-plugin-ai/src/actions/enrich.ts | 2 +- .../nvisy-plugin-ai/src/actions/partition.ts | 4 +- packages/nvisy-plugin-markup/src/index.ts | 7 +- .../src/loaders/index.ts | 2 - .../src/loaders/plaintext.test.ts | 7 +- .../src/loaders/plaintext.ts | 3 +- packages/nvisy-runtime/src/engine/nodes.ts | 5 +- .../src/service/engine-factory.ts | 2 +- 27 files changed, 1807 insertions(+), 360 deletions(-) create mode 100644 packages/nvisy-core/src/datatypes/embedding.test.ts create mode 100644 packages/nvisy-core/src/documents/coordinates.test.ts create mode 100644 packages/nvisy-core/src/documents/coordinates.ts create mode 100644 packages/nvisy-core/src/documents/elements.test.ts create mode 100644 packages/nvisy-core/src/documents/elements.ts create mode 100644 packages/nvisy-core/src/documents/index.ts create mode 100644 packages/nvisy-core/src/documents/ontology.test.ts create mode 100644 packages/nvisy-core/src/documents/ontology.ts create mode 100644 packages/nvisy-core/src/errors/timeout.ts rename packages/nvisy-core/src/{loaders => }/loader.ts (96%) rename packages/{nvisy-core => nvisy-plugin-markup}/src/loaders/index.ts (53%) rename packages/{nvisy-core => nvisy-plugin-markup}/src/loaders/plaintext.test.ts (95%) rename packages/{nvisy-core => nvisy-plugin-markup}/src/loaders/plaintext.ts (90%) diff --git a/packages/nvisy-core/src/datatypes/data.ts b/packages/nvisy-core/src/datatypes/data.ts index e07851b..d0aaa0c 100644 --- a/packages/nvisy-core/src/datatypes/data.ts +++ b/packages/nvisy-core/src/datatypes/data.ts @@ -1,25 +1,4 @@ -/** - * A JSON-compatible value. - * - * Mirrors the types that `JSON.parse` can return and `JSON.stringify` - * can accept, making it safe for serialisation boundaries (APIs, - * databases, message queues). - */ -export type JsonValue = - | string - | number - | boolean - | null - | JsonValue[] - | { [key: string]: JsonValue }; - -/** - * Key-value metadata bag attached to {@link Data} items. 
- * - * All values must be JSON-serialisable so metadata can travel across - * process boundaries without lossy conversion. - */ -export type Metadata = Record; +import type { Metadata } from "../types.js"; /** * Abstract base class for all data types flowing through the pipeline. diff --git a/packages/nvisy-core/src/datatypes/document.test.ts b/packages/nvisy-core/src/datatypes/document.test.ts index 4846366..64eabd5 100644 --- a/packages/nvisy-core/src/datatypes/document.test.ts +++ b/packages/nvisy-core/src/datatypes/document.test.ts @@ -1,195 +1,207 @@ import { describe, expect, it } from "vitest"; -import type { DocumentPage, DocumentSection } from "./document.js"; +import { Element } from "../documents/elements.js"; import { Document } from "./document.js"; describe("Document", () => { - it("stores content and has no pages by default", () => { + it("stores content and has no elements by default", () => { const doc = new Document("hello world"); expect(doc.content).toBe("hello world"); - expect(doc.pages).toBeUndefined(); - expect(doc.flatElements).toEqual([]); + expect(doc.elements).toBeUndefined(); }); - it("constructor accepts pages in options", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - title: "Intro", - elements: [{ type: "paragraph", text: "Hello" }], - }, - ], - }, - ]; - const doc = new Document("Hello", { pages }); + it("constructor accepts elements in options", () => { + const el = new Element({ + type: "narrative-text", + text: "Hello", + }); + const doc = new Document("Hello", { elements: [el] }); expect(doc.content).toBe("Hello"); - expect(doc.pages).toEqual(pages); + expect(doc.elements).toHaveLength(1); + expect(doc.elements![0]!.text).toBe("Hello"); + }); + + describe("title", () => { + it("is undefined by default", () => { + const doc = new Document("text"); + expect(doc.title).toBeUndefined(); + }); + + it("is set via constructor options", () => { + const doc = new Document("text", { title: "Quarterly Report" }); + expect(doc.title).toBe("Quarterly Report"); + }); + + it("is preserved by fromElements", () => { + const el = new Element({ + type: "narrative-text", + text: "hi", + }); + const doc = Document.fromElements([el], { + sourceType: "html", + title: "My Page", + }); + expect(doc.title).toBe("My Page"); + expect(doc.sourceType).toBe("html"); + }); }); - describe("fromPages", () => { + describe("languages", () => { + it("is empty when there are no elements", () => { + const doc = new Document("text"); + expect(doc.languages).toEqual([]); + }); + + it("is empty when no elements have languages", () => { + const doc = new Document("text", { + elements: [ + new Element({ + type: "narrative-text", + text: "hello", + }), + ], + }); + expect(doc.languages).toEqual([]); + }); + + it("collects unique languages from all elements", () => { + const doc = new Document("text", { + elements: [ + new Element({ + type: "narrative-text", + text: "hello", + languages: ["en"], + }), + new Element({ + type: "narrative-text", + text: "hallo", + languages: ["de", "en"], + }), + new Element({ + type: "narrative-text", + text: "bonjour", + languages: ["fr"], + }), + ], + }); + expect(doc.languages).toEqual(["en", "de", "fr"]); + }); + + it("skips elements without languages", () => { + const doc = new Document("text", { + elements: [ + new Element({ + type: "narrative-text", + text: "no lang", + }), + new Element({ + type: "narrative-text", + text: "has lang", + languages: ["es"], + }), + ], + }); + expect(doc.languages).toEqual(["es"]); + }); + }); 
+ + describe("fromElements", () => { it("derives content from element texts joined with \\n\\n", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - elements: [ - { type: "heading", text: "Title", level: 1 }, - { type: "paragraph", text: "First paragraph." }, - ], - }, - ], - }, - { - pageNumber: 2, - sections: [ - { - elements: [{ type: "paragraph", text: "Second page content." }], - }, - ], - }, + const elements = [ + new Element({ + type: "title", + text: "Title", + level: 1, + }), + new Element({ + type: "narrative-text", + text: "First paragraph.", + }), + new Element({ + type: "narrative-text", + text: "Second paragraph.", + }), ]; - - const doc = Document.fromPages(pages); + const doc = Document.fromElements(elements); expect(doc.content).toBe( - "Title\n\nFirst paragraph.\n\nSecond page content.", + "Title\n\nFirst paragraph.\n\nSecond paragraph.", ); - expect(doc.pages).toEqual(pages); + expect(doc.elements).toHaveLength(3); }); - it("produces empty content from empty pages array", () => { - const doc = Document.fromPages([]); + it("produces empty content from empty elements array", () => { + const doc = Document.fromElements([]); expect(doc.content).toBe(""); - expect(doc.pages).toEqual([]); + expect(doc.elements).toEqual([]); }); it("preserves sourceType", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - elements: [{ type: "paragraph", text: "text" }], - }, - ], - }, - ]; - const doc = Document.fromPages(pages, { sourceType: "pdf" }); + const el = new Element({ + type: "narrative-text", + text: "text", + }); + const doc = Document.fromElements([el], { sourceType: "pdf" }); expect(doc.sourceType).toBe("pdf"); - expect(doc.pages).toHaveLength(1); + expect(doc.elements).toHaveLength(1); }); }); - describe("flatElements", () => { - it("traverses pages -> sections recursively in document order", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - title: "S1", - elements: [ - { type: "heading", text: "H1", level: 1 }, - { type: "paragraph", text: "P1" }, - ], - children: [ - { - title: "S1.1", - elements: [{ type: "paragraph", text: "P1.1" }], - }, - ], - }, - ], - }, - { - pageNumber: 2, - sections: [ - { - elements: [{ type: "table", text: "T1" }], - }, - ], - }, - ]; + describe("Element", () => { + it("auto-generates a unique id", () => { + const a = new Element({ + type: "narrative-text", + text: "a", + }); + const b = new Element({ + type: "narrative-text", + text: "b", + }); + expect(a.id).toBeTruthy(); + expect(b.id).toBeTruthy(); + expect(a.id).not.toBe(b.id); + }); - const doc = new Document("ignored", { pages }); - expect(doc.flatElements.map((e) => e.text)).toEqual([ - "H1", - "P1", - "P1.1", - "T1", - ]); + it("carries parentId for hierarchy", () => { + const table = new Element({ type: "table", text: "" }); + const child = new Element({ + type: "narrative-text", + text: "Revenue", + parentId: table.id, + }); + expect(child.parentId).toBe(table.id); }); - it("handles deeply nested sections (3+ levels)", () => { - const deepSection: DocumentSection = { - title: "L1", - elements: [{ type: "paragraph", text: "Level 1" }], - children: [ - { - title: "L2", - elements: [{ type: "paragraph", text: "Level 2" }], - children: [ - { - title: "L3", - elements: [{ type: "paragraph", text: "Level 3" }], - children: [ - { - title: "L4", - elements: [{ type: "code", text: "Level 4" }], - }, - ], - }, - ], - }, - ], - }; - const pages: DocumentPage[] = [ - { pageNumber: 1, sections: 
[deepSection] }, - ]; - const doc = new Document("x", { pages }); - expect(doc.flatElements.map((e) => e.text)).toEqual([ - "Level 1", - "Level 2", - "Level 3", - "Level 4", - ]); + it("carries pageNumber", () => { + const el = new Element({ + type: "title", + text: "Intro", + pageNumber: 2, + }); + expect(el.pageNumber).toBe(2); }); - it("handles sections without titles", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - elements: [{ type: "paragraph", text: "No title section" }], - }, - ], - }, - ]; - const doc = new Document("x", { pages }); - expect(doc.flatElements).toHaveLength(1); - expect(doc.flatElements[0]!.text).toBe("No title section"); - }); - - it("includes elements with empty text strings", () => { - const pages: DocumentPage[] = [ - { - pageNumber: 1, - sections: [ - { - elements: [ - { type: "image", text: "" }, - { type: "paragraph", text: "after" }, - ], - }, - ], - }, + it("carries optional enrichment fields", () => { + const el = new Element({ + type: "table", + text: "A | B", + languages: ["en"], + provenance: { confidence: 0.95, isContinuation: false }, + }); + expect(el.provenance?.confidence).toBe(0.95); + expect(el.languages).toEqual(["en"]); + expect(el.provenance?.isContinuation).toBe(false); + }); + + it("accepts various element types", () => { + const elements = [ + new Element({ type: "formula", text: "E = mc²" }), + new Element({ type: "list-item", text: "First item" }), + new Element({ type: "page-break", text: "" }), ]; - const doc = new Document("x", { pages }); - expect(doc.flatElements).toHaveLength(2); - expect(doc.flatElements[0]!.text).toBe(""); - expect(doc.flatElements[0]!.type).toBe("image"); + expect(elements.map((e) => e.type)).toEqual([ + "formula", + "list-item", + "page-break", + ]); }); }); }); diff --git a/packages/nvisy-core/src/datatypes/document.ts b/packages/nvisy-core/src/datatypes/document.ts index 7d918e9..77c7e18 100644 --- a/packages/nvisy-core/src/datatypes/document.ts +++ b/packages/nvisy-core/src/datatypes/document.ts @@ -1,43 +1,35 @@ -import type { Metadata } from "./data.js"; +import type { Element } from "../documents/elements.js"; import { Data } from "./data.js"; -/** The kind of structural element within a document. */ -export type ElementType = - | "paragraph" - | "heading" - | "table" - | "list" - | "image" - | "code"; - -/** A single structural element within a {@link DocumentSection}. */ -export interface DocumentElement { - readonly type: ElementType; - readonly text: string; - /** Heading level (1-6). Only meaningful when `type` is `"heading"`. */ - readonly level?: number; - /** Element-scoped metadata (e.g. table caption, alt text). */ - readonly metadata?: Metadata; -} - -/** A titled section containing elements and optional nested sub-sections. */ -export interface DocumentSection { - readonly title?: string; - readonly elements: readonly DocumentElement[]; - readonly children?: readonly DocumentSection[]; -} - -/** A single page of a document. */ -export interface DocumentPage { - /** 1-based page number. 
*/ - readonly pageNumber: number; - readonly sections: readonly DocumentSection[]; -} +export type { + CompositeElementOptions, + ElementOptions, + ElementProvenance, + EmailElementOptions, + EmphasizedText, + FormElementOptions, + FormKeyValuePair, + ImageElementOptions, + Link, + TableCellData, + TableElementOptions, +} from "../documents/elements.js"; +export { + CompositeElement, + Element, + EmailElement, + FormElement, + ImageElement, + TableElement, +} from "../documents/elements.js"; /** Options for constructing a {@link Document}. */ export interface DocumentOptions { readonly sourceType?: string; - readonly pages?: readonly DocumentPage[]; + /** Document title (e.g. HTML ``, PDF metadata). */ + readonly title?: string; + /** Pre-extracted structural elements. */ + readonly elements?: readonly Element[]; } /** @@ -47,23 +39,30 @@ export interface DocumentOptions { * already been converted into plain text that can be chunked, enriched, * or embedded. * + * Structural detail is carried as a flat array of {@link Element} + * instances. Hierarchy is expressed via `parentId` references and page + * membership via `pageNumber` on each element. + * * @example * ```ts - * const doc = new Document("Quarterly Report\n\nRevenue increased…", { - * sourceType: "pdf", - * }); + * const doc = Document.fromElements([ + * new Element({ type: "title", text: "Quarterly Report", pageNumber: 1 }), + * new Element({ type: "narrative-text", text: "Revenue increased…", pageNumber: 1 }), + * ], { sourceType: "pdf" }); * ``` */ export class Document extends Data { readonly #content: string; readonly #sourceType?: string | undefined; - readonly #pages?: readonly DocumentPage[] | undefined; + readonly #title?: string | undefined; + readonly #elements?: readonly Element[] | undefined; constructor(content: string, options?: DocumentOptions) { super(); this.#content = content; this.#sourceType = options?.sourceType; - this.#pages = options?.pages; + this.#title = options?.title; + this.#elements = options?.elements; } /** Text content of the document. */ @@ -76,59 +75,40 @@ export class Document extends Data { return this.#sourceType; } - /** Optional hierarchical page structure. */ - get pages(): readonly DocumentPage[] | undefined { - return this.#pages; + /** Document title (e.g. HTML `<title>`, PDF metadata). */ + get title(): string | undefined { + return this.#title; + } + + /** Unique BCP-47 language tags collected from all elements. */ + get languages(): readonly string[] { + if (this.#elements == null) return []; + const set = new Set<string>(); + for (const el of this.#elements) { + if (el.languages != null) { + for (const lang of el.languages) { + set.add(lang); + } + } + } + return [...set]; } - /** All elements across all pages and sections, flattened in document order. */ - get flatElements(): DocumentElement[] { - if (this.#pages == null) return []; - return collectElements(this.#pages); + /** Flat ordered list of structural elements. */ + get elements(): readonly Element[] | undefined { + return this.#elements; } /** - * Create a Document by deriving `content` from the element texts in the given pages. + * Create a Document by deriving `content` from the element texts. * * Element texts are joined with `\n\n` separators. 
*/ - static fromPages( - pages: readonly DocumentPage[], - options?: Omit<DocumentOptions, "pages">, + static fromElements( + elements: readonly Element[], + options?: Omit<DocumentOptions, "elements">, ): Document { - const content = flattenPagesToText(pages); - return new Document(content, { ...options, pages }); + const content = elements.map((el) => el.text).join("\n\n"); + return new Document(content, { ...options, elements }); } } - -/** Collect all elements from a page tree in document order. */ -function collectElements(pages: readonly DocumentPage[]): DocumentElement[] { - const out: DocumentElement[] = []; - for (const page of pages) { - for (const section of page.sections) { - flattenSection(section, out); - } - } - return out; -} - -/** Recursively collect elements from a section and its children. */ -function flattenSection( - section: DocumentSection, - out: DocumentElement[], -): void { - for (const el of section.elements) { - out.push(el); - } - if (section.children) { - for (const child of section.children) { - flattenSection(child, out); - } - } -} - -/** Derive plain text content from a page tree. */ -function flattenPagesToText(pages: readonly DocumentPage[]): string { - const elements = collectElements(pages); - return elements.map((el) => el.text).join("\n\n"); -} diff --git a/packages/nvisy-core/src/datatypes/embedding.test.ts b/packages/nvisy-core/src/datatypes/embedding.test.ts new file mode 100644 index 0000000..1e0db5f --- /dev/null +++ b/packages/nvisy-core/src/datatypes/embedding.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from "vitest"; +import { Embedding } from "./embedding.js"; + +describe("Embedding", () => { + it("constructs from a number array", () => { + const e = new Embedding([0.1, -0.2, 0.3]); + expect(e.dimensions).toBe(3); + expect(e.vector).toBeInstanceOf(Float32Array); + }); + + it("constructs from a Float32Array", () => { + const arr = new Float32Array([1.0, 2.0]); + const e = new Embedding(arr); + expect(e.dimensions).toBe(2); + expect(e.vector).toBe(arr); + }); + + it("preserves approximate values from number array", () => { + const e = new Embedding([0.5, -0.5]); + expect(e.vector[0]).toBeCloseTo(0.5); + expect(e.vector[1]).toBeCloseTo(-0.5); + }); + + it("handles zero-length vector", () => { + const e = new Embedding([]); + expect(e.dimensions).toBe(0); + expect(e.vector).toHaveLength(0); + }); + + it("has a unique id", () => { + const a = new Embedding([1]); + const b = new Embedding([1]); + expect(a.id).not.toBe(b.id); + }); + + it("supports lineage via deriveFrom", () => { + const parent = new Embedding([1, 2]); + const child = new Embedding([3, 4]); + child.deriveFrom(parent); + + expect(child.parentId).toBe(parent.id); + expect(child.isDerived).toBe(true); + }); + + it("supports metadata", () => { + const e = new Embedding([0.1]); + e.withMetadata({ model: "text-embedding-3-small" }); + expect(e.metadata).toEqual({ model: "text-embedding-3-small" }); + }); +}); diff --git a/packages/nvisy-core/src/datatypes/index.ts b/packages/nvisy-core/src/datatypes/index.ts index 4042864..10bde7d 100644 --- a/packages/nvisy-core/src/datatypes/index.ts +++ b/packages/nvisy-core/src/datatypes/index.ts @@ -4,17 +4,32 @@ * Base data model and built-in types for the Nvisy pipeline. 
*/ +export type { JsonValue, Metadata } from "../types.js"; export { Blob } from "./blob.js"; -export type { JsonValue, Metadata } from "./data.js"; export { Data } from "./data.js"; export type { - DocumentElement, + CompositeElementOptions, DocumentOptions, - DocumentPage, - DocumentSection, - ElementType, + ElementOptions, + ElementProvenance, + EmailElementOptions, + EmphasizedText, + FormElementOptions, + FormKeyValuePair, + ImageElementOptions, + Link, + TableCellData, + TableElementOptions, +} from "./document.js"; +export { + CompositeElement, + Document, + Element, + EmailElement, + FormElement, + ImageElement, + TableElement, } from "./document.js"; -export { Document } from "./document.js"; export { Embedding } from "./embedding.js"; import type { ClassRef } from "../types.js"; diff --git a/packages/nvisy-core/src/documents/coordinates.test.ts b/packages/nvisy-core/src/documents/coordinates.test.ts new file mode 100644 index 0000000..059cb13 --- /dev/null +++ b/packages/nvisy-core/src/documents/coordinates.test.ts @@ -0,0 +1,148 @@ +import { describe, expect, it } from "vitest"; +import { CoordinateSystem, Orientations } from "./coordinates.js"; + +describe("Orientations", () => { + it("SCREEN is [1, -1]", () => { + expect(Orientations.SCREEN).toEqual([1, -1]); + }); + + it("CARTESIAN is [1, 1]", () => { + expect(Orientations.CARTESIAN).toEqual([1, 1]); + }); +}); + +describe("CoordinateSystem", () => { + describe("static factories", () => { + it("pixel() creates a screen-oriented system", () => { + const sys = CoordinateSystem.pixel(1920, 1080); + expect(sys.width).toBe(1920); + expect(sys.height).toBe(1080); + expect(sys.orientation).toEqual(Orientations.SCREEN); + }); + + it("point() creates a cartesian-oriented system", () => { + const sys = CoordinateSystem.point(612, 792); + expect(sys.width).toBe(612); + expect(sys.height).toBe(792); + expect(sys.orientation).toEqual(Orientations.CARTESIAN); + }); + + it("relative() creates a 1x1 cartesian system", () => { + const sys = CoordinateSystem.relative(); + expect(sys.width).toBe(1); + expect(sys.height).toBe(1); + expect(sys.orientation).toEqual(Orientations.CARTESIAN); + }); + }); + + describe("toRelative / fromRelative", () => { + it("pixel origin (0,0) maps to relative (0,1)", () => { + const px = CoordinateSystem.pixel(100, 100); + const rel = px.toRelative({ x: 0, y: 0 }); + expect(rel.x).toBeCloseTo(0); + expect(rel.y).toBeCloseTo(1); + }); + + it("pixel bottom-right maps to relative (1,0)", () => { + const px = CoordinateSystem.pixel(100, 100); + const rel = px.toRelative({ x: 100, y: 100 }); + expect(rel.x).toBeCloseTo(1); + expect(rel.y).toBeCloseTo(0); + }); + + it("point origin (0,0) maps to relative (0,0)", () => { + const pt = CoordinateSystem.point(612, 792); + const rel = pt.toRelative({ x: 0, y: 0 }); + expect(rel.x).toBeCloseTo(0); + expect(rel.y).toBeCloseTo(0); + }); + + it("fromRelative is the inverse of toRelative", () => { + const px = CoordinateSystem.pixel(200, 300); + const original = { x: 50, y: 75 }; + const rel = px.toRelative(original); + const back = px.fromRelative(rel); + expect(back.x).toBeCloseTo(original.x); + expect(back.y).toBeCloseTo(original.y); + }); + }); + + describe("convertTo", () => { + it("converts pixel top-left to point bottom-left", () => { + const px = CoordinateSystem.pixel(100, 100); + const pt = CoordinateSystem.point(100, 100); + const result = px.convertTo(pt, { x: 0, y: 0 }); + expect(result.x).toBeCloseTo(0); + expect(result.y).toBeCloseTo(100); + }); + + it("converts 
pixel center to point center", () => { + const px = CoordinateSystem.pixel(200, 200); + const pt = CoordinateSystem.point(200, 200); + const result = px.convertTo(pt, { x: 100, y: 100 }); + expect(result.x).toBeCloseTo(100); + expect(result.y).toBeCloseTo(100); + }); + + it("handles different dimensions between systems", () => { + const px = CoordinateSystem.pixel(1920, 1080); + const pt = CoordinateSystem.point(612, 792); + const result = px.convertTo(pt, { x: 960, y: 540 }); + expect(result.x).toBeCloseTo(306); + expect(result.y).toBeCloseTo(396); + }); + + it("round-trips through relative", () => { + const a = CoordinateSystem.pixel(800, 600); + const b = CoordinateSystem.point(400, 300); + const p = { x: 200, y: 150 }; + const converted = a.convertTo(b, p); + const back = b.convertTo(a, converted); + expect(back.x).toBeCloseTo(p.x); + expect(back.y).toBeCloseTo(p.y); + }); + }); + + describe("convertAllTo", () => { + it("converts all corner points at once", () => { + const px = CoordinateSystem.pixel(100, 100); + const pt = CoordinateSystem.point(100, 100); + const corners = [ + { x: 10, y: 20 }, + { x: 10, y: 80 }, + { x: 90, y: 80 }, + { x: 90, y: 20 }, + ]; + const result = px.convertAllTo(pt, corners); + expect(result).toHaveLength(4); + expect(result[0]!.x).toBeCloseTo(10); + expect(result[0]!.y).toBeCloseTo(80); + }); + + it("returns empty array for empty input", () => { + const px = CoordinateSystem.pixel(100, 100); + const pt = CoordinateSystem.point(100, 100); + expect(px.convertAllTo(pt, [])).toEqual([]); + }); + }); + + describe("equals", () => { + it("returns true for identical systems", () => { + const a = CoordinateSystem.pixel(100, 200); + const b = CoordinateSystem.pixel(100, 200); + expect(a.equals(b)).toBe(true); + }); + + it("returns false for different orientations", () => { + const px = CoordinateSystem.pixel(100, 100); + const pt = CoordinateSystem.point(100, 100); + expect(px.equals(pt)).toBe(false); + }); + + it("returns false for different dimensions", () => { + const a = CoordinateSystem.pixel(100, 100); + const b = CoordinateSystem.pixel(200, 100); + expect(a.equals(b)).toBe(false); + }); + }); +}); diff --git a/packages/nvisy-core/src/documents/coordinates.ts b/packages/nvisy-core/src/documents/coordinates.ts new file mode 100644 index 0000000..5cc378f --- /dev/null +++ b/packages/nvisy-core/src/documents/coordinates.ts @@ -0,0 +1,174 @@ +/** + * Spatial coordinate systems and element positioning for document elements. + * + * Coordinate systems differ by origin and axis direction: + * - **Pixel space** — origin at top-left, y increases downward (images, OCR). + * - **Point space** — origin at bottom-left, y increases upward (PDF, PostScript). + * - **Relative** — unit square (0–1 on both axes), y increases upward. + * + * Element positions are stored as an array of corner {@link Point | points} + * rather than an axis-aligned bounding box, so rotated and skewed regions + * are represented without loss. + * + * Use {@link CoordinateSystem.convertTo} to transform points between systems. + * + * @example + * ```ts + * const px = CoordinateSystem.pixel(1920, 1080); + * const pt = CoordinateSystem.point(612, 792); + * const result = px.convertTo(pt, { x: 960, y: 540 }); + * ``` + * + * @module + */ + +/** A point in 2D space. */ +export interface Point { + readonly x: number; + readonly y: number; +} + +/** + * Axis orientation as an `[xSign, ySign]` tuple. + * + * - `1` — value grows in the standard (rightward / upward) direction. 
+ * - `-1` — axis is inverted (e.g. y grows downward for screen coordinates). + */ +export type Orientation = readonly [x: 1 | -1, y: 1 | -1]; + +/** + * Built-in orientation presets. + * + * - `Orientations.SCREEN` — origin top-left, y increases downward. + * - `Orientations.CARTESIAN` — origin bottom-left, y increases upward. + */ +export const Orientations = { + /** Screen orientation — origin top-left, y increases downward. */ + SCREEN: [1, -1] as Orientation, + /** Cartesian orientation — origin bottom-left, y increases upward. */ + CARTESIAN: [1, 1] as Orientation, +} as const; + +/** Convert a single coordinate along one axis via a linear transformation. */ +function convertAxis( + value: number, + fromMax: number, + toMax: number, + sign: 1 | -1, +): number { + const t = value / fromMax; + return (((1 - t) * (1 - sign)) / 2 + (t * (1 + sign)) / 2) * toMax; +} + +/** + * A finite coordinate plane with a given width, height, and orientation. + * + * Instances are immutable value objects. Use the static factories + * {@link CoordinateSystem.pixel}, {@link CoordinateSystem.point}, and + * {@link CoordinateSystem.relative} for the common coordinate spaces. + */ +export class CoordinateSystem { + /** Width of the coordinate plane. */ + readonly width: number; + + /** Height of the coordinate plane. */ + readonly height: number; + + /** Axis orientation of this coordinate system. */ + readonly orientation: Orientation; + + constructor(width: number, height: number, orientation: Orientation) { + this.width = width; + this.height = height; + this.orientation = orientation; + } + + /** Pixel-space system (origin top-left, y down). */ + static pixel(width: number, height: number): CoordinateSystem { + return new CoordinateSystem(width, height, Orientations.SCREEN); + } + + /** Point-space system (origin bottom-left, y up). */ + static point(width: number, height: number): CoordinateSystem { + return new CoordinateSystem(width, height, Orientations.CARTESIAN); + } + + /** Unit-square relative coordinate system (0–1, Cartesian). */ + static relative(): CoordinateSystem { + return new CoordinateSystem(1, 1, Orientations.CARTESIAN); + } + + /** Convert a point from this system to the 0–1 relative system. */ + toRelative(p: Point): Point { + const [xSign, ySign] = this.orientation; + return { + x: convertAxis(p.x, this.width, 1, xSign), + y: convertAxis(p.y, this.height, 1, ySign), + }; + } + + /** Convert a point from the 0–1 relative system into this system. */ + fromRelative(p: Point): Point { + const [xSign, ySign] = this.orientation; + return { + x: convertAxis(p.x, 1, this.width, xSign), + y: convertAxis(p.y, 1, this.height, ySign), + }; + } + + /** Convert a point from this system into `target`. */ + convertTo(target: CoordinateSystem, p: Point): Point { + return target.fromRelative(this.toRelative(p)); + } + + /** + * Convert an array of points from this system into `target`. + * + * Convenience wrapper around {@link convertTo} for transforming + * all corners of an {@link ElementCoordinates.points} array at once. + */ + convertAllTo(target: CoordinateSystem, points: readonly Point[]): Point[] { + return points.map((p) => this.convertTo(target, p)); + } + + /** Structural equality. */ + equals(other: CoordinateSystem): boolean { + return ( + this.width === other.width && + this.height === other.height && + this.orientation[0] === other.orientation[0] && + this.orientation[1] === other.orientation[1] + ); + } +} + +/** + * Spatial coordinates for a document element. 
+ * + * Corner points specify the bounding region of the element, starting + * from the top-left corner and proceeding counter-clockwise. Using + * points rather than an axis-aligned box naturally handles rotated + * and skewed regions. + * + * @example + * ```ts + * const coords: ElementCoordinates = { + * points: [ + * { x: 10, y: 20 }, // top-left + * { x: 10, y: 120 }, // bottom-left + * { x: 210, y: 120 }, // bottom-right + * { x: 210, y: 20 }, // top-right + * ], + * system: CoordinateSystem.pixel(1920, 1080), + * }; + * ``` + */ +export interface ElementCoordinates { + /** + * Corner points of the bounding region, counter-clockwise + * from top-left. + */ + readonly points: readonly Point[]; + /** The coordinate system the points were measured in. */ + readonly system: CoordinateSystem; +} diff --git a/packages/nvisy-core/src/documents/elements.test.ts b/packages/nvisy-core/src/documents/elements.test.ts new file mode 100644 index 0000000..85fdc38 --- /dev/null +++ b/packages/nvisy-core/src/documents/elements.test.ts @@ -0,0 +1,310 @@ +import { describe, expect, it } from "vitest"; +import { + CompositeElement, + Element, + EmailElement, + FormElement, + ImageElement, + TableElement, +} from "./elements.js"; + +describe("Element", () => { + it("auto-generates a unique id", () => { + const a = new Element({ type: "title", text: "a" }); + const b = new Element({ type: "title", text: "b" }); + expect(a.id).toBeTruthy(); + expect(b.id).toBeTruthy(); + expect(a.id).not.toBe(b.id); + }); + + it("assigns type and text from options", () => { + const el = new Element({ type: "narrative-text", text: "Hello world" }); + expect(el.type).toBe("narrative-text"); + expect(el.text).toBe("Hello world"); + }); + + it("carries all base fields", () => { + const el = new Element({ + type: "title", + text: "Intro", + parentId: "parent-1", + pageNumber: 2, + level: 1, + languages: ["en"], + metadata: { key: "value" }, + }); + expect(el.parentId).toBe("parent-1"); + expect(el.pageNumber).toBe(2); + expect(el.level).toBe(1); + expect(el.languages).toEqual(["en"]); + expect(el.metadata).toEqual({ key: "value" }); + }); + + it("all optional fields default to undefined", () => { + const el = new Element({ type: "title", text: "" }); + expect(el.parentId).toBeUndefined(); + expect(el.pageNumber).toBeUndefined(); + expect(el.pageName).toBeUndefined(); + expect(el.level).toBeUndefined(); + expect(el.languages).toBeUndefined(); + expect(el.metadata).toBeUndefined(); + expect(el.sourceTag).toBeUndefined(); + expect(el.textAsHtml).toBeUndefined(); + expect(el.links).toBeUndefined(); + expect(el.emphasizedTexts).toBeUndefined(); + expect(el.provenance).toBeUndefined(); + }); + + describe("links and emphasizedTexts", () => { + it("carries links with startIndex", () => { + const el = new Element({ + type: "narrative-text", + text: "Visit example.com for details.", + links: [ + { text: "example.com", url: "https://example.com", startIndex: 6 }, + ], + }); + expect(el.links).toHaveLength(1); + expect(el.links![0].startIndex).toBe(6); + expect(el.links![0].url).toBe("https://example.com"); + }); + + it("carries emphasizedTexts", () => { + const el = new Element({ + type: "title", + text: "Important notice", + emphasizedTexts: [{ text: "Important", tag: "strong" }], + }); + expect(el.emphasizedTexts).toHaveLength(1); + expect(el.emphasizedTexts![0].tag).toBe("strong"); + }); + + it("available on any element type, not just text", () => { + const el = new TableElement({ + type: "table", + text: "A table with links", + links: 
[{ text: "link", url: "https://example.com", startIndex: 0 }], + emphasizedTexts: [{ text: "table", tag: "b" }], + }); + expect(el.links).toHaveLength(1); + expect(el.emphasizedTexts).toHaveLength(1); + }); + }); + + describe("table fields", () => { + it("carries cells with row, column, isHeader", () => { + const el = new TableElement({ + type: "table", + text: "", + cells: [ + { row: 0, column: 0, text: "Name", isHeader: true }, + { row: 0, column: 1, text: "Age", isHeader: true }, + { row: 1, column: 0, text: "Alice" }, + { row: 1, column: 1, text: "30" }, + ], + }); + expect(el).toBeInstanceOf(Element); + expect(el.cells).toHaveLength(4); + expect(el.cells![0].isHeader).toBe(true); + expect(el.cells![2].text).toBe("Alice"); + }); + + it("cells defaults to undefined", () => { + const el = new TableElement({ type: "table", text: "" }); + expect(el.cells).toBeUndefined(); + }); + }); + + describe("image fields", () => { + it("carries imageBase64, imageMimeType, imageUrl, imagePath", () => { + const el = new ImageElement({ + type: "image", + text: "A photo", + imageBase64: "abc123==", + imageMimeType: "image/png", + imageUrl: "https://example.com/photo.png", + imagePath: "/tmp/photo.png", + }); + expect(el.imageBase64).toBe("abc123=="); + expect(el.imageMimeType).toBe("image/png"); + expect(el.imageUrl).toBe("https://example.com/photo.png"); + expect(el.imagePath).toBe("/tmp/photo.png"); + }); + + it("is an instance of Element", () => { + const el = new ImageElement({ type: "image", text: "photo" }); + expect(el).toBeInstanceOf(Element); + expect(el).toBeInstanceOf(ImageElement); + }); + + it("image fields default to undefined", () => { + const el = new ImageElement({ type: "image", text: "" }); + expect(el.imageBase64).toBeUndefined(); + expect(el.imageMimeType).toBeUndefined(); + expect(el.imageUrl).toBeUndefined(); + expect(el.imagePath).toBeUndefined(); + }); + + it("image fields only live on ImageElement", () => { + const base = new Element({ type: "image", text: "" }); + expect("imageBase64" in base).toBe(false); + expect("imageMimeType" in base).toBe(false); + expect("imageUrl" in base).toBe(false); + expect("imagePath" in base).toBe(false); + }); + }); + + describe("form fields", () => { + it("carries checked and value", () => { + const el = new FormElement({ + type: "checkbox", + text: "Accept terms", + checked: true, + value: "yes", + }); + expect(el).toBeInstanceOf(Element); + expect(el.checked).toBe(true); + expect(el.value).toBe("yes"); + }); + + it("checked and value default to undefined", () => { + const el = new FormElement({ type: "checkbox", text: "" }); + expect(el.checked).toBeUndefined(); + expect(el.value).toBeUndefined(); + }); + }); + + describe("email fields", () => { + it("carries all email envelope fields", () => { + const el = new EmailElement({ + type: "email-message", + text: "Hello from email", + sentFrom: ["alice@example.com"], + sentTo: ["bob@example.com"], + ccRecipient: ["carol@example.com"], + bccRecipient: ["dave@example.com"], + subject: "Meeting notes", + signature: "— Alice", + emailMessageId: "<msg-001@example.com>", + }); + expect(el).toBeInstanceOf(Element); + expect(el.sentFrom).toEqual(["alice@example.com"]); + expect(el.sentTo).toEqual(["bob@example.com"]); + expect(el.ccRecipient).toEqual(["carol@example.com"]); + expect(el.bccRecipient).toEqual(["dave@example.com"]); + expect(el.subject).toBe("Meeting notes"); + expect(el.signature).toBe("— Alice"); + expect(el.emailMessageId).toBe("<msg-001@example.com>"); + }); + + it("email fields default 
to undefined", () => { + const el = new EmailElement({ type: "email-message", text: "" }); + expect(el.sentFrom).toBeUndefined(); + expect(el.sentTo).toBeUndefined(); + expect(el.ccRecipient).toBeUndefined(); + expect(el.bccRecipient).toBeUndefined(); + expect(el.subject).toBeUndefined(); + expect(el.signature).toBeUndefined(); + expect(el.emailMessageId).toBeUndefined(); + }); + }); + + describe("provenance fields", () => { + it("carries detectionOrigin via provenance", () => { + const el = new Element({ + type: "title", + text: "Hello", + provenance: { detectionOrigin: "tesseract-v5" }, + }); + expect(el.provenance?.detectionOrigin).toBe("tesseract-v5"); + }); + + it("carries headerFooterType via provenance", () => { + const el = new Element({ + type: "header", + text: "Page 1", + provenance: { headerFooterType: "page-header" }, + }); + expect(el.provenance?.headerFooterType).toBe("page-header"); + }); + }); + + describe("source fidelity fields", () => { + it("carries sourceTag for format-specific origin", () => { + const el = new Element({ + type: "narrative-text", + text: "To be or not to be", + sourceTag: "blockquote", + }); + expect(el.sourceTag).toBe("blockquote"); + }); + + it("carries textAsHtml on base Element", () => { + const el = new Element({ + type: "narrative-text", + text: "bold text", + textAsHtml: "<p><strong>bold</strong> text</p>", + }); + expect(el.textAsHtml).toBe("<p><strong>bold</strong> text</p>"); + }); + + it("carries textAsHtml on TableElement", () => { + const el = new TableElement({ + type: "table", + text: "Name\tAge\nAlice\t30", + textAsHtml: "<table><tr><td>Name</td><td>Age</td></tr></table>", + }); + expect(el.textAsHtml).toBe( + "<table><tr><td>Name</td><td>Age</td></tr></table>", + ); + }); + + it("carries pageName for worksheet-based sources", () => { + const el = new Element({ + type: "table", + text: "data", + pageName: "Sheet1", + }); + expect(el.pageName).toBe("Sheet1"); + }); + }); + + describe("form keyValuePairs", () => { + it("carries structured key-value pairs", () => { + const el = new FormElement({ + type: "form-keys-values", + text: "Name: Alice", + keyValuePairs: [ + { key: "Name", value: "Alice", confidence: 0.99 }, + { key: "Age", value: "30" }, + ], + }); + expect(el.keyValuePairs).toHaveLength(2); + expect(el.keyValuePairs![0].key).toBe("Name"); + expect(el.keyValuePairs![0].value).toBe("Alice"); + expect(el.keyValuePairs![0].confidence).toBe(0.99); + expect(el.keyValuePairs![1].confidence).toBeUndefined(); + }); + + it("keyValuePairs defaults to undefined", () => { + const el = new FormElement({ type: "form-keys-values", text: "" }); + expect(el.keyValuePairs).toBeUndefined(); + }); + }); + + describe("composite fields", () => { + it("carries origElements", () => { + const orig1 = new Element({ type: "narrative-text", text: "Part 1" }); + const orig2 = new Element({ type: "narrative-text", text: "Part 2" }); + const composite = new CompositeElement({ + type: "narrative-text", + text: "Part 1 Part 2", + origElements: [orig1, orig2], + }); + expect(composite).toBeInstanceOf(Element); + expect(composite.origElements).toHaveLength(2); + expect(composite.origElements[0].text).toBe("Part 1"); + expect(composite.origElements[1].text).toBe("Part 2"); + }); + }); +}); diff --git a/packages/nvisy-core/src/documents/elements.ts b/packages/nvisy-core/src/documents/elements.ts new file mode 100644 index 0000000..9544c56 --- /dev/null +++ b/packages/nvisy-core/src/documents/elements.ts @@ -0,0 +1,374 @@ +/** + * Document element model. 
+ * + * Every structural piece of a parsed document — paragraphs, headings, + * tables, images, etc. — is represented as an {@link Element} instance. + * The {@link Element.type | type} field (one of the {@link ElementType} + * string literals defined in `ontology.ts`) is the primary discriminator. + * + * Type-specific fields live on dedicated subclasses: + * + * | Subclass | Category | Extra fields | + * | ------------------- | -------- | ----------------------------------------- | + * | {@link ImageElement} | media | base64, mime type, URL, path | + * | {@link TableElement} | table | structured cells | + * | {@link FormElement} | form | checkbox state, value, key-value pairs | + * | {@link EmailElement} | email | envelope (from, to, cc, bcc, subject, …) | + * | {@link CompositeElement} | any | pre-chunking original elements | + * + * Extraction / OCR provenance fields are bundled in + * {@link ElementProvenance} rather than scattered across the base class. + * + * Source-format fidelity is preserved via {@link Element.sourceTag} (the + * original HTML tag or format-specific type name) and + * {@link Element.textAsHtml} (original markup for round-tripping). + * + * @module + */ + +import type { Metadata } from "../types.js"; +import type { ElementCoordinates } from "./coordinates.js"; +import type { + ElementType, + EmailType, + FormType, + MediaType, + TableType, +} from "./ontology.js"; + +/** An inline hyperlink within element text. */ +export interface Link { + /** The visible link text. */ + readonly text: string; + /** The target URL. */ + readonly url: string; + /** 0-based character offset of the link text within the element's {@link Element.text}. */ + readonly startIndex: number; +} + +/** An inline formatting span within element text. */ +export interface EmphasizedText { + /** The formatted text content. */ + readonly text: string; + /** HTML tag name — `"b"`, `"i"`, `"em"`, `"strong"`, etc. */ + readonly tag: string; +} + +/** A single cell within a table structure. */ +export interface TableCellData { + /** 0-based row index. */ + readonly row: number; + /** 0-based column index. */ + readonly column: number; + /** Plain-text content of the cell. */ + readonly text: string; + /** `true` when this cell is part of the table header. */ + readonly isHeader?: boolean; +} + +/** Extraction / OCR provenance fields bundled into a single object. */ +export interface ElementProvenance { + /** Spatial position on the source page (OCR, PDF). */ + readonly coordinates?: ElementCoordinates; + /** Extraction confidence score (0–1). */ + readonly confidence?: number; + /** Which model or system produced this element. */ + readonly detectionOrigin?: string; + /** `true` when this element continues from a previous page or chunk. */ + readonly isContinuation?: boolean; + /** Distinguishes page-header vs document-header, etc. */ + readonly headerFooterType?: string; +} + +/** A structured key-value pair extracted from a form. */ +export interface FormKeyValuePair { + /** The field label. */ + readonly key: string; + /** The field value, if present. */ + readonly value?: string; + /** Extraction confidence score (0–1). */ + readonly confidence?: number; +} + +/** Options for constructing an {@link Element}. */ +export interface ElementOptions { + /** The element's structural type. */ + readonly type: ElementType; + /** Extracted text content. May be empty for non-textual elements. */ + readonly text: string; + /** ID of the parent element (e.g. a table cell's parent row). 
*/ + readonly parentId?: string; + /** 1-based page number this element belongs to. */ + readonly pageNumber?: number; + /** Named page or sheet (e.g. XLSX worksheet name). */ + readonly pageName?: string; + /** Nesting depth — 1–6 for headings, 1+ for nested lists. */ + readonly level?: number; + /** BCP-47 language tags detected for this element. */ + readonly languages?: readonly string[]; + /** Element-scoped metadata (e.g. table caption, alt text). */ + readonly metadata?: Metadata; + /** Original source tag or format-specific type name (e.g. `"blockquote"`, `"dl"`). */ + readonly sourceTag?: string; + /** Original markup for round-tripping (e.g. the HTML of a table row). */ + readonly textAsHtml?: string; + /** Inline hyperlinks embedded in {@link text}. */ + readonly links?: readonly Link[]; + /** Bold / italic formatting spans embedded in {@link text}. */ + readonly emphasizedTexts?: readonly EmphasizedText[]; + /** Extraction / OCR provenance data. */ + readonly provenance?: ElementProvenance; +} + +/** + * A single structural element extracted from a document. + * + * Every element carries an {@link id}, a {@link type} discriminator, + * and its extracted {@link text}. Type-specific fields live on + * dedicated subclasses; provenance data is in {@link provenance}. + * + * Hierarchy is expressed via {@link parentId} references rather than + * nesting, keeping the element array flat and easy to iterate. + */ +export class Element { + /** Unique identifier for this element. */ + readonly id: string = crypto.randomUUID(); + /** The element's structural type. */ + readonly type: ElementType; + /** Extracted text content. May be empty for non-textual elements. */ + readonly text: string; + /** ID of the parent element (e.g. a table cell's parent row). */ + readonly parentId?: string | undefined; + /** 1-based page number this element belongs to. */ + readonly pageNumber?: number | undefined; + /** Named page or sheet (e.g. XLSX worksheet name). */ + readonly pageName?: string | undefined; + /** Nesting depth — 1–6 for headings, 1+ for nested lists. */ + readonly level?: number | undefined; + /** BCP-47 language tags detected for this element. */ + readonly languages?: readonly string[] | undefined; + /** Element-scoped metadata (e.g. table caption, alt text). */ + readonly metadata?: Metadata | undefined; + /** Original source tag or format-specific type name (e.g. `"blockquote"`, `"dl"`). */ + readonly sourceTag?: string | undefined; + /** Original markup for round-tripping (e.g. the HTML of a table row). */ + readonly textAsHtml?: string | undefined; + /** Inline hyperlinks embedded in {@link text}. */ + readonly links?: readonly Link[] | undefined; + /** Bold / italic formatting spans embedded in {@link text}. */ + readonly emphasizedTexts?: readonly EmphasizedText[] | undefined; + /** Extraction / OCR provenance data. */ + readonly provenance?: ElementProvenance | undefined; + + constructor(options: ElementOptions) { + this.type = options.type; + this.text = options.text; + this.parentId = options.parentId; + this.pageNumber = options.pageNumber; + this.pageName = options.pageName; + this.level = options.level; + this.languages = options.languages; + this.metadata = options.metadata; + this.sourceTag = options.sourceTag; + this.textAsHtml = options.textAsHtml; + this.links = options.links; + this.emphasizedTexts = options.emphasizedTexts; + this.provenance = options.provenance; + } +} + +/** + * Options for constructing an {@link ImageElement}. 
+ * + * Narrows {@link ElementOptions.type | type} to {@link MediaType} and + * adds fields for carrying image data in various forms. + */ +export interface ImageElementOptions extends ElementOptions { + readonly type: MediaType; + /** Base64-encoded image content. */ + readonly imageBase64?: string; + /** MIME type of the image (e.g. `"image/png"`). */ + readonly imageMimeType?: string; + /** Remote URL where the image can be fetched. */ + readonly imageUrl?: string; + /** Local filesystem path to the image file. */ + readonly imagePath?: string; +} + +/** + * An element representing an image extracted from a document. + * + * Image data may be provided in one or more forms — inline base64, + * a remote URL, or a local file path. Use `instanceof ImageElement` + * for runtime type narrowing. + */ +export class ImageElement extends Element { + /** Base64-encoded image content. */ + readonly imageBase64?: string | undefined; + /** MIME type of the image (e.g. `"image/png"`). */ + readonly imageMimeType?: string | undefined; + /** Remote URL where the image can be fetched. */ + readonly imageUrl?: string | undefined; + /** Local filesystem path to the image file. */ + readonly imagePath?: string | undefined; + + constructor(options: ImageElementOptions) { + super(options); + this.imageBase64 = options.imageBase64; + this.imageMimeType = options.imageMimeType; + this.imageUrl = options.imageUrl; + this.imagePath = options.imagePath; + } +} + +/** + * Options for constructing a {@link TableElement}. + * + * Narrows {@link ElementOptions.type | type} to {@link TableType} and + * adds structured cell data. + */ +export interface TableElementOptions extends ElementOptions { + readonly type: TableType; + /** Structured cell data for the table. */ + readonly cells?: readonly TableCellData[]; +} + +/** + * An element representing a table extracted from a document. + * + * Structured cell data is in {@link cells}. The inherited + * {@link Element.textAsHtml | textAsHtml} field can carry the + * original `<table>` markup for lossless round-tripping. + */ +export class TableElement extends Element { + /** Structured cell data for the table. */ + readonly cells?: readonly TableCellData[] | undefined; + + constructor(options: TableElementOptions) { + super(options); + this.cells = options.cells; + } +} + +/** + * Options for constructing a {@link FormElement}. + * + * Narrows {@link ElementOptions.type | type} to {@link FormType} and + * adds checkbox / form-field state. + */ +export interface FormElementOptions extends ElementOptions { + readonly type: FormType; + /** Checkbox checked state. */ + readonly checked?: boolean; + /** Scalar form-field value. */ + readonly value?: string; + /** Structured key-value pairs extracted from a form. */ + readonly keyValuePairs?: readonly FormKeyValuePair[]; +} + +/** + * An element representing a form field or checkbox. + * + * Simple checkboxes use {@link checked}; richer forms use + * {@link keyValuePairs} for structured key-value extraction. + */ +export class FormElement extends Element { + /** Checkbox checked state. */ + readonly checked?: boolean | undefined; + /** Scalar form-field value. */ + readonly value?: string | undefined; + /** Structured key-value pairs extracted from a form. 
*/ + readonly keyValuePairs?: readonly FormKeyValuePair[] | undefined; + + constructor(options: FormElementOptions) { + super(options); + this.checked = options.checked; + this.value = options.value; + this.keyValuePairs = options.keyValuePairs; + } +} + +/** + * Options for constructing an {@link EmailElement}. + * + * Narrows {@link ElementOptions.type | type} to {@link EmailType} and + * adds standard email envelope fields. + */ +export interface EmailElementOptions extends ElementOptions { + readonly type: EmailType; + /** Sender address(es). */ + readonly sentFrom?: readonly string[]; + /** Primary recipient address(es). */ + readonly sentTo?: readonly string[]; + /** CC recipient address(es). */ + readonly ccRecipient?: readonly string[]; + /** BCC recipient address(es). */ + readonly bccRecipient?: readonly string[]; + /** Email subject line. */ + readonly subject?: string; + /** Email signature block. */ + readonly signature?: string; + /** RFC 2822 Message-ID header value. */ + readonly emailMessageId?: string; +} + +/** + * An element representing an email message. + * + * Carries standard envelope fields (from, to, cc, bcc, subject) plus + * optional signature and message-id for threading. + */ +export class EmailElement extends Element { + /** Sender address(es). */ + readonly sentFrom?: readonly string[] | undefined; + /** Primary recipient address(es). */ + readonly sentTo?: readonly string[] | undefined; + /** CC recipient address(es). */ + readonly ccRecipient?: readonly string[] | undefined; + /** BCC recipient address(es). */ + readonly bccRecipient?: readonly string[] | undefined; + /** Email subject line. */ + readonly subject?: string | undefined; + /** Email signature block. */ + readonly signature?: string | undefined; + /** RFC 2822 Message-ID header value. */ + readonly emailMessageId?: string | undefined; + + constructor(options: EmailElementOptions) { + super(options); + this.sentFrom = options.sentFrom; + this.sentTo = options.sentTo; + this.ccRecipient = options.ccRecipient; + this.bccRecipient = options.bccRecipient; + this.subject = options.subject; + this.signature = options.signature; + this.emailMessageId = options.emailMessageId; + } +} + +/** + * Options for constructing a {@link CompositeElement}. + * + * Requires the original pre-chunking elements that were merged to + * form this composite. + */ +export interface CompositeElementOptions extends ElementOptions { + /** The original elements that were merged during chunking. */ + readonly origElements: readonly Element[]; +} + +/** + * A composite element formed by merging multiple elements during chunking. + * + * Preserves the original pre-chunking elements in {@link origElements} + * so downstream consumers can access fine-grained structure if needed. + */ +export class CompositeElement extends Element { + /** The original elements that were merged during chunking. */ + readonly origElements: readonly Element[]; + + constructor(options: CompositeElementOptions) { + super(options); + this.origElements = options.origElements; + } +} diff --git a/packages/nvisy-core/src/documents/index.ts b/packages/nvisy-core/src/documents/index.ts new file mode 100644 index 0000000..bbf56e6 --- /dev/null +++ b/packages/nvisy-core/src/documents/index.ts @@ -0,0 +1,69 @@ +/** + * @module documents + * + * Element ontology, coordinate types, and element class + * for structured document representations. 
+ * + * @example + * ```ts + * import { + * CoordinateSystem, + * ElementType, + * TextType, + * categoryOf, + * } from "@nvisy/core"; + * + * // Use the const object for type-safe element type checks + * if (el.type === ElementType.Title) { … } + * + * // Look up which category an element belongs to + * categoryOf("title"); // => "text" + * + * // Convert coordinates between pixel and point space + * const px = CoordinateSystem.pixel(1920, 1080); + * const pt = CoordinateSystem.point(612, 792); + * const result = px.convertTo(pt, { x: 960, y: 540 }); + * ``` + */ + +export type { + ElementCoordinates, + Orientation, + Point, +} from "./coordinates.js"; +export { CoordinateSystem, Orientations } from "./coordinates.js"; +export type { + CompositeElementOptions, + ElementOptions, + ElementProvenance, + EmailElementOptions, + EmphasizedText, + FormElementOptions, + FormKeyValuePair, + ImageElementOptions, + Link, + TableCellData, + TableElementOptions, +} from "./elements.js"; +export { + CompositeElement, + Element, + EmailElement, + FormElement, + ImageElement, + TableElement, +} from "./elements.js"; +export type { ElementCategory } from "./ontology.js"; +export { + CodeType, + categoryOf, + ElementType, + EmailType, + FormType, + LayoutType, + MathType, + MediaType, + ontology, + TableType, + TextType, +} from "./ontology.js"; diff --git a/packages/nvisy-core/src/documents/ontology.test.ts b/packages/nvisy-core/src/documents/ontology.test.ts new file mode 100644 index 0000000..6170722 --- /dev/null +++ b/packages/nvisy-core/src/documents/ontology.test.ts @@ -0,0 +1,137 @@ +import { describe, expect, it } from "vitest"; +import { + CodeType, + categoryOf, + ElementType, + EmailType, + FormType, + LayoutType, + MathType, + MediaType, + ontology, + TableType, + TextType, +} from "./ontology.js"; + +describe("per-category const objects", () => { + it("TextType has 8 entries", () => { + expect(Object.values(TextType)).toHaveLength(8); + expect(TextType.Title).toBe("title"); + expect(TextType.NarrativeText).toBe("narrative-text"); + expect(TextType.ListItem).toBe("list-item"); + expect(TextType.Header).toBe("header"); + expect(TextType.Footer).toBe("footer"); + expect(TextType.FigureCaption).toBe("figure-caption"); + expect(TextType.Address).toBe("address"); + expect(TextType.UncategorizedText).toBe("uncategorized-text"); + }); + + it("TableType has 1 entry", () => { + expect(Object.values(TableType)).toEqual(["table"]); + }); + + it("MediaType has 1 entry", () => { + expect(Object.values(MediaType)).toEqual(["image"]); + }); + + it("CodeType has 1 entry", () => { + expect(Object.values(CodeType)).toEqual(["code-snippet"]); + }); + + it("MathType has 1 entry", () => { + expect(Object.values(MathType)).toEqual(["formula"]); + }); + + it("FormType has 2 entries", () => { + expect(Object.values(FormType)).toHaveLength(2); + expect(FormType.CheckBox).toBe("checkbox"); + expect(FormType.FormKeysValues).toBe("form-keys-values"); + }); + + it("LayoutType has 2 entries", () => { + expect(Object.values(LayoutType)).toHaveLength(2); + expect(LayoutType.PageBreak).toBe("page-break"); + expect(LayoutType.PageNumber).toBe("page-number"); + }); + + it("EmailType has 1 entry", () => { + expect(Object.values(EmailType)).toEqual(["email-message"]); + }); +}); + +describe("ElementType", () => { + it("has all 17 values", () => { + const allValues = Object.values(ElementType); + expect(allValues).toHaveLength(17); + }); + + it("includes values from every category", () => { + 
expect(ElementType.Title).toBe("title"); + expect(ElementType.Table).toBe("table"); + expect(ElementType.Image).toBe("image"); + expect(ElementType.CodeSnippet).toBe("code-snippet"); + expect(ElementType.Formula).toBe("formula"); + expect(ElementType.CheckBox).toBe("checkbox"); + expect(ElementType.PageBreak).toBe("page-break"); + }); +}); + +describe("ontology", () => { + it("maps every category to a non-empty array", () => { + for (const [category, types] of Object.entries(ontology)) { + expect(types.length, `${category} should have types`).toBeGreaterThan(0); + } + }); + + it("has 8 categories", () => { + expect(Object.keys(ontology)).toHaveLength(8); + }); + + it("has no duplicate element types across categories", () => { + const seen = new Map<string, string>(); + for (const [category, types] of Object.entries(ontology)) { + for (const t of types) { + expect( + seen.has(t), + `"${t}" appears in both "${seen.get(t)}" and "${category}"`, + ).toBe(false); + seen.set(t, category); + } + } + }); + + it("total entries across all categories equals 17", () => { + const total = Object.values(ontology).reduce( + (sum, arr) => sum + arr.length, + 0, + ); + expect(total).toBe(17); + }); +}); + +describe("categoryOf", () => { + it("returns the correct category for known types", () => { + expect(categoryOf(ElementType.Title)).toBe("text"); + expect(categoryOf(ElementType.NarrativeText)).toBe("text"); + expect(categoryOf(ElementType.ListItem)).toBe("text"); + expect(categoryOf(ElementType.Header)).toBe("text"); + expect(categoryOf(ElementType.Footer)).toBe("text"); + expect(categoryOf(ElementType.FigureCaption)).toBe("text"); + expect(categoryOf(ElementType.Address)).toBe("text"); + expect(categoryOf(ElementType.UncategorizedText)).toBe("text"); + expect(categoryOf(ElementType.Table)).toBe("table"); + expect(categoryOf(ElementType.Image)).toBe("media"); + expect(categoryOf(ElementType.CodeSnippet)).toBe("code"); + expect(categoryOf(ElementType.Formula)).toBe("math"); + expect(categoryOf(ElementType.CheckBox)).toBe("form"); + expect(categoryOf(ElementType.FormKeysValues)).toBe("form"); + expect(categoryOf(ElementType.PageBreak)).toBe("layout"); + expect(categoryOf(ElementType.PageNumber)).toBe("layout"); + expect(categoryOf(ElementType.EmailMessage)).toBe("email"); + }); + + it("returns undefined for unknown types", () => { + expect(categoryOf("unknown")).toBeUndefined(); + expect(categoryOf("")).toBeUndefined(); + }); +}); diff --git a/packages/nvisy-core/src/documents/ontology.ts b/packages/nvisy-core/src/documents/ontology.ts new file mode 100644 index 0000000..0d168d6 --- /dev/null +++ b/packages/nvisy-core/src/documents/ontology.ts @@ -0,0 +1,125 @@ +/** + * Element ontology — hierarchical categories for document elements. + * + * Every concrete {@link ElementType} belongs to exactly one + * {@link ElementCategory}. Categories let downstream consumers handle + * broad groups of elements (e.g. all text, all media) without matching + * individual types. + * + * Per-category const objects ({@link TextType}, {@link TableType}, etc.) + * are the single source of truth. The master {@link ElementType} is + * derived by spreading all category objects. 
+ * + * @example + * ```ts + * import { categoryOf, ElementType, TextType } from "@nvisy/core"; + * + * categoryOf("title"); // => "text" + * categoryOf("table"); // => "table" + * ElementType.Title; // => "title" + * TextType.NarrativeText; // => "narrative-text" + * ``` + * + * @module + */ + +export const TextType = { + Title: "title", + NarrativeText: "narrative-text", + ListItem: "list-item", + Header: "header", + Footer: "footer", + FigureCaption: "figure-caption", + Address: "address", + UncategorizedText: "uncategorized-text", +} as const; +export type TextType = (typeof TextType)[keyof typeof TextType]; + +export const TableType = { Table: "table" } as const; +export type TableType = (typeof TableType)[keyof typeof TableType]; + +export const MediaType = { Image: "image" } as const; +export type MediaType = (typeof MediaType)[keyof typeof MediaType]; + +export const CodeType = { CodeSnippet: "code-snippet" } as const; +export type CodeType = (typeof CodeType)[keyof typeof CodeType]; + +export const MathType = { Formula: "formula" } as const; +export type MathType = (typeof MathType)[keyof typeof MathType]; + +export const FormType = { + CheckBox: "checkbox", + FormKeysValues: "form-keys-values", +} as const; +export type FormType = (typeof FormType)[keyof typeof FormType]; + +export const LayoutType = { + PageBreak: "page-break", + PageNumber: "page-number", +} as const; +export type LayoutType = (typeof LayoutType)[keyof typeof LayoutType]; + +export const EmailType = { EmailMessage: "email-message" } as const; +export type EmailType = (typeof EmailType)[keyof typeof EmailType]; + +/** Union of all per-category element type values. */ +export const ElementType = { + ...TextType, + ...TableType, + ...MediaType, + ...CodeType, + ...MathType, + ...FormType, + ...LayoutType, + ...EmailType, +} as const; +export type ElementType = (typeof ElementType)[keyof typeof ElementType]; + +export type ElementCategory = + | "text" + | "table" + | "media" + | "code" + | "math" + | "form" + | "layout" + | "email"; + +/** + * Map from {@link ElementCategory} to the element types it contains. + * + * This is the single source of truth for which types belong to which + * category. Use {@link categoryOf} for reverse lookups. + */ +export const ontology: Record<ElementCategory, readonly ElementType[]> = { + text: Object.values(TextType), + table: Object.values(TableType), + media: Object.values(MediaType), + code: Object.values(CodeType), + math: Object.values(MathType), + form: Object.values(FormType), + layout: Object.values(LayoutType), + email: Object.values(EmailType), +}; + +const reverseMap = new Map<string, ElementCategory>(); +for (const [category, types] of Object.entries(ontology)) { + for (const t of types) { + reverseMap.set(t, category as ElementCategory); + } +} + +/** + * Return the {@link ElementCategory} for a given element type string. + * + * @returns The category, or `undefined` for unrecognised types. 
+ * + * @example + * ```ts + * categoryOf("title"); // => "text" + * categoryOf("unknown"); // => undefined + * ``` + */ +export function categoryOf(type: string): ElementCategory | undefined { + return reverseMap.get(type); +} diff --git a/packages/nvisy-core/src/errors/index.ts b/packages/nvisy-core/src/errors/index.ts index 0e240e0..38daf1a 100644 --- a/packages/nvisy-core/src/errors/index.ts +++ b/packages/nvisy-core/src/errors/index.ts @@ -10,6 +10,7 @@ * - {@link RuntimeError} — `true` (transient failures) * - {@link ValidationError} — `false` (bad input won't fix itself) * - {@link ConnectionError} — `true` (network issues are transient) + * - {@link TimeoutError} — `true` (timeouts are transient) * - {@link CancellationError} — `false` (intentional cancellation) * * @module @@ -19,4 +20,5 @@ export { CancellationError } from "./cancellation.js"; export { ConnectionError } from "./connection.js"; export type { ErrorContext, RuntimeErrorOptions } from "./runtime.js"; export { RuntimeError } from "./runtime.js"; +export { TimeoutError } from "./timeout.js"; export { ValidationError } from "./validation.js"; diff --git a/packages/nvisy-core/src/errors/timeout.ts b/packages/nvisy-core/src/errors/timeout.ts new file mode 100644 index 0000000..f6243ba --- /dev/null +++ b/packages/nvisy-core/src/errors/timeout.ts @@ -0,0 +1,15 @@ +import type { RuntimeErrorOptions } from "./runtime.js"; +import { RuntimeError } from "./runtime.js"; + +/** + * Thrown when an operation exceeds its time limit. + * + * Defaults to `retryable: true` because timeouts are typically transient. + * The engine handles retry timing via its backoff policies — this class + * does not carry a `retryAfterMs` field. + */ +export class TimeoutError extends RuntimeError { + constructor(message: string, options?: RuntimeErrorOptions) { + super(message, { retryable: true, ...options }); + } +} diff --git a/packages/nvisy-core/src/index.ts b/packages/nvisy-core/src/index.ts index e105aa8..2c7d292 100644 --- a/packages/nvisy-core/src/index.ts +++ b/packages/nvisy-core/src/index.ts @@ -7,43 +7,67 @@ export type { ActionInstance } from "./action.js"; export { Action } from "./action.js"; export type { + CompositeElementOptions, Datatype, - DocumentElement, DocumentOptions, - DocumentPage, - DocumentSection, - ElementType, - JsonValue, - Metadata, + ElementOptions, + ElementProvenance, + EmailElementOptions, + EmphasizedText, + FormElementOptions, + FormKeyValuePair, + ImageElementOptions, + Link, + TableCellData, + TableElementOptions, } from "./datatypes/index.js"; export { Blob, blobDatatype, + CompositeElement, Data, Datatypes, Document, documentDatatype, + Element, + EmailElement, Embedding, embeddingDatatype, + FormElement, + ImageElement, + TableElement, } from "./datatypes/index.js"; +export type { + ElementCategory, + ElementCoordinates, + Orientation, + Point, +} from "./documents/index.js"; +export { + CodeType, + CoordinateSystem, + categoryOf, + ElementType, + EmailType, + FormType, + LayoutType, + MathType, + MediaType, + Orientations, + ontology, + TableType, + TextType, +} from "./documents/index.js"; export type { ErrorContext } from "./errors/index.js"; export { CancellationError, ConnectionError, RuntimeError, + TimeoutError, ValidationError, } from "./errors/index.js"; -export type { - LoaderConfig, - LoaderInstance, - LoadFn, - PlaintextParams, -} from "./loaders/index.js"; -export { - Loader, - plaintextLoader, - plaintextParamsSchema, -} from "./loaders/index.js"; +export type { LoaderConfig, LoaderInstance, 
LoadFn } from "./loader.js"; +export { Loader } from "./loader.js"; export type { AnyActionInstance, AnyLoaderInstance, @@ -66,17 +90,18 @@ export type { WriterFn, } from "./stream.js"; export { Stream } from "./stream.js"; -export type { ClassRef } from "./types.js"; +export type { ClassRef, JsonValue, Metadata } from "./types.js"; import { blobDatatype, documentDatatype, embeddingDatatype, } from "./datatypes/index.js"; -import { plaintextLoader } from "./loaders/index.js"; import { Plugin } from "./plugin.js"; -/** Built-in core plugin that registers the Document, Blob, and Embedding datatypes, and plaintext loader. */ -export const corePlugin = Plugin.define("core") - .withDatatypes(documentDatatype, blobDatatype, embeddingDatatype) - .withLoaders(plaintextLoader); +/** Built-in core plugin that registers the Document, Blob, and Embedding datatypes. */ +export const corePlugin = Plugin.define("core").withDatatypes( + documentDatatype, + blobDatatype, + embeddingDatatype, +); diff --git a/packages/nvisy-core/src/loaders/loader.ts b/packages/nvisy-core/src/loader.ts similarity index 96% rename from packages/nvisy-core/src/loaders/loader.ts rename to packages/nvisy-core/src/loader.ts index 882c658..f80118f 100644 --- a/packages/nvisy-core/src/loaders/loader.ts +++ b/packages/nvisy-core/src/loader.ts @@ -1,6 +1,6 @@ import type { z } from "zod"; -import type { Blob } from "../datatypes/blob.js"; -import type { Document } from "../datatypes/document.js"; +import type { Blob } from "./datatypes/blob.js"; +import type { Document } from "./datatypes/document.js"; /** * Function that transforms a Blob into one or more Documents. diff --git a/packages/nvisy-core/src/plugin.ts b/packages/nvisy-core/src/plugin.ts index afbf766..e85cacb 100644 --- a/packages/nvisy-core/src/plugin.ts +++ b/packages/nvisy-core/src/plugin.ts @@ -1,6 +1,6 @@ import type { ActionInstance } from "./action.js"; import type { Datatype } from "./datatypes/index.js"; -import type { LoaderInstance } from "./loaders/loader.js"; +import type { LoaderInstance } from "./loader.js"; import type { ProviderFactory } from "./provider.js"; import type { StreamSource, StreamTarget } from "./stream.js"; diff --git a/packages/nvisy-core/src/types.ts b/packages/nvisy-core/src/types.ts index 799363a..49e83ff 100644 --- a/packages/nvisy-core/src/types.ts +++ b/packages/nvisy-core/src/types.ts @@ -1,2 +1,25 @@ +/** + * A JSON-compatible value. + * + * Mirrors the types that `JSON.parse` can return and `JSON.stringify` + * can accept, making it safe for serialisation boundaries (APIs, + * databases, message queues). + */ +export type JsonValue = + | string + | number + | boolean + | null + | JsonValue[] + | { [key: string]: JsonValue }; + +/** + * Key-value metadata bag attached to {@link Data} items. + * + * All values must be JSON-serialisable so metadata can travel across + * process boundaries without lossy conversion. + */ +export type Metadata = Record<string, JsonValue>; + /** Constructor reference for runtime `instanceof` checks and generic type inference. 
*/ export type ClassRef<T> = abstract new (...args: never[]) => T; diff --git a/packages/nvisy-plugin-ai/src/actions/chunk.ts b/packages/nvisy-plugin-ai/src/actions/chunk.ts index d126537..2ba5440 100644 --- a/packages/nvisy-plugin-ai/src/actions/chunk.ts +++ b/packages/nvisy-plugin-ai/src/actions/chunk.ts @@ -1,4 +1,4 @@ -import type { DocumentPage, DocumentSection } from "@nvisy/core"; +import type { Element } from "@nvisy/core"; import { Action, Document } from "@nvisy/core"; import { z } from "zod"; import { Chunk } from "../datatypes/index.js"; @@ -47,12 +47,14 @@ async function* transformChunk( for await (const doc of stream) { switch (params.strategy) { case "page": { - if (doc.pages != null && doc.pages.length > 0) { - for (let i = 0; i < doc.pages.length; i++) { - const page = doc.pages[i]!; - yield new Chunk(Document.fromPages([page]).content, { + if (doc.elements != null && doc.elements.length > 0) { + const groups = groupByPage(doc.elements); + const pages = [...groups.entries()].sort(([a], [b]) => a - b); + for (let i = 0; i < pages.length; i++) { + const [, els] = pages[i]!; + yield new Chunk(Document.fromElements(els).content, { chunkIndex: i, - chunkTotal: doc.pages.length, + chunkTotal: pages.length, }).deriveFrom(doc); } continue; @@ -60,14 +62,14 @@ async function* transformChunk( break; } case "section": { - if (doc.pages != null && doc.pages.length > 0) { - const sections = chunkSectionsByLevel(doc.pages, params.level); + if (doc.elements != null && doc.elements.length > 0) { + const sections = splitByHeadingLevel(doc.elements, params.level); for (let i = 0; i < sections.length; i++) { - const sec = sections[i]!; - yield new Chunk( - Document.fromPages([{ pageNumber: 1, sections: [sec] }]).content, - { chunkIndex: i, chunkTotal: sections.length }, - ).deriveFrom(doc); + const els = sections[i]!; + yield new Chunk(Document.fromElements(els).content, { + chunkIndex: i, + chunkTotal: sections.length, + }).deriveFrom(doc); } continue; } @@ -98,6 +100,45 @@ async function* transformChunk( } } +/** Group elements by their `pageNumber`, preserving document order. */ +function groupByPage(elements: readonly Element[]): Map<number, Element[]> { + const groups = new Map<number, Element[]>(); + for (const el of elements) { + const page = el.pageNumber ?? 1; + let group = groups.get(page); + if (group == null) { + group = []; + groups.set(page, group); + } + group.push(el); + } + return groups; +} + +/** Split elements into sections at headings of the given level. */ +function splitByHeadingLevel( + elements: readonly Element[], + level: number, +): Element[][] { + const sections: Element[][] = []; + let current: Element[] = []; + + for (const el of elements) { + if (el.type === "title" && el.level != null && el.level <= level) { + if (current.length > 0) { + sections.push(current); + } + current = [el]; + } else { + current.push(el); + } + } + if (current.length > 0) { + sections.push(current); + } + return sections; +} + function chunkByCharacter( text: string, size: number, @@ -140,35 +181,3 @@ function chunkByPage(text: string): string[] { } return chunks.length > 0 ? chunks : [text]; } - -/** Walk the page->section tree, collecting sections at the target depth level. 
*/ -function chunkSectionsByLevel( - pages: readonly DocumentPage[], - targetLevel: number, -): DocumentSection[] { - const out: DocumentSection[] = []; - for (const page of pages) { - for (const section of page.sections) { - collectSectionsAtLevel(section, 1, targetLevel, out); - } - } - return out; -} - -/** Recursively traverse the section tree, collecting sections at the target depth. */ -function collectSectionsAtLevel( - section: DocumentSection, - currentLevel: number, - targetLevel: number, - out: DocumentSection[], -): void { - if (currentLevel === targetLevel) { - out.push(section); - return; - } - if (section.children) { - for (const child of section.children) { - collectSectionsAtLevel(child, currentLevel + 1, targetLevel, out); - } - } -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich.ts b/packages/nvisy-plugin-ai/src/actions/enrich.ts index 221ace3..6f4e7ad 100644 --- a/packages/nvisy-plugin-ai/src/actions/enrich.ts +++ b/packages/nvisy-plugin-ai/src/actions/enrich.ts @@ -86,7 +86,7 @@ async function* transformEnrich( yield new Document(doc.content, { ...(doc.sourceType != null ? { sourceType: doc.sourceType } : {}), - ...(doc.pages != null ? { pages: doc.pages } : {}), + ...(doc.elements != null ? { elements: doc.elements } : {}), }) .deriveFrom(doc) .withMetadata({ ...(doc.metadata ?? {}), ...enrichedMeta }); diff --git a/packages/nvisy-plugin-ai/src/actions/partition.ts b/packages/nvisy-plugin-ai/src/actions/partition.ts index fd5e275..6462531 100644 --- a/packages/nvisy-plugin-ai/src/actions/partition.ts +++ b/packages/nvisy-plugin-ai/src/actions/partition.ts @@ -64,8 +64,8 @@ async function* transformPartition( partTotal: parts.length, }; yield new Document(parts[i]!, { - ...(params.strategy === "auto" && item.pages != null - ? { pages: item.pages } + ...(params.strategy === "auto" && item.elements != null + ? { elements: item.elements } : {}), }) .withParent(sourceId) diff --git a/packages/nvisy-plugin-markup/src/index.ts b/packages/nvisy-plugin-markup/src/index.ts index 2020a01..c02494e 100644 --- a/packages/nvisy-plugin-markup/src/index.ts +++ b/packages/nvisy-plugin-markup/src/index.ts @@ -8,6 +8,11 @@ */ import { Plugin } from "@nvisy/core"; +import { plaintextLoader } from "./loaders/index.js"; + +export type { PlaintextParams } from "./loaders/index.js"; +export { plaintextLoader, plaintextParamsSchema } from "./loaders/index.js"; /** Markup parsing plugin instance. 
*/ -export const markupPlugin = Plugin.define("markup"); +export const markupPlugin = + Plugin.define("markup").withLoaders(plaintextLoader); diff --git a/packages/nvisy-core/src/loaders/index.ts b/packages/nvisy-plugin-markup/src/loaders/index.ts similarity index 53% rename from packages/nvisy-core/src/loaders/index.ts rename to packages/nvisy-plugin-markup/src/loaders/index.ts index e4b475f..1688cb8 100644 --- a/packages/nvisy-core/src/loaders/index.ts +++ b/packages/nvisy-plugin-markup/src/loaders/index.ts @@ -1,4 +1,2 @@ -export type { LoaderConfig, LoaderInstance, LoadFn } from "./loader.js"; -export { Loader } from "./loader.js"; export type { PlaintextParams } from "./plaintext.js"; export { plaintextLoader, plaintextParamsSchema } from "./plaintext.js"; diff --git a/packages/nvisy-core/src/loaders/plaintext.test.ts b/packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts similarity index 95% rename from packages/nvisy-core/src/loaders/plaintext.test.ts rename to packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts index f143c93..7f641b9 100644 --- a/packages/nvisy-core/src/loaders/plaintext.test.ts +++ b/packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts @@ -1,10 +1,9 @@ +import type { Document } from "@nvisy/core"; +import { Blob } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Blob } from "../datatypes/blob.js"; import { plaintextLoader } from "./plaintext.js"; -async function collectDocs( - iter: AsyncIterable<import("../datatypes/document.js").Document>, -) { +async function collectDocs(iter: AsyncIterable<Document>) { const docs = []; for await (const doc of iter) { docs.push(doc); diff --git a/packages/nvisy-core/src/loaders/plaintext.ts b/packages/nvisy-plugin-markup/src/loaders/plaintext.ts similarity index 90% rename from packages/nvisy-core/src/loaders/plaintext.ts rename to packages/nvisy-plugin-markup/src/loaders/plaintext.ts index 812820b..eb8cea4 100644 --- a/packages/nvisy-core/src/loaders/plaintext.ts +++ b/packages/nvisy-plugin-markup/src/loaders/plaintext.ts @@ -1,6 +1,5 @@ +import { Document, Loader } from "@nvisy/core"; import { z } from "zod"; -import { Document } from "../datatypes/document.js"; -import { Loader } from "./loader.js"; /** Schema for plaintext loader parameters. 
*/ export const plaintextParamsSchema = z diff --git a/packages/nvisy-runtime/src/engine/nodes.ts b/packages/nvisy-runtime/src/engine/nodes.ts index 579a383..18f82c0 100644 --- a/packages/nvisy-runtime/src/engine/nodes.ts +++ b/packages/nvisy-runtime/src/engine/nodes.ts @@ -9,7 +9,7 @@ import { getLogger } from "@logtape/logtape"; import type { Data } from "@nvisy/core"; -import { RuntimeError, ValidationError } from "@nvisy/core"; +import { TimeoutError, ValidationError } from "@nvisy/core"; import { call, type Operation, spawn } from "effection"; import type { ResolvedActionNode, @@ -292,9 +292,8 @@ export function* executeNode( const timeoutFallback: NodeResult = { nodeId, status: "failure", - error: new RuntimeError(`Node ${nodeId} timed out after ${timeoutMs}ms`, { + error: new TimeoutError(`Node ${nodeId} timed out after ${timeoutMs}ms`, { source: "engine", - retryable: true, }), itemsProcessed: 0, }; diff --git a/packages/nvisy-server/src/service/engine-factory.ts b/packages/nvisy-server/src/service/engine-factory.ts index d26204c..087e67a 100644 --- a/packages/nvisy-server/src/service/engine-factory.ts +++ b/packages/nvisy-server/src/service/engine-factory.ts @@ -3,10 +3,10 @@ import { aiPlugin } from "@nvisy/plugin-ai"; import { markupPlugin } from "@nvisy/plugin-markup"; import { nosqlPlugin } from "@nvisy/plugin-nosql"; import { objectPlugin } from "@nvisy/plugin-object"; -import { tesseractPlugin } from "@nvisy/plugin-tesseract"; import { pandocPlugin } from "@nvisy/plugin-pandoc"; import { queuePlugin } from "@nvisy/plugin-queue"; import { sqlPlugin } from "@nvisy/plugin-sql"; +import { tesseractPlugin } from "@nvisy/plugin-tesseract"; import { vectorPlugin } from "@nvisy/plugin-vector"; import { Engine } from "@nvisy/runtime"; From ea9565df9102c3f181012fc9671eae39e9ac2327 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sat, 7 Feb 2026 23:52:32 +0100 Subject: [PATCH 03/17] refactor(core, ai): move chunk/partition actions to core, split enrich strategies, add tests Move rule-based chunk and partition actions from nvisy-plugin-ai to nvisy-core since they have no AI dependency. Split enrich.ts into individual strategy files following the same pattern. Add maxCharacters/combineUnder/inferTableStructure options and comprehensive tests for all strategies. 
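For reference, a minimal usage sketch of the relocated character chunker as exercised by the new tests (the relative import paths assume a caller sitting beside the new actions/ folder; the sample text is arbitrary):

```ts
import { Document } from "../datatypes/index.js";
import { chunkByCharacter } from "./chunk-by-character.js";

const doc = new Document("The quick brown fox jumps over the lazy dog.");
// step = maxCharacters - overlap, so chunk starts advance 12 characters at a time.
for (const chunk of chunkByCharacter(doc, { maxCharacters: 16, overlap: 4 })) {
  // chunkIndex and chunkTotal are set by the helper; content is the text slice.
  console.log(chunk.chunkIndex, chunk.chunkTotal, chunk.content);
}
```
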
Key changes: - Move Chunk datatype, chunk (character/section/page), and partition (auto/rule) actions to nvisy-core with full test coverage - Split enrich into enrich-by-metadata, enrich-by-ner, enrich-by-description, enrich-by-table-html strategy files - Use plain interfaces in strategy files, zod .extend() schemas in combining files - Move parseJsonResponse to providers/client.ts - Rename embed.ts to generate-embedding.ts - Move action.ts into actions/ folder Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- package-lock.json | 20 +- .../nvisy-core/src/{ => actions}/action.ts | 15 +- .../src/actions/chunk-by-character.test.ts | 85 ++++++++ .../src/actions/chunk-by-character.ts | 41 ++++ .../src/actions/chunk-by-page.test.ts | 52 +++++ .../nvisy-core/src/actions/chunk-by-page.ts | 70 +++++++ .../src/actions/chunk-by-section.test.ts | 142 ++++++++++++++ .../src/actions/chunk-by-section.ts | 126 ++++++++++++ packages/nvisy-core/src/actions/chunk.ts | 88 +++++++++ packages/nvisy-core/src/actions/index.ts | 10 + .../src/actions/partition-by-auto.test.ts | 24 +++ .../src/actions/partition-by-auto.ts | 21 ++ .../src/actions/partition-by-rule.test.ts | 113 +++++++++++ .../src/actions/partition-by-rule.ts | 82 ++++++++ .../src/actions/partition.ts | 57 +++--- packages/nvisy-core/src/datatypes/blob.ts | 6 + .../nvisy-core/src/datatypes/chunk.test.ts | 77 ++++++++ .../src/datatypes/chunk.ts | 8 +- packages/nvisy-core/src/datatypes/data.ts | 6 + .../nvisy-core/src/datatypes/document.test.ts | 47 +++++ packages/nvisy-core/src/datatypes/document.ts | 28 +++ .../nvisy-core/src/datatypes/embedding.ts | 6 + packages/nvisy-core/src/datatypes/index.ts | 13 +- .../nvisy-core/src/errors/cancellation.ts | 6 + packages/nvisy-core/src/errors/connection.ts | 6 + packages/nvisy-core/src/errors/runtime.ts | 6 + packages/nvisy-core/src/errors/timeout.ts | 6 + packages/nvisy-core/src/errors/validation.ts | 6 + packages/nvisy-core/src/index.ts | 26 +-- packages/nvisy-core/src/loader.ts | 10 + packages/nvisy-core/src/plugin.ts | 13 +- packages/nvisy-core/src/provider.ts | 11 ++ packages/nvisy-core/src/stream.ts | 11 ++ packages/nvisy-core/src/types.ts | 6 + packages/nvisy-core/test/action.fixtures.ts | 2 +- packages/nvisy-core/test/action.test.ts | 2 +- .../src/actions/chunk-contextual.ts | 3 +- .../src/actions/chunk-similarity.ts | 3 +- packages/nvisy-plugin-ai/src/actions/chunk.ts | 183 ------------------ .../src/actions/enrich-by-description.ts | 35 ++++ .../src/actions/enrich-by-metadata.ts | 35 ++++ .../src/actions/enrich-by-ner.ts | 39 ++++ .../src/actions/enrich-by-table-html.ts | 33 ++++ .../nvisy-plugin-ai/src/actions/enrich.ts | 172 ++++++---------- .../{embed.ts => generate-embedding.ts} | 0 packages/nvisy-plugin-ai/src/actions/index.ts | 4 +- .../nvisy-plugin-ai/src/datatypes/index.ts | 4 +- packages/nvisy-plugin-ai/src/index.ts | 15 +- .../nvisy-plugin-ai/src/providers/client.ts | 27 ++- 49 files changed, 1435 insertions(+), 366 deletions(-) rename packages/nvisy-core/src/{ => actions}/action.ts (94%) create mode 100644 packages/nvisy-core/src/actions/chunk-by-character.test.ts create mode 100644 packages/nvisy-core/src/actions/chunk-by-character.ts create mode 100644 packages/nvisy-core/src/actions/chunk-by-page.test.ts create mode 100644 packages/nvisy-core/src/actions/chunk-by-page.ts create mode 100644 packages/nvisy-core/src/actions/chunk-by-section.test.ts create mode 100644 packages/nvisy-core/src/actions/chunk-by-section.ts create mode 100644 packages/nvisy-core/src/actions/chunk.ts 
create mode 100644 packages/nvisy-core/src/actions/index.ts create mode 100644 packages/nvisy-core/src/actions/partition-by-auto.test.ts create mode 100644 packages/nvisy-core/src/actions/partition-by-auto.ts create mode 100644 packages/nvisy-core/src/actions/partition-by-rule.test.ts create mode 100644 packages/nvisy-core/src/actions/partition-by-rule.ts rename packages/{nvisy-plugin-ai => nvisy-core}/src/actions/partition.ts (57%) create mode 100644 packages/nvisy-core/src/datatypes/chunk.test.ts rename packages/{nvisy-plugin-ai => nvisy-core}/src/datatypes/chunk.ts (92%) delete mode 100644 packages/nvisy-plugin-ai/src/actions/chunk.ts create mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts create mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts create mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts create mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts rename packages/nvisy-plugin-ai/src/actions/{embed.ts => generate-embedding.ts} (100%) diff --git a/package-lock.json b/package-lock.json index bd49562..7d1c95e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,7 +6,8 @@ "": { "name": "@nvisy/monorepo", "workspaces": [ - "packages/*" + "packages/*", + "sdks/nvisy-ts" ], "devDependencies": { "@biomejs/biome": "^2.3.14", @@ -5872,6 +5873,10 @@ } } }, + "node_modules/nvisy": { + "resolved": "sdks/nvisy-ts", + "link": true + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -7600,6 +7605,19 @@ "engines": { "node": ">=22.0.0" } + }, + "sdks/nvisy-ts": { + "name": "nvisy", + "version": "0.1.0", + "devDependencies": { + "@biomejs/biome": "^2.3.14", + "tsup": "^8.5.1", + "typescript": "^5.9.3", + "vitest": "^4.0.18" + }, + "engines": { + "node": ">=22.0.0" + } } } } diff --git a/packages/nvisy-core/src/action.ts b/packages/nvisy-core/src/actions/action.ts similarity index 94% rename from packages/nvisy-core/src/action.ts rename to packages/nvisy-core/src/actions/action.ts index e9ade5a..0004ea2 100644 --- a/packages/nvisy-core/src/action.ts +++ b/packages/nvisy-core/src/actions/action.ts @@ -1,6 +1,17 @@ +/** + * Action factory and type definitions for stream transforms. + * + * Actions are the intermediate processing steps in a pipeline, + * transforming data between sources and targets. Use + * {@link Action.withoutClient} for rule-based transforms and + * {@link Action.withClient} for transforms that require a provider. + * + * @module + */ + import type { z } from "zod"; -import type { Data } from "./datatypes/data.js"; -import type { ClassRef } from "./types.js"; +import type { Data } from "../datatypes/data.js"; +import type { ClassRef } from "../types.js"; /** * Stream transform that operates without a provider client. 
diff --git a/packages/nvisy-core/src/actions/chunk-by-character.test.ts b/packages/nvisy-core/src/actions/chunk-by-character.test.ts new file mode 100644 index 0000000..7bc9749 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-character.test.ts @@ -0,0 +1,85 @@ +import { describe, expect, it } from "vitest"; +import { Document } from "../datatypes/index.js"; +import { chunkByCharacter } from "./chunk-by-character.js"; + +describe("chunkByCharacter", () => { + it("splits text into chunks of maxCharacters", () => { + const doc = new Document("abcdefghij"); + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 3, + overlap: 0, + }), + ]; + expect(chunks).toHaveLength(4); + expect(chunks[0]!.content).toBe("abc"); + expect(chunks[1]!.content).toBe("def"); + expect(chunks[2]!.content).toBe("ghi"); + expect(chunks[3]!.content).toBe("j"); + }); + + it("applies overlap between chunks", () => { + const doc = new Document("abcdefghij"); + // maxCharacters=5, overlap=2 → step=3, starts at 0, 3, 6, 9 + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 5, + overlap: 2, + }), + ]; + expect(chunks[0]!.content).toBe("abcde"); + expect(chunks[1]!.content).toBe("defgh"); + expect(chunks[2]!.content).toBe("ghij"); + }); + + it("yields nothing when step is zero", () => { + const doc = new Document("hello"); + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 3, + overlap: 3, + }), + ]; + expect(chunks).toHaveLength(0); + }); + + it("sets chunkIndex and chunkTotal on each chunk", () => { + const doc = new Document("abcdef"); + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 2, + overlap: 0, + }), + ]; + expect(chunks).toHaveLength(3); + for (let i = 0; i < chunks.length; i++) { + expect(chunks[i]!.chunkIndex).toBe(i); + expect(chunks[i]!.chunkTotal).toBe(3); + } + }); + + it("derives chunks from the source document", () => { + const doc = new Document("abcdef"); + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 3, + overlap: 0, + }), + ]; + for (const chunk of chunks) { + expect(chunk.parentId).toBe(doc.id); + } + }); + + it("returns single chunk when text fits in maxCharacters", () => { + const doc = new Document("abc"); + const chunks = [ + ...chunkByCharacter(doc, { + maxCharacters: 10, + overlap: 0, + }), + ]; + expect(chunks).toHaveLength(1); + expect(chunks[0]!.content).toBe("abc"); + }); +}); diff --git a/packages/nvisy-core/src/actions/chunk-by-character.ts b/packages/nvisy-core/src/actions/chunk-by-character.ts new file mode 100644 index 0000000..cc233f8 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-character.ts @@ -0,0 +1,41 @@ +/** + * Character-based chunking strategy. + * + * Splits document content into fixed-size character windows + * with configurable overlap. + * + * @module + */ + +import type { Document } from "../datatypes/index.js"; +import { Chunk } from "../datatypes/index.js"; + +/** Character-strategy parameters. */ +export interface CharacterStrategyParams { + /** Maximum chunk size in characters. */ + readonly maxCharacters: number; + /** Number of overlapping characters between chunks. */ + readonly overlap: number; +} + +/** Split a document into fixed-size character chunks with optional overlap. 
*/ +export function* chunkByCharacter( + doc: Document, + params: CharacterStrategyParams, +): Generator<Chunk> { + const text = doc.content; + const step = params.maxCharacters - params.overlap; + if (step <= 0) return; + + const total = Math.ceil(text.length / step); + let index = 0; + let start = 0; + while (start < text.length) { + yield new Chunk(text.slice(start, start + params.maxCharacters), { + chunkIndex: index, + chunkTotal: total, + }).deriveFrom(doc); + index++; + start += step; + } +} diff --git a/packages/nvisy-core/src/actions/chunk-by-page.test.ts b/packages/nvisy-core/src/actions/chunk-by-page.test.ts new file mode 100644 index 0000000..6ff3992 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-page.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from "vitest"; +import { Document } from "../datatypes/index.js"; +import { chunkByPage } from "./chunk-by-page.js"; + +describe("chunkByPage", () => { + it("splits on form feed markers", () => { + const doc = new Document("Page 1\fPage 2\fPage 3"); + const chunks = [...chunkByPage(doc, {})]; + expect(chunks).toHaveLength(3); + expect(chunks[0]!.content).toBe("Page 1"); + expect(chunks[1]!.content).toBe("Page 2"); + expect(chunks[2]!.content).toBe("Page 3"); + }); + + describe("maxCharacters", () => { + it("splits long pages into smaller chunks", () => { + const longPage = "a".repeat(100); + const doc = new Document(`${longPage}\f${"b".repeat(20)}`); + const chunks = [...chunkByPage(doc, { maxCharacters: 30 })]; + for (const chunk of chunks) { + expect(chunk.content.length).toBeLessThanOrEqual(30); + } + // 100 chars / 30 = 4 pieces + 1 short page = 5 + expect(chunks).toHaveLength(5); + }); + + it("leaves short pages intact", () => { + const doc = new Document("Page 1\fPage 2"); + const chunks = [...chunkByPage(doc, { maxCharacters: 1000 })]; + expect(chunks).toHaveLength(2); + expect(chunks[0]!.content).toBe("Page 1"); + expect(chunks[1]!.content).toBe("Page 2"); + }); + + it("updates chunkIndex and chunkTotal after splitting", () => { + const doc = new Document("a".repeat(50)); + const chunks = [...chunkByPage(doc, { maxCharacters: 20 })]; + for (let i = 0; i < chunks.length; i++) { + expect(chunks[i]!.chunkIndex).toBe(i); + expect(chunks[i]!.chunkTotal).toBe(chunks.length); + } + }); + }); + + it("derives all chunks from the source document", () => { + const doc = new Document("Page 1\fPage 2"); + const chunks = [...chunkByPage(doc, {})]; + for (const chunk of chunks) { + expect(chunk.parentId).toBe(doc.id); + } + }); +}); diff --git a/packages/nvisy-core/src/actions/chunk-by-page.ts b/packages/nvisy-core/src/actions/chunk-by-page.ts new file mode 100644 index 0000000..7be1996 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-page.ts @@ -0,0 +1,70 @@ +/** + * Page-based chunking strategy. + * + * Splits documents at page boundaries. When structured elements with + * page numbers are available, elements are grouped by page; otherwise + * the raw text is split on common page-break markers (`\f`, `---`, + * `***`). + * + * @module + */ + +import { Chunk, Document } from "../datatypes/index.js"; + +/** Page-strategy parameters. */ +export interface PageStrategyParams { + /** Optional maximum chunk size in characters. Splits pages that exceed this limit. */ + readonly maxCharacters?: number | undefined; +} + +/** Split a document on page boundaries (elements or text markers). 
*/ +export function* chunkByPage( + doc: Document, + params: PageStrategyParams, +): Generator<Chunk> { + let texts: string[]; + + // Element-based path: group elements by page number + if (doc.elements != null && doc.elements.length > 0) { + texts = [...doc.getElementsByPage().entries()] + .sort(([a], [b]) => a - b) + .map(([, els]) => Document.fromElements(els).content); + } else { + // Fallback: split on common page break markers + const pages = doc.content.split(/\f|\n---\n|\n\*\*\*\n/); + const filtered: string[] = []; + for (const page of pages) { + const trimmed = page.trim(); + if (trimmed.length > 0) { + filtered.push(trimmed); + } + } + texts = filtered.length > 0 ? filtered : [doc.content]; + } + + if (params.maxCharacters != null) { + texts = splitLongTexts(texts, params.maxCharacters); + } + + for (let i = 0; i < texts.length; i++) { + yield new Chunk(texts[i]!, { + chunkIndex: i, + chunkTotal: texts.length, + }).deriveFrom(doc); + } +} + +/** Split texts that exceed maxCharacters into smaller pieces. */ +function splitLongTexts(texts: string[], max: number): string[] { + const result: string[] = []; + for (const text of texts) { + if (text.length <= max) { + result.push(text); + } else { + for (let i = 0; i < text.length; i += max) { + result.push(text.slice(i, i + max)); + } + } + } + return result; +} diff --git a/packages/nvisy-core/src/actions/chunk-by-section.test.ts b/packages/nvisy-core/src/actions/chunk-by-section.test.ts new file mode 100644 index 0000000..1456701 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-section.test.ts @@ -0,0 +1,142 @@ +import { describe, expect, it } from "vitest"; +import { Document } from "../datatypes/index.js"; +import { chunkBySection } from "./chunk-by-section.js"; + +describe("chunkBySection", () => { + const markdown = [ + "Intro text", + "## Section A", + "Content A is here", + "## Section B", + "Content B is here", + "## Section C", + "Content C is short", + ].join("\n"); + + it("splits on heading level", () => { + const doc = new Document(markdown); + const chunks = [...chunkBySection(doc, { level: 2 })]; + expect(chunks).toHaveLength(4); + expect(chunks[0]!.content).toBe("Intro text"); + expect(chunks[1]!.content).toContain("Section A"); + expect(chunks[2]!.content).toContain("Section B"); + expect(chunks[3]!.content).toContain("Section C"); + }); + + describe("maxCharacters", () => { + it("splits long sections into smaller chunks", () => { + const longContent = `## Title\n${"x".repeat(100)}`; + const doc = new Document(longContent); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + maxCharacters: 30, + }), + ]; + for (const chunk of chunks) { + expect(chunk.content.length).toBeLessThanOrEqual(30); + } + expect(chunks.length).toBeGreaterThan(1); + }); + + it("leaves short sections intact", () => { + const doc = new Document("## Short\nHello"); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + maxCharacters: 1000, + }), + ]; + expect(chunks).toHaveLength(1); + expect(chunks[0]!.content).toContain("Hello"); + }); + + it("updates chunkIndex and chunkTotal after splitting", () => { + const longContent = `## Title\n${"a".repeat(50)}`; + const doc = new Document(longContent); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + maxCharacters: 20, + }), + ]; + for (let i = 0; i < chunks.length; i++) { + expect(chunks[i]!.chunkIndex).toBe(i); + expect(chunks[i]!.chunkTotal).toBe(chunks.length); + } + }); + }); + + describe("combineUnder", () => { + it("merges consecutive short sections", () => { + 
const short = "## A\naa\n## B\nbb\n## C\ncc"; + const doc = new Document(short); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + combineUnder: 200, + }), + ]; + expect(chunks).toHaveLength(1); + expect(chunks[0]!.content).toContain("## A"); + expect(chunks[0]!.content).toContain("## C"); + }); + + it("does not merge sections that exceed threshold", () => { + const sections = [ + "## A", + "a".repeat(50), + "## B", + "b".repeat(50), + "## C", + "c".repeat(50), + ].join("\n"); + const doc = new Document(sections); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + combineUnder: 30, + }), + ]; + expect(chunks).toHaveLength(3); + }); + + it("combines then splits with both options", () => { + const sections = "## A\naa\n## B\nbb\n## C\ncc"; + const doc = new Document(sections); + // Combine first (all short), then split result + const chunks = [ + ...chunkBySection(doc, { + level: 2, + combineUnder: 500, + maxCharacters: 10, + }), + ]; + for (const chunk of chunks) { + expect(chunk.content.length).toBeLessThanOrEqual(10); + } + }); + + it("keeps long sections separate", () => { + const sections = ["## Short", "hi", "## Long", "x".repeat(200)].join( + "\n", + ); + const doc = new Document(sections); + const chunks = [ + ...chunkBySection(doc, { + level: 2, + combineUnder: 50, + }), + ]; + expect(chunks.length).toBeGreaterThanOrEqual(2); + }); + }); + + it("derives all chunks from the source document", () => { + const doc = new Document(markdown); + const chunks = [...chunkBySection(doc, { level: 2 })]; + for (const chunk of chunks) { + expect(chunk.parentId).toBe(doc.id); + } + }); +}); diff --git a/packages/nvisy-core/src/actions/chunk-by-section.ts b/packages/nvisy-core/src/actions/chunk-by-section.ts new file mode 100644 index 0000000..7a00d23 --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk-by-section.ts @@ -0,0 +1,126 @@ +/** + * Section-based chunking strategy. + * + * Splits documents at markdown heading boundaries. When structured + * elements are available, headings are matched by {@link Element.level}; + * otherwise the raw text is split on `#`-prefixed lines. + * + * @module + */ + +import type { Element } from "../datatypes/index.js"; +import { Chunk, Document } from "../datatypes/index.js"; + +/** Section-strategy parameters. */ +export interface SectionStrategyParams { + /** Heading level to split on (1-6). */ + readonly level: number; + /** Optional maximum chunk size in characters. Splits sections that exceed this limit. */ + readonly maxCharacters?: number | undefined; + /** Combine consecutive sections whose text is shorter than this threshold. */ + readonly combineUnder?: number | undefined; +} + +/** Split a document into sections at markdown headings of the given level. 
*/ +export function* chunkBySection( + doc: Document, + params: SectionStrategyParams, +): Generator<Chunk> { + let texts: string[]; + + // Element-based path: split structured elements by heading level + if (doc.elements != null && doc.elements.length > 0) { + texts = splitByHeadingLevel(doc.elements, params.level).map( + (els) => Document.fromElements(els).content, + ); + } else { + // Fallback: string-based splitting + const prefix = "#".repeat(params.level); + const pattern = new RegExp(`^${prefix}\\s`, "m"); + const parts = doc.content.split(pattern); + + const sections: string[] = []; + for (let i = 0; i < parts.length; i++) { + const part = parts[i]!.trim(); + if (part.length === 0) continue; + // Re-add the heading prefix for sections after the first + sections.push(i > 0 ? `${prefix} ${part}` : part); + } + texts = sections.length > 0 ? sections : [doc.content]; + } + + if (params.combineUnder != null) { + texts = combineShortTexts(texts, params.combineUnder); + } + + if (params.maxCharacters != null) { + texts = splitLongTexts(texts, params.maxCharacters); + } + + for (let i = 0; i < texts.length; i++) { + yield new Chunk(texts[i]!, { + chunkIndex: i, + chunkTotal: texts.length, + }).deriveFrom(doc); + } +} + +/** Combine consecutive texts that are shorter than the threshold. */ +function combineShortTexts(texts: string[], threshold: number): string[] { + const result: string[] = []; + let buffer = ""; + + for (const text of texts) { + if (buffer.length === 0) { + buffer = text; + } else if (buffer.length + text.length < threshold) { + buffer += `\n\n${text}`; + } else { + result.push(buffer); + buffer = text; + } + } + if (buffer.length > 0) { + result.push(buffer); + } + return result; +} + +/** Split texts that exceed maxCharacters into smaller pieces. */ +function splitLongTexts(texts: string[], max: number): string[] { + const result: string[] = []; + for (const text of texts) { + if (text.length <= max) { + result.push(text); + } else { + for (let i = 0; i < text.length; i += max) { + result.push(text.slice(i, i + max)); + } + } + } + return result; +} + +/** Split elements into sections at headings of the given level. */ +function splitByHeadingLevel( + elements: readonly Element[], + level: number, +): Element[][] { + const sections: Element[][] = []; + let current: Element[] = []; + + for (const el of elements) { + if (el.type === "title" && el.level != null && el.level <= level) { + if (current.length > 0) { + sections.push(current); + } + current = [el]; + } else { + current.push(el); + } + } + if (current.length > 0) { + sections.push(current); + } + return sections; +} diff --git a/packages/nvisy-core/src/actions/chunk.ts b/packages/nvisy-core/src/actions/chunk.ts new file mode 100644 index 0000000..47fed6a --- /dev/null +++ b/packages/nvisy-core/src/actions/chunk.ts @@ -0,0 +1,88 @@ +/** + * Rule-based chunk action that splits documents using character, + * section, or page strategies. 
+ * + * @module + */ + +import { z } from "zod"; +import { Chunk, Document } from "../datatypes/index.js"; +import { Action } from "./action.js"; +import { chunkByCharacter } from "./chunk-by-character.js"; +import { chunkByPage } from "./chunk-by-page.js"; +import { chunkBySection } from "./chunk-by-section.js"; + +export type { CharacterStrategyParams } from "./chunk-by-character.js"; +export type { PageStrategyParams } from "./chunk-by-page.js"; +export type { SectionStrategyParams } from "./chunk-by-section.js"; + +const BaseCharacter = z.object({ + maxCharacters: z.number(), + overlap: z.number().default(0), +}); + +const BaseSection = z.object({ + level: z.number().min(1).max(6).default(2), + maxCharacters: z.number().optional(), + combineUnder: z.number().optional(), +}); + +const BasePage = z.object({ + maxCharacters: z.number().optional(), +}); + +const CharacterStrategy = BaseCharacter.extend({ + strategy: z.literal("character"), +}); + +const SectionStrategy = BaseSection.extend({ + strategy: z.literal("section"), +}); + +const PageStrategy = BasePage.extend({ + strategy: z.literal("page"), +}); + +const ChunkParams = z.discriminatedUnion("strategy", [ + CharacterStrategy, + SectionStrategy, + PageStrategy, +]); + +/** + * Split documents into smaller chunks using various strategies. + * + * - `"character"`: fixed-size character splitting with optional overlap + * - `"section"`: split on markdown headings at a given level + * - `"page"`: split on page boundary markers in content + */ +export const chunkSimple = Action.withoutClient("chunk", { + types: [Document, Chunk], + params: ChunkParams, + transform: transformChunk, +}); + +async function* transformChunk( + stream: AsyncIterable<Document>, + params: z.infer<typeof ChunkParams>, +): AsyncGenerator<Chunk> { + for await (const doc of stream) { + switch (params.strategy) { + case "character": { + const { strategy: _, ...rest } = params; + yield* chunkByCharacter(doc, rest); + break; + } + case "section": { + const { strategy: _, ...rest } = params; + yield* chunkBySection(doc, rest); + break; + } + case "page": { + const { strategy: _, ...rest } = params; + yield* chunkByPage(doc, rest); + break; + } + } + } +} diff --git a/packages/nvisy-core/src/actions/index.ts b/packages/nvisy-core/src/actions/index.ts new file mode 100644 index 0000000..c194294 --- /dev/null +++ b/packages/nvisy-core/src/actions/index.ts @@ -0,0 +1,10 @@ +/** + * @module actions + * + * Action definitions and built-in rule-based transforms. 
+ */ + +export type { ActionInstance } from "./action.js"; +export { Action } from "./action.js"; +export { chunkSimple } from "./chunk.js"; +export { partition } from "./partition.js"; diff --git a/packages/nvisy-core/src/actions/partition-by-auto.test.ts b/packages/nvisy-core/src/actions/partition-by-auto.test.ts new file mode 100644 index 0000000..e253871 --- /dev/null +++ b/packages/nvisy-core/src/actions/partition-by-auto.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, it } from "vitest"; +import { Document } from "../datatypes/index.js"; +import { partitionByAuto } from "./partition-by-auto.js"; + +describe("partitionByAuto", () => { + it("returns document content as a single-element array", () => { + const doc = new Document("Hello, world!"); + const parts = partitionByAuto(doc, {}); + expect(parts).toEqual(["Hello, world!"]); + }); + + it("preserves full content without modification", () => { + const content = "Line 1\nLine 2\n\nParagraph 2"; + const doc = new Document(content); + const parts = partitionByAuto(doc, {}); + expect(parts).toEqual([content]); + }); + + it("handles empty content", () => { + const doc = new Document(""); + const parts = partitionByAuto(doc, {}); + expect(parts).toEqual([""]); + }); +}); diff --git a/packages/nvisy-core/src/actions/partition-by-auto.ts b/packages/nvisy-core/src/actions/partition-by-auto.ts new file mode 100644 index 0000000..c89feae --- /dev/null +++ b/packages/nvisy-core/src/actions/partition-by-auto.ts @@ -0,0 +1,21 @@ +/** + * Auto partition strategy. + * + * Passes document content through as-is, preserving structured + * elements when present. + * + * @module + */ + +import type { Document } from "../datatypes/index.js"; + +/** Auto-strategy parameters (no additional options). */ +export type AutoStrategyParams = Record<string, never>; + +/** Pass document content through unchanged. 
*/ +export function partitionByAuto( + doc: Document, + _params: AutoStrategyParams, +): string[] { + return [doc.content]; +} diff --git a/packages/nvisy-core/src/actions/partition-by-rule.test.ts b/packages/nvisy-core/src/actions/partition-by-rule.test.ts new file mode 100644 index 0000000..3c3c974 --- /dev/null +++ b/packages/nvisy-core/src/actions/partition-by-rule.test.ts @@ -0,0 +1,113 @@ +import { describe, expect, it } from "vitest"; +import { Document, TableElement } from "../datatypes/index.js"; +import { partitionByRule } from "./partition-by-rule.js"; + +describe("partitionByRule", () => { + it("splits on a regex pattern", () => { + const doc = new Document("one---two---three"); + const parts = partitionByRule(doc, { + pattern: "---", + includeDelimiter: false, + inferTableStructure: false, + }); + expect(parts).toEqual(["one", "two", "three"]); + }); + + describe("inferTableStructure", () => { + it("replaces table text with HTML when enabled", () => { + const table = new TableElement({ + type: "table", + text: "Name Age\nAlice 30", + cells: [ + { row: 0, column: 0, text: "Name", isHeader: true }, + { row: 0, column: 1, text: "Age", isHeader: true }, + { row: 1, column: 0, text: "Alice" }, + { row: 1, column: 1, text: "30" }, + ], + }); + + const doc = new Document("Before\n---\nName Age\nAlice 30\n---\nAfter", { + elements: [table], + }); + + const parts = partitionByRule(doc, { + pattern: "\n---\n", + includeDelimiter: false, + inferTableStructure: true, + }); + + expect(parts).toHaveLength(3); + expect(parts[0]).toBe("Before"); + expect(parts[1]).toContain("<table>"); + expect(parts[1]).toContain("<th>Name</th>"); + expect(parts[1]).toContain("<th>Age</th>"); + expect(parts[1]).toContain("<td>Alice</td>"); + expect(parts[1]).toContain("<td>30</td>"); + expect(parts[1]).toContain("</table>"); + expect(parts[2]).toBe("After"); + }); + + it("does not modify content when disabled", () => { + const table = new TableElement({ + type: "table", + text: "Name Age", + cells: [ + { row: 0, column: 0, text: "Name", isHeader: true }, + { row: 0, column: 1, text: "Age", isHeader: true }, + ], + }); + + const doc = new Document("Name Age", { elements: [table] }); + + const parts = partitionByRule(doc, { + pattern: "---", + includeDelimiter: false, + inferTableStructure: false, + }); + + expect(parts).toEqual(["Name Age"]); + }); + + it("ignores elements without cells", () => { + const table = new TableElement({ + type: "table", + text: "some table", + }); + + const doc = new Document("some table", { elements: [table] }); + + const parts = partitionByRule(doc, { + pattern: "---", + includeDelimiter: false, + inferTableStructure: true, + }); + + expect(parts).toEqual(["some table"]); + }); + + it("sorts cells by row and column", () => { + const table = new TableElement({ + type: "table", + text: "data", + cells: [ + { row: 1, column: 1, text: "D" }, + { row: 0, column: 1, text: "B", isHeader: true }, + { row: 1, column: 0, text: "C" }, + { row: 0, column: 0, text: "A", isHeader: true }, + ], + }); + + const doc = new Document("data", { elements: [table] }); + + const parts = partitionByRule(doc, { + pattern: "---", + includeDelimiter: false, + inferTableStructure: true, + }); + + expect(parts[0]).toBe( + "<table><tr><th>A</th><th>B</th></tr><tr><td>C</td><td>D</td></tr></table>", + ); + }); + }); +}); diff --git a/packages/nvisy-core/src/actions/partition-by-rule.ts b/packages/nvisy-core/src/actions/partition-by-rule.ts new file mode 100644 index 0000000..8d54b20 --- /dev/null +++ 
b/packages/nvisy-core/src/actions/partition-by-rule.ts @@ -0,0 +1,82 @@ +/** + * Rule-based partition strategy. + * + * Splits document content using a user-supplied regex pattern. + * Optionally infers HTML table structure from structured + * {@link TableElement} cells. + * + * @module + */ + +import type { Document } from "../datatypes/index.js"; +import { TableElement } from "../datatypes/index.js"; + +/** Rule-strategy parameters. */ +export interface RuleStrategyParams { + /** Regex pattern to split content on. */ + readonly pattern: string; + /** Whether to include the delimiter in chunks. */ + readonly includeDelimiter: boolean; + /** Replace table element text with inferred HTML table markup. */ + readonly inferTableStructure: boolean; +} + +/** Split document content using a regex pattern. */ +export function partitionByRule( + doc: Document, + params: RuleStrategyParams, +): string[] { + let content = doc.content; + + if (params.inferTableStructure && doc.elements != null) { + content = applyTableStructure(content, doc.elements); + } + + const regex = new RegExp(params.pattern, "g"); + return content.split(regex).filter((p) => p.length > 0); +} + +/** Replace plain-text table representations with HTML tables built from cell data. */ +function applyTableStructure( + content: string, + elements: readonly import("../datatypes/index.js").Element[], +): string { + for (const el of elements) { + if ( + !(el instanceof TableElement) || + el.cells == null || + el.cells.length === 0 + ) { + continue; + } + + const html = cellsToHtml(el.cells); + content = content.replace(el.text, html); + } + return content; +} + +/** Build an HTML `<table>` string from structured cell data. */ +function cellsToHtml( + cells: readonly import("../datatypes/index.js").TableCellData[], +): string { + const rows = new Map<number, (typeof cells)[number][]>(); + for (const cell of cells) { + let row = rows.get(cell.row); + if (row == null) { + row = []; + rows.set(cell.row, row); + } + row.push(cell); + } + + const lines: string[] = ["<table>"]; + for (const [, rowCells] of [...rows.entries()].sort(([a], [b]) => a - b)) { + rowCells.sort((a, b) => a.column - b.column); + const tag = rowCells[0]?.isHeader ? "th" : "td"; + const cellHtml = rowCells.map((c) => `<${tag}>${c.text}</${tag}>`).join(""); + lines.push(`<tr>${cellHtml}</tr>`); + } + lines.push("</table>"); + return lines.join(""); +} diff --git a/packages/nvisy-plugin-ai/src/actions/partition.ts b/packages/nvisy-core/src/actions/partition.ts similarity index 57% rename from packages/nvisy-plugin-ai/src/actions/partition.ts rename to packages/nvisy-core/src/actions/partition.ts index 6462531..aab89fe 100644 --- a/packages/nvisy-plugin-ai/src/actions/partition.ts +++ b/packages/nvisy-core/src/actions/partition.ts @@ -1,17 +1,34 @@ -import type { Metadata } from "@nvisy/core"; -import { Action, Document } from "@nvisy/core"; +/** + * Rule-based partition action that splits documents using auto + * pass-through or regex-based splitting. 
+ * + * @module + */ + import { z } from "zod"; +import type { Metadata } from "../datatypes/index.js"; +import { Document } from "../datatypes/index.js"; +import { Action } from "./action.js"; +import { partitionByAuto } from "./partition-by-auto.js"; +import { partitionByRule } from "./partition-by-rule.js"; + +export type { AutoStrategyParams } from "./partition-by-auto.js"; +export type { RuleStrategyParams } from "./partition-by-rule.js"; + +const BaseAuto = z.object({}); + +const BaseRule = z.object({ + pattern: z.string(), + includeDelimiter: z.boolean().default(false), + inferTableStructure: z.boolean().default(false), +}); -const AutoStrategy = z.object({ +const AutoStrategy = BaseAuto.extend({ strategy: z.literal("auto"), }); -const RuleStrategy = z.object({ +const RuleStrategy = BaseRule.extend({ strategy: z.literal("rule"), - /** Regex pattern to split content on. */ - pattern: z.string(), - /** Whether to include the delimiter in chunks. Defaults to false. */ - includeDelimiter: z.boolean().default(false), }); const PartitionParams = z.discriminatedUnion("strategy", [ @@ -34,29 +51,25 @@ export const partition = Action.withoutClient("partition", { async function* transformPartition( stream: AsyncIterable<Document>, params: z.infer<typeof PartitionParams>, -) { +): AsyncGenerator<Document> { for await (const item of stream) { - const text = item.content; - const sourceId = item.id; - const baseMeta = item.metadata; - let parts: string[]; - switch (params.strategy) { - case "auto": - parts = [text]; + case "auto": { + const { strategy: _, ...rest } = params; + parts = partitionByAuto(item, rest); break; + } case "rule": { - const regex = new RegExp(params.pattern, "g"); - if (params.includeDelimiter) { - parts = text.split(regex).filter((p) => p.length > 0); - } else { - parts = text.split(regex).filter((p) => p.length > 0); - } + const { strategy: _, ...rest } = params; + parts = partitionByRule(item, rest); break; } } + const sourceId = item.id; + const baseMeta = item.metadata; + for (let i = 0; i < parts.length; i++) { const metadata: Metadata = { ...(baseMeta ?? {}), diff --git a/packages/nvisy-core/src/datatypes/blob.ts b/packages/nvisy-core/src/datatypes/blob.ts index 891616c..6a57ab6 100644 --- a/packages/nvisy-core/src/datatypes/blob.ts +++ b/packages/nvisy-core/src/datatypes/blob.ts @@ -1,3 +1,9 @@ +/** + * Binary blob data type for files from object storage. 
+ * + * @module + */ + import { Data } from "./data.js"; /** diff --git a/packages/nvisy-core/src/datatypes/chunk.test.ts b/packages/nvisy-core/src/datatypes/chunk.test.ts new file mode 100644 index 0000000..e81178e --- /dev/null +++ b/packages/nvisy-core/src/datatypes/chunk.test.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from "vitest"; +import { Chunk } from "./chunk.js"; + +describe("Chunk", () => { + it("stores content", () => { + const chunk = new Chunk("Hello, world!"); + expect(chunk.content).toBe("Hello, world!"); + }); + + it("defaults chunkIndex and chunkTotal to undefined", () => { + const chunk = new Chunk("text"); + expect(chunk.chunkIndex).toBeUndefined(); + expect(chunk.chunkTotal).toBeUndefined(); + }); + + it("accepts chunkIndex and chunkTotal via options", () => { + const chunk = new Chunk("text", { chunkIndex: 2, chunkTotal: 10 }); + expect(chunk.chunkIndex).toBe(2); + expect(chunk.chunkTotal).toBe(10); + }); + + it("accepts partial options", () => { + const indexOnly = new Chunk("a", { chunkIndex: 0 }); + expect(indexOnly.chunkIndex).toBe(0); + expect(indexOnly.chunkTotal).toBeUndefined(); + + const totalOnly = new Chunk("b", { chunkTotal: 5 }); + expect(totalOnly.chunkIndex).toBeUndefined(); + expect(totalOnly.chunkTotal).toBe(5); + }); + + it("handles empty content", () => { + const chunk = new Chunk(""); + expect(chunk.content).toBe(""); + }); + + it("extends Data and has id, parentId, metadata", () => { + const chunk = new Chunk("content"); + expect(chunk.id).toMatch( + /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/, + ); + expect(chunk.parentId).toBeNull(); + expect(chunk.metadata).toBeNull(); + }); + + it("has a unique id per instance", () => { + const a = new Chunk("same"); + const b = new Chunk("same"); + expect(a.id).not.toBe(b.id); + }); + + it("supports deriveFrom for lineage", () => { + const parent = new Chunk("parent text"); + const child = new Chunk("child text").deriveFrom(parent); + expect(child.parentId).toBe(parent.id); + expect(child.isDerived).toBe(true); + }); + + it("deriveFrom copies metadata from parent", () => { + const parent = new Chunk("parent").withMetadata({ source: "pdf" }); + const child = new Chunk("child").deriveFrom(parent); + expect(child.metadata).toEqual({ source: "pdf" }); + }); + + it("supports withMetadata", () => { + const chunk = new Chunk("text").withMetadata({ + page: 3, + section: "intro", + }); + expect(chunk.metadata).toEqual({ page: 3, section: "intro" }); + }); + + it("supports withParent", () => { + const chunk = new Chunk("text").withParent("custom-parent-id"); + expect(chunk.parentId).toBe("custom-parent-id"); + }); +}); diff --git a/packages/nvisy-plugin-ai/src/datatypes/chunk.ts b/packages/nvisy-core/src/datatypes/chunk.ts similarity index 92% rename from packages/nvisy-plugin-ai/src/datatypes/chunk.ts rename to packages/nvisy-core/src/datatypes/chunk.ts index aef1b85..e671046 100644 --- a/packages/nvisy-plugin-ai/src/datatypes/chunk.ts +++ b/packages/nvisy-core/src/datatypes/chunk.ts @@ -1,4 +1,10 @@ -import { Data } from "@nvisy/core"; +/** + * Text chunk data type produced by chunking steps. + * + * @module + */ + +import { Data } from "./data.js"; /** Options for constructing a {@link Chunk}. 
*/ export interface ChunkOptions { diff --git a/packages/nvisy-core/src/datatypes/data.ts b/packages/nvisy-core/src/datatypes/data.ts index d0aaa0c..1343f8b 100644 --- a/packages/nvisy-core/src/datatypes/data.ts +++ b/packages/nvisy-core/src/datatypes/data.ts @@ -1,3 +1,9 @@ +/** + * Abstract base data class for all pipeline data types. + * + * @module + */ + import type { Metadata } from "../types.js"; /** diff --git a/packages/nvisy-core/src/datatypes/document.test.ts b/packages/nvisy-core/src/datatypes/document.test.ts index 64eabd5..111293d 100644 --- a/packages/nvisy-core/src/datatypes/document.test.ts +++ b/packages/nvisy-core/src/datatypes/document.test.ts @@ -145,6 +145,53 @@ describe("Document", () => { }); }); + describe("getElementsByPage", () => { + it("returns empty map when there are no elements", () => { + const doc = new Document("text"); + expect(doc.getElementsByPage().size).toBe(0); + }); + + it("groups elements by pageNumber", () => { + const doc = new Document("text", { + elements: [ + new Element({ type: "title", text: "Title", pageNumber: 1 }), + new Element({ type: "narrative-text", text: "p1", pageNumber: 1 }), + new Element({ type: "narrative-text", text: "p2", pageNumber: 2 }), + ], + }); + const pages = doc.getElementsByPage(); + expect(pages.size).toBe(2); + expect(pages.get(1)).toHaveLength(2); + expect(pages.get(2)).toHaveLength(1); + expect(pages.get(2)![0].text).toBe("p2"); + }); + + it("collects elements without pageNumber under key 0", () => { + const doc = new Document("text", { + elements: [ + new Element({ type: "title", text: "Title" }), + new Element({ type: "narrative-text", text: "p1", pageNumber: 1 }), + ], + }); + const pages = doc.getElementsByPage(); + expect(pages.get(0)).toHaveLength(1); + expect(pages.get(0)![0].text).toBe("Title"); + expect(pages.get(1)).toHaveLength(1); + }); + + it("preserves element order within each page", () => { + const doc = new Document("text", { + elements: [ + new Element({ type: "title", text: "A", pageNumber: 1 }), + new Element({ type: "narrative-text", text: "B", pageNumber: 2 }), + new Element({ type: "narrative-text", text: "C", pageNumber: 1 }), + ], + }); + const page1 = doc.getElementsByPage().get(1)!; + expect(page1.map((e) => e.text)).toEqual(["A", "C"]); + }); + }); + describe("Element", () => { it("auto-generates a unique id", () => { const a = new Element({ diff --git a/packages/nvisy-core/src/datatypes/document.ts b/packages/nvisy-core/src/datatypes/document.ts index 77c7e18..8fac4d1 100644 --- a/packages/nvisy-core/src/datatypes/document.ts +++ b/packages/nvisy-core/src/datatypes/document.ts @@ -1,3 +1,9 @@ +/** + * Document data type with optional structured elements. + * + * @module + */ + import type { Element } from "../documents/elements.js"; import { Data } from "./data.js"; @@ -99,6 +105,28 @@ export class Document extends Data { return this.#elements; } + /** + * Group elements by their 1-based page number. + * + * Returns a `Map` keyed by page number with each value being the + * ordered array of elements on that page. Elements without a + * `pageNumber` are collected under key `0`. + */ + getElementsByPage(): Map<number, Element[]> { + const map = new Map<number, Element[]>(); + if (this.#elements == null) return map; + for (const el of this.#elements) { + const page = el.pageNumber ?? 
0; + let bucket = map.get(page); + if (bucket == null) { + bucket = []; + map.set(page, bucket); + } + bucket.push(el); + } + return map; + } + /** * Create a Document by deriving `content` from the element texts. * diff --git a/packages/nvisy-core/src/datatypes/embedding.ts b/packages/nvisy-core/src/datatypes/embedding.ts index 007b83e..7de32c4 100644 --- a/packages/nvisy-core/src/datatypes/embedding.ts +++ b/packages/nvisy-core/src/datatypes/embedding.ts @@ -1,3 +1,9 @@ +/** + * Dense vector embedding data type for similarity search. + * + * @module + */ + import { Data } from "./data.js"; /** diff --git a/packages/nvisy-core/src/datatypes/index.ts b/packages/nvisy-core/src/datatypes/index.ts index 10bde7d..7d24ef9 100644 --- a/packages/nvisy-core/src/datatypes/index.ts +++ b/packages/nvisy-core/src/datatypes/index.ts @@ -6,6 +6,8 @@ export type { JsonValue, Metadata } from "../types.js"; export { Blob } from "./blob.js"; +export type { ChunkOptions } from "./chunk.js"; +export { Chunk } from "./chunk.js"; export { Data } from "./data.js"; export type { CompositeElementOptions, @@ -57,14 +59,15 @@ export const Datatypes = { } as const; import { Blob } from "./blob.js"; +import { Chunk } from "./chunk.js"; import { Document } from "./document.js"; import { Embedding } from "./embedding.js"; /** Pre-defined Document datatype entry. */ -export const documentDatatype = Datatypes.define("document", Document); - +export const document = Datatypes.define("document", Document); +/** Pre-defined Chunk datatype entry. */ +export const chunk = Datatypes.define("chunk", Chunk); /** Pre-defined Blob datatype entry. */ -export const blobDatatype = Datatypes.define("blob", Blob); - +export const blob = Datatypes.define("blob", Blob); /** Pre-defined Embedding datatype entry. */ -export const embeddingDatatype = Datatypes.define("embedding", Embedding); +export const embedding = Datatypes.define("embedding", Embedding); diff --git a/packages/nvisy-core/src/errors/cancellation.ts b/packages/nvisy-core/src/errors/cancellation.ts index 3730d25..9fd3c72 100644 --- a/packages/nvisy-core/src/errors/cancellation.ts +++ b/packages/nvisy-core/src/errors/cancellation.ts @@ -1,3 +1,9 @@ +/** + * Cancellation error for intentionally aborted operations. + * + * @module + */ + import { RuntimeError, type RuntimeErrorOptions } from "./runtime.js"; /** diff --git a/packages/nvisy-core/src/errors/connection.ts b/packages/nvisy-core/src/errors/connection.ts index 881d102..2c449e5 100644 --- a/packages/nvisy-core/src/errors/connection.ts +++ b/packages/nvisy-core/src/errors/connection.ts @@ -1,3 +1,9 @@ +/** + * Connection error for unreachable external services. + * + * @module + */ + import { type ErrorContext, RuntimeError } from "./runtime.js"; /** diff --git a/packages/nvisy-core/src/errors/runtime.ts b/packages/nvisy-core/src/errors/runtime.ts index bff2788..2b7cea8 100644 --- a/packages/nvisy-core/src/errors/runtime.ts +++ b/packages/nvisy-core/src/errors/runtime.ts @@ -1,3 +1,9 @@ +/** + * Base error class and shared error interfaces for the Nvisy runtime. + * + * @module + */ + /** Structured context attached to runtime errors. */ export interface ErrorContext { /** Which component or subsystem produced the error. 
*/ diff --git a/packages/nvisy-core/src/errors/timeout.ts b/packages/nvisy-core/src/errors/timeout.ts index f6243ba..33f0012 100644 --- a/packages/nvisy-core/src/errors/timeout.ts +++ b/packages/nvisy-core/src/errors/timeout.ts @@ -1,3 +1,9 @@ +/** + * Timeout error for operations that exceed their time limit. + * + * @module + */ + import type { RuntimeErrorOptions } from "./runtime.js"; import { RuntimeError } from "./runtime.js"; diff --git a/packages/nvisy-core/src/errors/validation.ts b/packages/nvisy-core/src/errors/validation.ts index 44e693a..9bd3446 100644 --- a/packages/nvisy-core/src/errors/validation.ts +++ b/packages/nvisy-core/src/errors/validation.ts @@ -1,3 +1,9 @@ +/** + * Validation error for input that fails schema or business rules. + * + * @module + */ + import { RuntimeError, type RuntimeErrorOptions } from "./runtime.js"; /** diff --git a/packages/nvisy-core/src/index.ts b/packages/nvisy-core/src/index.ts index 2c7d292..f4f2adb 100644 --- a/packages/nvisy-core/src/index.ts +++ b/packages/nvisy-core/src/index.ts @@ -4,9 +4,10 @@ * Public API surface for the nvisy core library. */ -export type { ActionInstance } from "./action.js"; -export { Action } from "./action.js"; +export type { ActionInstance } from "./actions/index.js"; +export { Action, chunkSimple, partition } from "./actions/index.js"; export type { + ChunkOptions, CompositeElementOptions, Datatype, DocumentOptions, @@ -23,16 +24,14 @@ export type { } from "./datatypes/index.js"; export { Blob, - blobDatatype, + Chunk, CompositeElement, Data, Datatypes, Document, - documentDatatype, Element, EmailElement, Embedding, - embeddingDatatype, FormElement, ImageElement, TableElement, @@ -92,16 +91,11 @@ export type { export { Stream } from "./stream.js"; export type { ClassRef, JsonValue, Metadata } from "./types.js"; -import { - blobDatatype, - documentDatatype, - embeddingDatatype, -} from "./datatypes/index.js"; +import { chunkSimple, partition } from "./actions/index.js"; +import { blob, chunk, document, embedding } from "./datatypes/index.js"; import { Plugin } from "./plugin.js"; -/** Built-in core plugin that registers the Document, Blob, and Embedding datatypes. */ -export const corePlugin = Plugin.define("core").withDatatypes( - documentDatatype, - blobDatatype, - embeddingDatatype, -); +/** Built-in core plugin that registers the Document, Blob, Chunk, and Embedding datatypes. */ +export const corePlugin = Plugin.define("core") + .withDatatypes(document, blob, chunk, embedding) + .withActions(chunkSimple, partition); diff --git a/packages/nvisy-core/src/loader.ts b/packages/nvisy-core/src/loader.ts index f80118f..330729d 100644 --- a/packages/nvisy-core/src/loader.ts +++ b/packages/nvisy-core/src/loader.ts @@ -1,3 +1,13 @@ +/** + * Loaders that transform {@link Blob | Blobs} into {@link Document | Documents}. + * + * Each loader declares the file extensions and MIME types it handles, + * so the engine can automatically select the right loader for a given + * blob. Use {@link Loader.define} to create new loaders. 
+ * + * @module + */ + import type { z } from "zod"; import type { Blob } from "./datatypes/blob.js"; import type { Document } from "./datatypes/document.js"; diff --git a/packages/nvisy-core/src/plugin.ts b/packages/nvisy-core/src/plugin.ts index e85cacb..1fc7513 100644 --- a/packages/nvisy-core/src/plugin.ts +++ b/packages/nvisy-core/src/plugin.ts @@ -1,4 +1,15 @@ -import type { ActionInstance } from "./action.js"; +/** + * Plugin system for bundling providers, streams, actions, loaders, + * and custom datatypes under a single namespace. + * + * Plugins are the unit of registration with the engine. Use + * {@link Plugin.define} to create a new plugin, then chain + * `.withProviders()`, `.withActions()`, etc. to populate it. + * + * @module + */ + +import type { ActionInstance } from "./actions/action.js"; import type { Datatype } from "./datatypes/index.js"; import type { LoaderInstance } from "./loader.js"; import type { ProviderFactory } from "./provider.js"; diff --git a/packages/nvisy-core/src/provider.ts b/packages/nvisy-core/src/provider.ts index e76e5e2..d336e80 100644 --- a/packages/nvisy-core/src/provider.ts +++ b/packages/nvisy-core/src/provider.ts @@ -1,3 +1,14 @@ +/** + * Provider lifecycle management for external service connections. + * + * Providers abstract credentials, connection setup, and teardown + * for databases, APIs, and other external systems. Use + * {@link Provider.withAuthentication} or + * {@link Provider.withoutAuthentication} to define new providers. + * + * @module + */ + import { getLogger } from "@logtape/logtape"; import { z } from "zod"; import { ConnectionError } from "./errors/index.js"; diff --git a/packages/nvisy-core/src/stream.ts b/packages/nvisy-core/src/stream.ts index 6aacecb..a4495ff 100644 --- a/packages/nvisy-core/src/stream.ts +++ b/packages/nvisy-core/src/stream.ts @@ -1,3 +1,14 @@ +/** + * Stream sources and targets for reading from and writing to external systems. + * + * Sources are pipeline entry points that emit {@link Resumable} items + * for crash recovery. Targets are pipeline exit points that persist + * processed data. Use {@link Stream.createSource} and + * {@link Stream.createTarget} to define new endpoints. + * + * @module + */ + import type { z } from "zod"; import type { Data } from "./datatypes/index.js"; import type { ClassRef } from "./types.js"; diff --git a/packages/nvisy-core/src/types.ts b/packages/nvisy-core/src/types.ts index 49e83ff..139ec9d 100644 --- a/packages/nvisy-core/src/types.ts +++ b/packages/nvisy-core/src/types.ts @@ -1,3 +1,9 @@ +/** + * Shared type aliases used across the core library. + * + * @module + */ + /** * A JSON-compatible value. 
* diff --git a/packages/nvisy-core/test/action.fixtures.ts b/packages/nvisy-core/test/action.fixtures.ts index 241ef6c..3e3e0d0 100644 --- a/packages/nvisy-core/test/action.fixtures.ts +++ b/packages/nvisy-core/test/action.fixtures.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { Action } from "../src/action.js"; +import { Action } from "../src/actions/action.js"; import type { JsonValue } from "../src/datatypes/data.js"; import { Data } from "../src/datatypes/data.js"; diff --git a/packages/nvisy-core/test/action.test.ts b/packages/nvisy-core/test/action.test.ts index 5b7d433..01791ae 100644 --- a/packages/nvisy-core/test/action.test.ts +++ b/packages/nvisy-core/test/action.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { ActionInstance } from "../src/action.js"; +import type { ActionInstance } from "../src/actions/action.js"; import type { Data } from "../src/datatypes/data.js"; import { ExampleFilter, ExampleMap, TestRow } from "./action.fixtures.js"; diff --git a/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts b/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts index 41f6849..1209890 100644 --- a/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts +++ b/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts @@ -1,6 +1,5 @@ -import { Action, Document } from "@nvisy/core"; +import { Action, Chunk, Document } from "@nvisy/core"; import { z } from "zod"; -import { Chunk } from "../datatypes/index.js"; import { AICompletionClient } from "../providers/client.js"; const ChunkContextualParams = z.object({ diff --git a/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts b/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts index 297662d..2b1940b 100644 --- a/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts +++ b/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts @@ -1,6 +1,5 @@ -import { Action, Document } from "@nvisy/core"; +import { Action, Chunk, Document } from "@nvisy/core"; import { z } from "zod"; -import { Chunk } from "../datatypes/index.js"; import { EmbeddingClient } from "../providers/client.js"; const ChunkSimilarityParams = z.object({ diff --git a/packages/nvisy-plugin-ai/src/actions/chunk.ts b/packages/nvisy-plugin-ai/src/actions/chunk.ts deleted file mode 100644 index 2ba5440..0000000 --- a/packages/nvisy-plugin-ai/src/actions/chunk.ts +++ /dev/null @@ -1,183 +0,0 @@ -import type { Element } from "@nvisy/core"; -import { Action, Document } from "@nvisy/core"; -import { z } from "zod"; -import { Chunk } from "../datatypes/index.js"; - -const CharacterStrategy = z.object({ - strategy: z.literal("character"), - /** Maximum chunk size in characters. */ - size: z.number(), - /** Number of overlapping characters between chunks. */ - overlap: z.number().default(0), -}); - -const SectionStrategy = z.object({ - strategy: z.literal("section"), - /** Heading level to split on (1-6). Defaults to 2. */ - level: z.number().min(1).max(6).default(2), -}); - -const PageStrategy = z.object({ - strategy: z.literal("page"), -}); - -const ChunkParams = z.discriminatedUnion("strategy", [ - CharacterStrategy, - SectionStrategy, - PageStrategy, -]); - -/** - * Split documents into smaller chunks using various strategies. 
- * - * - `"character"`: fixed-size character splitting with optional overlap - * - `"section"`: split on markdown headings at a given level - * - `"page"`: split on page boundary markers in content - */ -export const chunk = Action.withoutClient("chunk", { - types: [Document, Chunk], - params: ChunkParams, - transform: transformChunk, -}); - -async function* transformChunk( - stream: AsyncIterable<Document>, - params: z.infer<typeof ChunkParams>, -) { - for await (const doc of stream) { - switch (params.strategy) { - case "page": { - if (doc.elements != null && doc.elements.length > 0) { - const groups = groupByPage(doc.elements); - const pages = [...groups.entries()].sort(([a], [b]) => a - b); - for (let i = 0; i < pages.length; i++) { - const [, els] = pages[i]!; - yield new Chunk(Document.fromElements(els).content, { - chunkIndex: i, - chunkTotal: pages.length, - }).deriveFrom(doc); - } - continue; - } - break; - } - case "section": { - if (doc.elements != null && doc.elements.length > 0) { - const sections = splitByHeadingLevel(doc.elements, params.level); - for (let i = 0; i < sections.length; i++) { - const els = sections[i]!; - yield new Chunk(Document.fromElements(els).content, { - chunkIndex: i, - chunkTotal: sections.length, - }).deriveFrom(doc); - } - continue; - } - break; - } - } - - // Fallback: string-based chunking - let texts: string[]; - switch (params.strategy) { - case "character": - texts = chunkByCharacter(doc.content, params.size, params.overlap); - break; - case "section": - texts = chunkBySection(doc.content, params.level); - break; - case "page": - texts = chunkByPage(doc.content); - break; - } - - for (let i = 0; i < texts.length; i++) { - yield new Chunk(texts[i]!, { - chunkIndex: i, - chunkTotal: texts.length, - }).deriveFrom(doc); - } - } -} - -/** Group elements by their `pageNumber`, preserving document order. */ -function groupByPage(elements: readonly Element[]): Map<number, Element[]> { - const groups = new Map<number, Element[]>(); - for (const el of elements) { - const page = el.pageNumber ?? 1; - let group = groups.get(page); - if (group == null) { - group = []; - groups.set(page, group); - } - group.push(el); - } - return groups; -} - -/** Split elements into sections at headings of the given level. */ -function splitByHeadingLevel( - elements: readonly Element[], - level: number, -): Element[][] { - const sections: Element[][] = []; - let current: Element[] = []; - - for (const el of elements) { - if (el.type === "title" && el.level != null && el.level <= level) { - if (current.length > 0) { - sections.push(current); - } - current = [el]; - } else { - current.push(el); - } - } - if (current.length > 0) { - sections.push(current); - } - return sections; -} - -function chunkByCharacter( - text: string, - size: number, - overlap: number, -): string[] { - const chunks: string[] = []; - let start = 0; - while (start < text.length) { - chunks.push(text.slice(start, start + size)); - start += size - overlap; - if (size - overlap <= 0) break; - } - return chunks; -} - -function chunkBySection(text: string, level: number): string[] { - const prefix = "#".repeat(level); - const pattern = new RegExp(`^${prefix}\\s`, "m"); - const parts = text.split(pattern); - - const chunks: string[] = []; - for (let i = 0; i < parts.length; i++) { - const part = parts[i]!.trim(); - if (part.length === 0) continue; - // Re-add the heading prefix for sections after the first - chunks.push(i > 0 ? `${prefix} ${part}` : part); - } - return chunks.length > 0 ? 
chunks : [text]; -} - -function chunkByPage(text: string): string[] { - // Split on common page break markers - const pages = text.split(/\f|\n---\n|\n\*\*\*\n/); - const chunks: string[] = []; - for (const page of pages) { - const trimmed = page.trim(); - if (trimmed.length > 0) { - chunks.push(trimmed); - } - } - return chunks.length > 0 ? chunks : [text]; -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts new file mode 100644 index 0000000..523147c --- /dev/null +++ b/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts @@ -0,0 +1,35 @@ +/** + * Content description enrichment strategy. + * + * Generates AI descriptions for image or table content. + * + * @module + */ + +import type { Metadata } from "@nvisy/core"; +import type { AICompletionClient } from "../providers/client.js"; +import { parseJsonResponse } from "../providers/client.js"; + +/** Description enrichment parameters. */ +export interface DescriptionEnrichParams { + /** The kind of content to describe. */ + readonly contentKind: "image" | "table"; +} + +/** Describe content using AI. */ +export async function enrichByDescription( + text: string, + params: DescriptionEnrichParams, + client: AICompletionClient, +): Promise<Metadata> { + const result = await client.complete({ + messages: [ + { + role: "system", + content: `Describe the following ${params.contentKind} content in detail. Return ONLY a JSON object with a "description" field containing your description.`, + }, + { role: "user", content: text }, + ], + }); + return parseJsonResponse(result.content, "description"); +} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts new file mode 100644 index 0000000..d0a4210 --- /dev/null +++ b/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts @@ -0,0 +1,35 @@ +/** + * Metadata extraction enrichment strategy. + * + * Extracts structured fields from document content using an AI model. + * + * @module + */ + +import type { Metadata } from "@nvisy/core"; +import type { AICompletionClient } from "../providers/client.js"; +import { parseJsonResponse } from "../providers/client.js"; + +/** Metadata enrichment parameters. */ +export interface MetadataEnrichParams { + /** Field names to extract from the document. */ + readonly fields: string[]; +} + +/** Extract structured metadata fields from text using AI. */ +export async function enrichByMetadata( + text: string, + params: MetadataEnrichParams, + client: AICompletionClient, +): Promise<Metadata> { + const result = await client.complete({ + messages: [ + { + role: "system", + content: `Extract the following fields from the document: ${params.fields.join(", ")}. Return ONLY a JSON object with these fields as keys. If a field cannot be determined, set it to null.`, + }, + { role: "user", content: text }, + ], + }); + return parseJsonResponse(result.content, "extracted"); +} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts new file mode 100644 index 0000000..f7fe9d4 --- /dev/null +++ b/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts @@ -0,0 +1,39 @@ +/** + * Named entity recognition enrichment strategy. + * + * Extracts named entities from document content using an AI model. 
+ * + * @module + */ + +import type { Metadata } from "@nvisy/core"; +import type { AICompletionClient } from "../providers/client.js"; +import { parseJsonResponse } from "../providers/client.js"; + +/** NER enrichment parameters. */ +export interface NerEnrichParams { + /** Entity types to extract (e.g. ["PERSON", "ORG"]). If omitted, extract all. */ + readonly entityTypes?: string[] | undefined; +} + +/** Extract named entities from text using AI. */ +export async function enrichByNer( + text: string, + params: NerEnrichParams, + client: AICompletionClient, +): Promise<Metadata> { + const typeClause = params.entityTypes + ? `Focus on these entity types: ${params.entityTypes.join(", ")}.` + : "Extract all entity types you can identify."; + + const result = await client.complete({ + messages: [ + { + role: "system", + content: `Perform named entity recognition on the following text. ${typeClause} Return ONLY a JSON object where keys are entity types and values are arrays of extracted entities.`, + }, + { role: "user", content: text }, + ], + }); + return { entities: parseJsonResponse(result.content, "entities") }; +} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts new file mode 100644 index 0000000..9263b1c --- /dev/null +++ b/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts @@ -0,0 +1,33 @@ +/** + * Table-to-HTML enrichment strategy. + * + * Converts table content into HTML markup using an AI model. + * + * @module + */ + +import type { Metadata } from "@nvisy/core"; +import type { AICompletionClient } from "../providers/client.js"; +import { parseJsonResponse } from "../providers/client.js"; + +/** Table-to-HTML enrichment parameters (no additional options). */ +export type TableHtmlEnrichParams = Record<string, never>; + +/** Convert table content to HTML using AI. */ +export async function enrichByTableHtml( + text: string, + _params: TableHtmlEnrichParams, + client: AICompletionClient, +): Promise<Metadata> { + const result = await client.complete({ + messages: [ + { + role: "system", + content: + 'Convert the following table content into clean HTML. Return ONLY a JSON object with an "html" field containing the HTML table markup.', + }, + { role: "user", content: text }, + ], + }); + return parseJsonResponse(result.content, "tableHtml"); +} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich.ts b/packages/nvisy-plugin-ai/src/actions/enrich.ts index 6f4e7ad..06b4a0b 100644 --- a/packages/nvisy-plugin-ai/src/actions/enrich.ts +++ b/packages/nvisy-plugin-ai/src/actions/enrich.ts @@ -1,29 +1,55 @@ +/** + * AI-powered enrich action that extracts metadata, entities, + * descriptions, or HTML from documents. + * + * @module + */ + import type { Metadata } from "@nvisy/core"; import { Action, Document } from "@nvisy/core"; import { z } from "zod"; import { AICompletionClient } from "../providers/client.js"; +import { enrichByDescription } from "./enrich-by-description.js"; +import { enrichByMetadata } from "./enrich-by-metadata.js"; +import { enrichByNer } from "./enrich-by-ner.js"; +import { enrichByTableHtml } from "./enrich-by-table-html.js"; -const MetadataType = z.object({ - type: z.literal("metadata"), - /** Field names to extract from the document. 
*/ +export type { DescriptionEnrichParams } from "./enrich-by-description.js"; +export type { MetadataEnrichParams } from "./enrich-by-metadata.js"; +export type { NerEnrichParams } from "./enrich-by-ner.js"; +export type { TableHtmlEnrichParams } from "./enrich-by-table-html.js"; + +const BaseMetadata = z.object({ fields: z.array(z.string()), }); -const NerType = z.object({ - type: z.literal("ner"), - /** Entity types to extract (e.g. ["PERSON", "ORG"]). If omitted, extract all. */ +const BaseNer = z.object({ entityTypes: z.array(z.string()).optional(), }); -const ImageDescriptionType = z.object({ +const BaseImageDescription = z.object({}); + +const BaseTableDescription = z.object({}); + +const BaseTableToHtml = z.object({}); + +const MetadataType = BaseMetadata.extend({ + type: z.literal("metadata"), +}); + +const NerType = BaseNer.extend({ + type: z.literal("ner"), +}); + +const ImageDescriptionType = BaseImageDescription.extend({ type: z.literal("image_description"), }); -const TableDescriptionType = z.object({ +const TableDescriptionType = BaseTableDescription.extend({ type: z.literal("table_description"), }); -const TableToHtmlType = z.object({ +const TableToHtmlType = BaseTableToHtml.extend({ type: z.literal("table_to_html"), }); @@ -54,34 +80,44 @@ async function* transformEnrich( stream: AsyncIterable<Document>, params: z.infer<typeof EnrichParams>, client: AICompletionClient, -) { +): AsyncGenerator<Document> { for await (const doc of stream) { let enrichedMeta: Metadata; switch (params.type) { - case "metadata": - enrichedMeta = await extractMetadata( + case "metadata": { + const { type: _, ...rest } = params; + enrichedMeta = await enrichByMetadata(doc.content, rest, client); + break; + } + case "ner": { + const { type: _, ...rest } = params; + enrichedMeta = await enrichByNer(doc.content, rest, client); + break; + } + case "image_description": { + const { type: _, ...rest } = params; + enrichedMeta = await enrichByDescription( doc.content, - params.fields, + { ...rest, contentKind: "image" }, client, ); break; - case "ner": - enrichedMeta = await extractEntities( + } + case "table_description": { + const { type: _, ...rest } = params; + enrichedMeta = await enrichByDescription( doc.content, - params.entityTypes, + { ...rest, contentKind: "table" }, client, ); break; - case "image_description": - enrichedMeta = await describeContent(doc.content, "image", client); - break; - case "table_description": - enrichedMeta = await describeContent(doc.content, "table", client); - break; - case "table_to_html": - enrichedMeta = await convertTableToHtml(doc.content, client); + } + case "table_to_html": { + const { type: _, ...rest } = params; + enrichedMeta = await enrichByTableHtml(doc.content, rest, client); break; + } } yield new Document(doc.content, { @@ -92,91 +128,3 @@ async function* transformEnrich( .withMetadata({ ...(doc.metadata ?? {}), ...enrichedMeta }); } } - -async function extractMetadata( - text: string, - fields: string[], - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Extract the following fields from the document: ${fields.join(", ")}. Return ONLY a JSON object with these fields as keys. 
If a field cannot be determined, set it to null.`, - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "extracted"); -} - -async function extractEntities( - text: string, - entityTypes: string[] | undefined, - client: AICompletionClient, -): Promise<Metadata> { - const typeClause = entityTypes - ? `Focus on these entity types: ${entityTypes.join(", ")}.` - : "Extract all entity types you can identify."; - - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Perform named entity recognition on the following text. ${typeClause} Return ONLY a JSON object where keys are entity types and values are arrays of extracted entities.`, - }, - { role: "user", content: text }, - ], - }); - return { entities: parseJsonResponse(result.content, "entities") }; -} - -async function describeContent( - text: string, - contentKind: "image" | "table", - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Describe the following ${contentKind} content in detail. Return ONLY a JSON object with a "description" field containing your description.`, - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "description"); -} - -async function convertTableToHtml( - text: string, - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: - 'Convert the following table content into clean HTML. Return ONLY a JSON object with an "html" field containing the HTML table markup.', - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "tableHtml"); -} - -function parseJsonResponse(content: string, fallbackKey: string): Metadata { - try { - const parsed = JSON.parse(content) as Record<string, unknown>; - if ( - typeof parsed === "object" && - parsed !== null && - !Array.isArray(parsed) - ) { - return parsed as Metadata; - } - } catch { - // If JSON parsing fails, store the raw response - } - return { [fallbackKey]: content }; -} diff --git a/packages/nvisy-plugin-ai/src/actions/embed.ts b/packages/nvisy-plugin-ai/src/actions/generate-embedding.ts similarity index 100% rename from packages/nvisy-plugin-ai/src/actions/embed.ts rename to packages/nvisy-plugin-ai/src/actions/generate-embedding.ts diff --git a/packages/nvisy-plugin-ai/src/actions/index.ts b/packages/nvisy-plugin-ai/src/actions/index.ts index bf7d74e..b9d6b05 100644 --- a/packages/nvisy-plugin-ai/src/actions/index.ts +++ b/packages/nvisy-plugin-ai/src/actions/index.ts @@ -1,7 +1,5 @@ -export { chunk } from "./chunk.js"; export { chunkContextual } from "./chunk-contextual.js"; export { chunkSimilarity } from "./chunk-similarity.js"; -export { embed } from "./embed.js"; export { enrich } from "./enrich.js"; -export { partition } from "./partition.js"; +export { embed } from "./generate-embedding.js"; export { partitionContextual } from "./partition-contextual.js"; diff --git a/packages/nvisy-plugin-ai/src/datatypes/index.ts b/packages/nvisy-plugin-ai/src/datatypes/index.ts index 17b2f57..4c83b6c 100644 --- a/packages/nvisy-plugin-ai/src/datatypes/index.ts +++ b/packages/nvisy-plugin-ai/src/datatypes/index.ts @@ -1,2 +1,2 @@ -export type { ChunkOptions } from "./chunk.js"; -export { Chunk } from "./chunk.js"; +export type { ChunkOptions } from "@nvisy/core"; +export { Chunk } from "@nvisy/core"; diff --git a/packages/nvisy-plugin-ai/src/index.ts 
b/packages/nvisy-plugin-ai/src/index.ts index e8b8a49..5ea7561 100644 --- a/packages/nvisy-plugin-ai/src/index.ts +++ b/packages/nvisy-plugin-ai/src/index.ts @@ -16,17 +16,14 @@ * ``` */ -import { Datatypes, Plugin } from "@nvisy/core"; +import { Plugin } from "@nvisy/core"; import { - chunk, chunkContextual, chunkSimilarity, embed, enrich, - partition, partitionContextual, } from "./actions/index.js"; -import { Chunk } from "./datatypes/index.js"; import { anthropicCompletion, geminiCompletion, @@ -46,15 +43,11 @@ export const aiPlugin = Plugin.define("ai") ) .withActions( embed, - chunk, chunkSimilarity, chunkContextual, - partition, partitionContextual, enrich, - ) - .withDatatypes(Datatypes.define("chunk", Chunk)); + ); -export { Embedding } from "@nvisy/core"; -export type { ChunkOptions } from "./datatypes/index.js"; -export { Chunk } from "./datatypes/index.js"; +export type { ChunkOptions } from "@nvisy/core"; +export { Chunk, Embedding } from "@nvisy/core"; diff --git a/packages/nvisy-plugin-ai/src/providers/client.ts b/packages/nvisy-plugin-ai/src/providers/client.ts index 0d92e04..9b4a986 100644 --- a/packages/nvisy-plugin-ai/src/providers/client.ts +++ b/packages/nvisy-plugin-ai/src/providers/client.ts @@ -1,5 +1,10 @@ import { getLogger } from "@logtape/logtape"; -import { ConnectionError, Provider, type ProviderFactory } from "@nvisy/core"; +import { + ConnectionError, + type Metadata, + Provider, + type ProviderFactory, +} from "@nvisy/core"; import type { EmbeddingModel, LanguageModel } from "ai"; import { embedMany, generateText } from "ai"; import type { ProviderConnection } from "./schemas.js"; @@ -154,6 +159,26 @@ export class VercelCompletionClient extends AICompletionClient { } } +/** Parse an AI response as JSON, falling back to a keyed wrapper. */ +export function parseJsonResponse( + content: string, + fallbackKey: string, +): Metadata { + try { + const parsed = JSON.parse(content) as Record<string, unknown>; + if ( + typeof parsed === "object" && + parsed !== null && + !Array.isArray(parsed) + ) { + return parsed as Metadata; + } + } catch { + // If JSON parsing fails, store the raw response + } + return { [fallbackKey]: content }; +} + /** Normalise an unknown throw into a {@link ConnectionError}. 
*/ function toConnectionError(error: unknown, source: string): ConnectionError { if (error instanceof ConnectionError) return error; From 90759a5dcdb0b7e6a54ea71cd000524349b5996d Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sun, 8 Feb 2026 02:25:06 +0100 Subject: [PATCH 04/17] refactor(core, ai): move chunk/partition actions to core, split enrich strategies, add tests - Create @nvisy/plugin-core with chunk/partition actions, plaintext/CSV/JSON loaders, and core datatype registration - Move Action factory from src/actions/action.ts back to src/action.ts in nvisy-core as a framework primitive - Remove corePlugin and action implementations from nvisy-core - Add limit parameter to SQL read stream for capped pagination - Delete @nvisy/plugin-markup (empty after moving plaintext loader to core) - Improve JSDoc across the entire engine module - Update nvisy-runtime and nvisy-server to use @nvisy/plugin-core Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- package-lock.json | 28 ++- packages/README.md | 2 +- .../nvisy-core/src/{actions => }/action.ts | 4 +- packages/nvisy-core/src/actions/index.ts | 10 - packages/nvisy-core/src/index.ts | 13 +- packages/nvisy-core/src/plugin.ts | 2 +- packages/nvisy-core/test/action.fixtures.ts | 2 +- packages/nvisy-core/test/action.test.ts | 2 +- .../package.json | 4 +- .../src/actions/chunk-by-character.test.ts | 2 +- .../src/actions/chunk-by-character.ts | 3 +- .../src/actions/chunk-by-page.test.ts | 2 +- .../src/actions/chunk-by-page.ts | 2 +- .../src/actions/chunk-by-section.test.ts | 2 +- .../src/actions/chunk-by-section.ts | 3 +- .../src/actions/chunk.ts | 3 +- .../nvisy-plugin-core/src/actions/index.ts | 8 + .../src/actions/partition-by-auto.test.ts | 2 +- .../src/actions/partition-by-auto.ts | 2 +- .../src/actions/partition-by-rule.test.ts | 2 +- .../src/actions/partition-by-rule.ts | 10 +- .../src/actions/partition.ts | 5 +- packages/nvisy-plugin-core/src/index.ts | 37 ++++ .../nvisy-plugin-core/src/loaders/csv.test.ts | 208 ++++++++++++++++++ packages/nvisy-plugin-core/src/loaders/csv.ts | 120 ++++++++++ .../nvisy-plugin-core/src/loaders/index.ts | 12 + .../src/loaders/json.test.ts | 161 ++++++++++++++ .../nvisy-plugin-core/src/loaders/json.ts | 103 +++++++++ .../src/loaders/plaintext.test.ts | 3 +- .../src/loaders/plaintext.ts | 9 + .../tsconfig.json | 2 +- .../tsup.config.ts | 0 packages/nvisy-plugin-markup/src/index.ts | 18 -- .../nvisy-plugin-markup/src/loaders/index.ts | 2 - packages/nvisy-plugin-sql/src/streams/read.ts | 8 +- .../nvisy-plugin-sql/src/streams/schemas.ts | 2 + packages/nvisy-runtime/package.json | 1 + packages/nvisy-runtime/src/engine/bridge.ts | 39 ++-- .../nvisy-runtime/src/engine/connections.ts | 33 ++- packages/nvisy-runtime/src/engine/context.ts | 15 +- packages/nvisy-runtime/src/engine/engine.ts | 115 ++++++++-- packages/nvisy-runtime/src/engine/executor.ts | 16 +- packages/nvisy-runtime/src/engine/index.ts | 10 + packages/nvisy-runtime/src/engine/nodes.ts | 29 ++- packages/nvisy-runtime/src/engine/policies.ts | 13 +- packages/nvisy-runtime/src/engine/runs.ts | 26 ++- packages/nvisy-runtime/tsconfig.json | 5 +- packages/nvisy-server/package.json | 1 - .../src/service/engine-factory.ts | 2 - 49 files changed, 952 insertions(+), 151 deletions(-) rename packages/nvisy-core/src/{actions => }/action.ts (98%) delete mode 100644 packages/nvisy-core/src/actions/index.ts rename packages/{nvisy-plugin-markup => nvisy-plugin-core}/package.json (77%) rename packages/{nvisy-core => 
nvisy-plugin-core}/src/actions/chunk-by-character.test.ts (97%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk-by-character.ts (90%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk-by-page.test.ts (97%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk-by-page.ts (97%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk-by-section.test.ts (98%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk-by-section.ts (96%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/chunk.ts (95%) create mode 100644 packages/nvisy-plugin-core/src/actions/index.ts rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/partition-by-auto.test.ts (93%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/partition-by-auto.ts (88%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/partition-by-rule.test.ts (97%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/partition-by-rule.ts (88%) rename packages/{nvisy-core => nvisy-plugin-core}/src/actions/partition.ts (93%) create mode 100644 packages/nvisy-plugin-core/src/index.ts create mode 100644 packages/nvisy-plugin-core/src/loaders/csv.test.ts create mode 100644 packages/nvisy-plugin-core/src/loaders/csv.ts create mode 100644 packages/nvisy-plugin-core/src/loaders/index.ts create mode 100644 packages/nvisy-plugin-core/src/loaders/json.test.ts create mode 100644 packages/nvisy-plugin-core/src/loaders/json.ts rename packages/{nvisy-plugin-markup => nvisy-plugin-core}/src/loaders/plaintext.test.ts (97%) rename packages/{nvisy-plugin-markup => nvisy-plugin-core}/src/loaders/plaintext.ts (86%) rename packages/{nvisy-plugin-markup => nvisy-plugin-core}/tsconfig.json (74%) rename packages/{nvisy-plugin-markup => nvisy-plugin-core}/tsup.config.ts (100%) delete mode 100644 packages/nvisy-plugin-markup/src/index.ts delete mode 100644 packages/nvisy-plugin-markup/src/loaders/index.ts diff --git a/package-lock.json b/package-lock.json index 7d1c95e..42536f5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -6,8 +6,7 @@ "": { "name": "@nvisy/monorepo", "workspaces": [ - "packages/*", - "sdks/nvisy-ts" + "packages/*" ], "devDependencies": { "@biomejs/biome": "^2.3.14", @@ -2411,8 +2410,8 @@ "resolved": "packages/nvisy-plugin-ai", "link": true }, - "node_modules/@nvisy/plugin-markup": { - "resolved": "packages/nvisy-plugin-markup", + "node_modules/@nvisy/plugin-core": { + "resolved": "packages/nvisy-plugin-core", "link": true }, "node_modules/@nvisy/plugin-nosql": { @@ -5873,10 +5872,6 @@ } } }, - "node_modules/nvisy": { - "resolved": "sdks/nvisy-ts", - "link": true - }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -7430,12 +7425,26 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-core": { + "name": "@nvisy/plugin-core", + "version": "0.1.0", + "dependencies": { + "@logtape/logtape": "^2.0.2", + "@nvisy/core": "*", + "zod": "^4.3.6" + }, + "engines": { + "node": ">=22.0.0" + } + }, "packages/nvisy-plugin-markup": { "name": "@nvisy/plugin-markup", "version": "0.1.0", + "extraneous": true, "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", + "@nvisy/plugin-core": "*", "zod": "^4.3.6" }, "engines": { @@ -7562,6 +7571,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", + "@nvisy/plugin-core": "*", "effection": "^4.0.2", "graphology": "^0.26.0", "graphology-dag": "^0.4.1", @@ -7589,7 +7599,6 @@ 
"@logtape/redaction": "^2.0.2", "@nvisy/core": "*", "@nvisy/plugin-ai": "*", - "@nvisy/plugin-markup": "*", "@nvisy/plugin-nosql": "*", "@nvisy/plugin-object": "*", "@nvisy/plugin-pandoc": "*", @@ -7609,6 +7618,7 @@ "sdks/nvisy-ts": { "name": "nvisy", "version": "0.1.0", + "extraneous": true, "devDependencies": { "@biomejs/biome": "^2.3.14", "tsup": "^8.5.1", diff --git a/packages/README.md b/packages/README.md index 9719434..f375c9f 100644 --- a/packages/README.md +++ b/packages/README.md @@ -54,6 +54,6 @@ must match the adjacent nodes in the DAG. | Package | Description | |---------|-------------| -| [`nvisy-plugin-markup`](nvisy-plugin-markup/) | HTML, XML, JSON, CSV, TSV, and plain text parsing | +| [`nvisy-plugin-core`](nvisy-plugin-core/) | Built-in chunk/partition actions, plaintext/CSV/JSON loaders, core datatype registration | | [`nvisy-plugin-tesseract`](nvisy-plugin-tesseract/) | Optical character recognition (Tesseract) | | [`nvisy-plugin-pandoc`](nvisy-plugin-pandoc/) | Document format conversion (Pandoc) | diff --git a/packages/nvisy-core/src/actions/action.ts b/packages/nvisy-core/src/action.ts similarity index 98% rename from packages/nvisy-core/src/actions/action.ts rename to packages/nvisy-core/src/action.ts index 0004ea2..1f3fdfc 100644 --- a/packages/nvisy-core/src/actions/action.ts +++ b/packages/nvisy-core/src/action.ts @@ -10,8 +10,8 @@ */ import type { z } from "zod"; -import type { Data } from "../datatypes/data.js"; -import type { ClassRef } from "../types.js"; +import type { Data } from "./datatypes/data.js"; +import type { ClassRef } from "./types.js"; /** * Stream transform that operates without a provider client. diff --git a/packages/nvisy-core/src/actions/index.ts b/packages/nvisy-core/src/actions/index.ts deleted file mode 100644 index c194294..0000000 --- a/packages/nvisy-core/src/actions/index.ts +++ /dev/null @@ -1,10 +0,0 @@ -/** - * @module actions - * - * Action definitions and built-in rule-based transforms. - */ - -export type { ActionInstance } from "./action.js"; -export { Action } from "./action.js"; -export { chunkSimple } from "./chunk.js"; -export { partition } from "./partition.js"; diff --git a/packages/nvisy-core/src/index.ts b/packages/nvisy-core/src/index.ts index f4f2adb..c6c3c0b 100644 --- a/packages/nvisy-core/src/index.ts +++ b/packages/nvisy-core/src/index.ts @@ -4,8 +4,8 @@ * Public API surface for the nvisy core library. */ -export type { ActionInstance } from "./actions/index.js"; -export { Action, chunkSimple, partition } from "./actions/index.js"; +export type { ActionInstance } from "./action.js"; +export { Action } from "./action.js"; export type { ChunkOptions, CompositeElementOptions, @@ -90,12 +90,3 @@ export type { } from "./stream.js"; export { Stream } from "./stream.js"; export type { ClassRef, JsonValue, Metadata } from "./types.js"; - -import { chunkSimple, partition } from "./actions/index.js"; -import { blob, chunk, document, embedding } from "./datatypes/index.js"; -import { Plugin } from "./plugin.js"; - -/** Built-in core plugin that registers the Document, Blob, Chunk, and Embedding datatypes. 
*/ -export const corePlugin = Plugin.define("core") - .withDatatypes(document, blob, chunk, embedding) - .withActions(chunkSimple, partition); diff --git a/packages/nvisy-core/src/plugin.ts b/packages/nvisy-core/src/plugin.ts index 1fc7513..940a14f 100644 --- a/packages/nvisy-core/src/plugin.ts +++ b/packages/nvisy-core/src/plugin.ts @@ -9,7 +9,7 @@ * @module */ -import type { ActionInstance } from "./actions/action.js"; +import type { ActionInstance } from "./action.js"; import type { Datatype } from "./datatypes/index.js"; import type { LoaderInstance } from "./loader.js"; import type { ProviderFactory } from "./provider.js"; diff --git a/packages/nvisy-core/test/action.fixtures.ts b/packages/nvisy-core/test/action.fixtures.ts index 3e3e0d0..241ef6c 100644 --- a/packages/nvisy-core/test/action.fixtures.ts +++ b/packages/nvisy-core/test/action.fixtures.ts @@ -1,5 +1,5 @@ import { z } from "zod"; -import { Action } from "../src/actions/action.js"; +import { Action } from "../src/action.js"; import type { JsonValue } from "../src/datatypes/data.js"; import { Data } from "../src/datatypes/data.js"; diff --git a/packages/nvisy-core/test/action.test.ts b/packages/nvisy-core/test/action.test.ts index 01791ae..5b7d433 100644 --- a/packages/nvisy-core/test/action.test.ts +++ b/packages/nvisy-core/test/action.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import type { ActionInstance } from "../src/actions/action.js"; +import type { ActionInstance } from "../src/action.js"; import type { Data } from "../src/datatypes/data.js"; import { ExampleFilter, ExampleMap, TestRow } from "./action.fixtures.js"; diff --git a/packages/nvisy-plugin-markup/package.json b/packages/nvisy-plugin-core/package.json similarity index 77% rename from packages/nvisy-plugin-markup/package.json rename to packages/nvisy-plugin-core/package.json index fc9caca..de9f34a 100644 --- a/packages/nvisy-plugin-markup/package.json +++ b/packages/nvisy-plugin-core/package.json @@ -1,7 +1,7 @@ { - "name": "@nvisy/plugin-markup", + "name": "@nvisy/plugin-core", "version": "0.1.0", - "description": "Markup, tabular, and text format parsing for the Nvisy platform", + "description": "Core plugin with built-in chunk and partition actions for the Nvisy platform", "type": "module", "exports": { ".": { diff --git a/packages/nvisy-core/src/actions/chunk-by-character.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts similarity index 97% rename from packages/nvisy-core/src/actions/chunk-by-character.test.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts index 7bc9749..92ae1d1 100644 --- a/packages/nvisy-core/src/actions/chunk-by-character.test.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts @@ -1,5 +1,5 @@ +import { Document } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Document } from "../datatypes/index.js"; import { chunkByCharacter } from "./chunk-by-character.js"; describe("chunkByCharacter", () => { diff --git a/packages/nvisy-core/src/actions/chunk-by-character.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-character.ts similarity index 90% rename from packages/nvisy-core/src/actions/chunk-by-character.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-character.ts index cc233f8..948a8d4 100644 --- a/packages/nvisy-core/src/actions/chunk-by-character.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-character.ts @@ -7,8 +7,7 @@ * @module */ -import type { Document } from 
"../datatypes/index.js"; -import { Chunk } from "../datatypes/index.js"; +import { Chunk, type Document } from "@nvisy/core"; /** Character-strategy parameters. */ export interface CharacterStrategyParams { diff --git a/packages/nvisy-core/src/actions/chunk-by-page.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts similarity index 97% rename from packages/nvisy-core/src/actions/chunk-by-page.test.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts index 6ff3992..0f31acb 100644 --- a/packages/nvisy-core/src/actions/chunk-by-page.test.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts @@ -1,5 +1,5 @@ +import { Document } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Document } from "../datatypes/index.js"; import { chunkByPage } from "./chunk-by-page.js"; describe("chunkByPage", () => { diff --git a/packages/nvisy-core/src/actions/chunk-by-page.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-page.ts similarity index 97% rename from packages/nvisy-core/src/actions/chunk-by-page.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-page.ts index 7be1996..c317614 100644 --- a/packages/nvisy-core/src/actions/chunk-by-page.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-page.ts @@ -9,7 +9,7 @@ * @module */ -import { Chunk, Document } from "../datatypes/index.js"; +import { Chunk, Document } from "@nvisy/core"; /** Page-strategy parameters. */ export interface PageStrategyParams { diff --git a/packages/nvisy-core/src/actions/chunk-by-section.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts similarity index 98% rename from packages/nvisy-core/src/actions/chunk-by-section.test.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts index 1456701..361b512 100644 --- a/packages/nvisy-core/src/actions/chunk-by-section.test.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts @@ -1,5 +1,5 @@ +import { Document } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Document } from "../datatypes/index.js"; import { chunkBySection } from "./chunk-by-section.js"; describe("chunkBySection", () => { diff --git a/packages/nvisy-core/src/actions/chunk-by-section.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-section.ts similarity index 96% rename from packages/nvisy-core/src/actions/chunk-by-section.ts rename to packages/nvisy-plugin-core/src/actions/chunk-by-section.ts index 7a00d23..d22eb74 100644 --- a/packages/nvisy-core/src/actions/chunk-by-section.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk-by-section.ts @@ -8,8 +8,7 @@ * @module */ -import type { Element } from "../datatypes/index.js"; -import { Chunk, Document } from "../datatypes/index.js"; +import { Chunk, Document, type Element } from "@nvisy/core"; /** Section-strategy parameters. 
*/ export interface SectionStrategyParams { diff --git a/packages/nvisy-core/src/actions/chunk.ts b/packages/nvisy-plugin-core/src/actions/chunk.ts similarity index 95% rename from packages/nvisy-core/src/actions/chunk.ts rename to packages/nvisy-plugin-core/src/actions/chunk.ts index 47fed6a..8cc8c83 100644 --- a/packages/nvisy-core/src/actions/chunk.ts +++ b/packages/nvisy-plugin-core/src/actions/chunk.ts @@ -5,9 +5,8 @@ * @module */ +import { Action, Chunk, Document } from "@nvisy/core"; import { z } from "zod"; -import { Chunk, Document } from "../datatypes/index.js"; -import { Action } from "./action.js"; import { chunkByCharacter } from "./chunk-by-character.js"; import { chunkByPage } from "./chunk-by-page.js"; import { chunkBySection } from "./chunk-by-section.js"; diff --git a/packages/nvisy-plugin-core/src/actions/index.ts b/packages/nvisy-plugin-core/src/actions/index.ts new file mode 100644 index 0000000..e1b0d0c --- /dev/null +++ b/packages/nvisy-plugin-core/src/actions/index.ts @@ -0,0 +1,8 @@ +/** + * @module actions + * + * Built-in rule-based transforms for the core plugin. + */ + +export { chunkSimple } from "./chunk.js"; +export { partition } from "./partition.js"; diff --git a/packages/nvisy-core/src/actions/partition-by-auto.test.ts b/packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts similarity index 93% rename from packages/nvisy-core/src/actions/partition-by-auto.test.ts rename to packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts index e253871..9bd3bdb 100644 --- a/packages/nvisy-core/src/actions/partition-by-auto.test.ts +++ b/packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts @@ -1,5 +1,5 @@ +import { Document } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Document } from "../datatypes/index.js"; import { partitionByAuto } from "./partition-by-auto.js"; describe("partitionByAuto", () => { diff --git a/packages/nvisy-core/src/actions/partition-by-auto.ts b/packages/nvisy-plugin-core/src/actions/partition-by-auto.ts similarity index 88% rename from packages/nvisy-core/src/actions/partition-by-auto.ts rename to packages/nvisy-plugin-core/src/actions/partition-by-auto.ts index c89feae..734bf1b 100644 --- a/packages/nvisy-core/src/actions/partition-by-auto.ts +++ b/packages/nvisy-plugin-core/src/actions/partition-by-auto.ts @@ -7,7 +7,7 @@ * @module */ -import type { Document } from "../datatypes/index.js"; +import type { Document } from "@nvisy/core"; /** Auto-strategy parameters (no additional options). 
*/ export type AutoStrategyParams = Record<string, never>; diff --git a/packages/nvisy-core/src/actions/partition-by-rule.test.ts b/packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts similarity index 97% rename from packages/nvisy-core/src/actions/partition-by-rule.test.ts rename to packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts index 3c3c974..15191b7 100644 --- a/packages/nvisy-core/src/actions/partition-by-rule.test.ts +++ b/packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts @@ -1,5 +1,5 @@ +import { Document, TableElement } from "@nvisy/core"; import { describe, expect, it } from "vitest"; -import { Document, TableElement } from "../datatypes/index.js"; import { partitionByRule } from "./partition-by-rule.js"; describe("partitionByRule", () => { diff --git a/packages/nvisy-core/src/actions/partition-by-rule.ts b/packages/nvisy-plugin-core/src/actions/partition-by-rule.ts similarity index 88% rename from packages/nvisy-core/src/actions/partition-by-rule.ts rename to packages/nvisy-plugin-core/src/actions/partition-by-rule.ts index 8d54b20..ae14ce8 100644 --- a/packages/nvisy-core/src/actions/partition-by-rule.ts +++ b/packages/nvisy-plugin-core/src/actions/partition-by-rule.ts @@ -8,8 +8,8 @@ * @module */ -import type { Document } from "../datatypes/index.js"; -import { TableElement } from "../datatypes/index.js"; +import type { Document } from "@nvisy/core"; +import { type Element, type TableCellData, TableElement } from "@nvisy/core"; /** Rule-strategy parameters. */ export interface RuleStrategyParams { @@ -39,7 +39,7 @@ export function partitionByRule( /** Replace plain-text table representations with HTML tables built from cell data. */ function applyTableStructure( content: string, - elements: readonly import("../datatypes/index.js").Element[], + elements: readonly Element[], ): string { for (const el of elements) { if ( @@ -57,9 +57,7 @@ function applyTableStructure( } /** Build an HTML `<table>` string from structured cell data. 
*/ -function cellsToHtml( - cells: readonly import("../datatypes/index.js").TableCellData[], -): string { +function cellsToHtml(cells: readonly TableCellData[]): string { const rows = new Map<number, (typeof cells)[number][]>(); for (const cell of cells) { let row = rows.get(cell.row); diff --git a/packages/nvisy-core/src/actions/partition.ts b/packages/nvisy-plugin-core/src/actions/partition.ts similarity index 93% rename from packages/nvisy-core/src/actions/partition.ts rename to packages/nvisy-plugin-core/src/actions/partition.ts index aab89fe..559e336 100644 --- a/packages/nvisy-core/src/actions/partition.ts +++ b/packages/nvisy-plugin-core/src/actions/partition.ts @@ -5,10 +5,9 @@ * @module */ +import type { Metadata } from "@nvisy/core"; +import { Action, Document } from "@nvisy/core"; import { z } from "zod"; -import type { Metadata } from "../datatypes/index.js"; -import { Document } from "../datatypes/index.js"; -import { Action } from "./action.js"; import { partitionByAuto } from "./partition-by-auto.js"; import { partitionByRule } from "./partition-by-rule.js"; diff --git a/packages/nvisy-plugin-core/src/index.ts b/packages/nvisy-plugin-core/src/index.ts new file mode 100644 index 0000000..bf6e8fb --- /dev/null +++ b/packages/nvisy-plugin-core/src/index.ts @@ -0,0 +1,37 @@ +import { + Blob, + Chunk, + Datatypes, + Document, + Embedding, + Plugin, +} from "@nvisy/core"; +import { chunkSimple, partition } from "./actions/index.js"; +import { csvLoader, jsonLoader, plaintextLoader } from "./loaders/index.js"; + +export const corePlugin = Plugin.define("core") + .withDatatypes( + Datatypes.define("document", Document), + Datatypes.define("blob", Blob), + Datatypes.define("chunk", Chunk), + Datatypes.define("embedding", Embedding), + ) + .withActions(chunkSimple, partition) + .withLoaders(plaintextLoader, csvLoader, jsonLoader); + +export type { + CharacterStrategyParams, + PageStrategyParams, + SectionStrategyParams, +} from "./actions/chunk.js"; +export { chunkSimple, partition } from "./actions/index.js"; +export type { + AutoStrategyParams, + RuleStrategyParams, +} from "./actions/partition.js"; +export type { CsvParams } from "./loaders/csv.js"; +export { csvLoader, csvParamsSchema } from "./loaders/csv.js"; +export type { JsonParams } from "./loaders/json.js"; +export { jsonLoader, jsonParamsSchema } from "./loaders/json.js"; +export type { PlaintextParams } from "./loaders/plaintext.js"; +export { plaintextLoader, plaintextParamsSchema } from "./loaders/plaintext.js"; diff --git a/packages/nvisy-plugin-core/src/loaders/csv.test.ts b/packages/nvisy-plugin-core/src/loaders/csv.test.ts new file mode 100644 index 0000000..9e6116e --- /dev/null +++ b/packages/nvisy-plugin-core/src/loaders/csv.test.ts @@ -0,0 +1,208 @@ +import { Blob, type Document } from "@nvisy/core"; +import { describe, expect, it } from "vitest"; +import { csvLoader } from "./csv.js"; + +async function collectDocs(iter: AsyncIterable<Document>) { + const docs = []; + for await (const doc of iter) { + docs.push(doc); + } + return docs; +} + +describe("csvLoader", () => { + it("has id 'csv'", () => { + expect(csvLoader.id).toBe("csv"); + }); + + it("matches .csv and .tsv extensions", () => { + expect(csvLoader.extensions).toContain(".csv"); + expect(csvLoader.extensions).toContain(".tsv"); + }); + + it("matches text/csv content type", () => { + expect(csvLoader.contentTypes).toContain("text/csv"); + }); + + it("parses CSV with headers into one document per row", async () => { + const csv = 
"name,age\nAlice,30\nBob,25"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(2); + expect(docs[0]!.content).toBe("name: Alice\nage: 30"); + expect(docs[1]!.content).toBe("name: Bob\nage: 25"); + }); + + it("stores header values as metadata", async () => { + const csv = "name,age\nAlice,30"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs[0]!.metadata).toMatchObject({ + name: "Alice", + age: "30", + rowIndex: 0, + }); + }); + + it("sets sourceType to csv", async () => { + const csv = "a,b\n1,2"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs[0]!.sourceType).toBe("csv"); + }); + + it("parses CSV without headers", async () => { + const csv = "Alice,30\nBob,25"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: false, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(2); + expect(docs[0]!.content).toBe("Alice,30"); + expect(docs[1]!.content).toBe("Bob,25"); + }); + + it("supports tab delimiter for TSV", async () => { + const tsv = "name\tage\nAlice\t30"; + const blob = new Blob("data.tsv", Buffer.from(tsv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: "\t", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("name: Alice\nage: 30"); + }); + + it("handles quoted fields with commas", async () => { + const csv = 'name,address\nAlice,"123 Main St, Apt 4"'; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs[0]!.metadata).toMatchObject({ + address: "123 Main St, Apt 4", + }); + }); + + it("handles escaped quotes in fields", async () => { + const csv = 'name,note\nAlice,"She said ""hello"""'; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs[0]!.metadata).toMatchObject({ + note: 'She said "hello"', + }); + }); + + it("derives documents from blob", async () => { + const csv = "a\n1\n2"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + for (const doc of docs) { + expect(doc.parentId).toBe(blob.id); + } + }); + + it("handles empty file", async () => { + const blob = new Blob("empty.csv", Buffer.alloc(0)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(0); + }); + + it("handles header-only file", async () => { + const csv = "name,age"; + const blob = new Blob("header.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(0); + }); + + it("handles CRLF line endings", async () => { + const 
csv = "name,age\r\nAlice,30\r\nBob,25"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const docs = await collectDocs( + csvLoader.load(blob, { + delimiter: ",", + hasHeader: true, + encoding: "utf-8", + }), + ); + + expect(docs).toHaveLength(2); + }); + + it("uses defaults for optional params", async () => { + const csv = "a,b\n1,2"; + const blob = new Blob("data.csv", Buffer.from(csv)); + const params = csvLoader.schema.parse({}); + const docs = await collectDocs(csvLoader.load(blob, params)); + + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("a: 1\nb: 2"); + }); + + it("schema rejects unknown properties", () => { + expect(() => csvLoader.schema.parse({ extra: "field" })).toThrow(); + }); +}); diff --git a/packages/nvisy-plugin-core/src/loaders/csv.ts b/packages/nvisy-plugin-core/src/loaders/csv.ts new file mode 100644 index 0000000..6d924c2 --- /dev/null +++ b/packages/nvisy-plugin-core/src/loaders/csv.ts @@ -0,0 +1,120 @@ +/** + * CSV loader. + * + * Converts `.csv` and `.tsv` blobs into Documents. Each row becomes + * a separate Document whose content is built from the cell values. + * When a header row is present, cell values are formatted as + * `"column: value"` lines; otherwise raw comma-separated values are + * used as content. + * + * @module + */ + +import type { Metadata } from "@nvisy/core"; +import { Document, Loader } from "@nvisy/core"; +import { z } from "zod"; + +/** Schema for CSV loader parameters. */ +export const csvParamsSchema = z + .object({ + /** Column delimiter. Defaults to `","`. */ + delimiter: z.string().optional().default(","), + /** Whether the first row contains column headers. Defaults to `true`. */ + hasHeader: z.boolean().optional().default(true), + /** Character encoding of the blob data. Defaults to `"utf-8"`. */ + encoding: z + .enum(["utf-8", "ascii", "latin1", "utf16le"]) + .optional() + .default("utf-8"), + }) + .strict(); + +export type CsvParams = z.infer<typeof csvParamsSchema>; + +/** + * Loader that converts CSV/TSV blobs into one Document per row. + * + * Header columns are stored as metadata on each Document. + */ +export const csvLoader = Loader.define<CsvParams>("csv", { + extensions: [".csv", ".tsv"], + contentTypes: ["text/csv", "text/tab-separated-values"], + params: csvParamsSchema, + async *load(blob, params) { + const text = blob.data.toString(params.encoding); + const lines = parseLines(text); + if (lines.length === 0) return; + + let headers: string[] | null = null; + let startIndex = 0; + + if (params.hasHeader && lines.length > 0) { + headers = splitRow(lines[0]!, params.delimiter); + startIndex = 1; + } + + for (let i = startIndex; i < lines.length; i++) { + const cells = splitRow(lines[i]!, params.delimiter); + const content = headers + ? headers.map((h, j) => `${h}: ${cells[j] ?? ""}`).join("\n") + : cells.join(params.delimiter); + + const metadata: Metadata = { + rowIndex: i - startIndex, + ...(headers + ? Object.fromEntries(headers.map((h, j) => [h, cells[j] ?? ""])) + : {}), + }; + + const doc = new Document(content, { sourceType: "csv" }) + .deriveFrom(blob) + .withMetadata(metadata); + yield doc; + } + }, +}); + +/** Split text into non-empty lines, handling \r\n and \n. */ +function parseLines(text: string): string[] { + return text.split(/\r?\n/).filter((line) => line.length > 0); +} + +/** Split a single CSV row on the delimiter, respecting double-quoted fields. 
*/ +function splitRow(line: string, delimiter: string): string[] { + const fields: string[] = []; + let current = ""; + let inQuotes = false; + let i = 0; + + while (i < line.length) { + const char = line[i]!; + + if (inQuotes) { + if (char === '"') { + if (i + 1 < line.length && line[i + 1] === '"') { + current += '"'; + i += 2; + } else { + inQuotes = false; + i++; + } + } else { + current += char; + i++; + } + } else if (char === '"') { + inQuotes = true; + i++; + } else if (line.startsWith(delimiter, i)) { + fields.push(current); + current = ""; + i += delimiter.length; + } else { + current += char; + i++; + } + } + + fields.push(current); + return fields; +} diff --git a/packages/nvisy-plugin-core/src/loaders/index.ts b/packages/nvisy-plugin-core/src/loaders/index.ts new file mode 100644 index 0000000..f317cc3 --- /dev/null +++ b/packages/nvisy-plugin-core/src/loaders/index.ts @@ -0,0 +1,12 @@ +/** + * @module loaders + * + * Built-in loaders for the core plugin. + */ + +export type { CsvParams } from "./csv.js"; +export { csvLoader, csvParamsSchema } from "./csv.js"; +export type { JsonParams } from "./json.js"; +export { jsonLoader, jsonParamsSchema } from "./json.js"; +export type { PlaintextParams } from "./plaintext.js"; +export { plaintextLoader, plaintextParamsSchema } from "./plaintext.js"; diff --git a/packages/nvisy-plugin-core/src/loaders/json.test.ts b/packages/nvisy-plugin-core/src/loaders/json.test.ts new file mode 100644 index 0000000..c9607e0 --- /dev/null +++ b/packages/nvisy-plugin-core/src/loaders/json.test.ts @@ -0,0 +1,161 @@ +import { Blob, type Document } from "@nvisy/core"; +import { describe, expect, it } from "vitest"; +import { jsonLoader } from "./json.js"; + +async function collectDocs(iter: AsyncIterable<Document>) { + const docs = []; + for await (const doc of iter) { + docs.push(doc); + } + return docs; +} + +describe("jsonLoader", () => { + it("has id 'json'", () => { + expect(jsonLoader.id).toBe("json"); + }); + + it("matches .json, .jsonl, and .ndjson extensions", () => { + expect(jsonLoader.extensions).toContain(".json"); + expect(jsonLoader.extensions).toContain(".jsonl"); + expect(jsonLoader.extensions).toContain(".ndjson"); + }); + + it("matches application/json content type", () => { + expect(jsonLoader.contentTypes).toContain("application/json"); + }); + + describe("JSON files", () => { + it("creates one document from a JSON object", async () => { + const json = JSON.stringify({ name: "Alice", age: 30 }); + const blob = new Blob("data.json", Buffer.from(json)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(1); + expect(docs[0]!.sourceType).toBe("json"); + }); + + it("promotes scalar fields to metadata", async () => { + const json = JSON.stringify({ name: "Alice", age: 30, active: true }); + const blob = new Blob("data.json", Buffer.from(json)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs[0]!.metadata).toMatchObject({ + name: "Alice", + age: 30, + active: true, + }); + }); + + it("explodes JSON arrays into one document per element", async () => { + const json = JSON.stringify([ + { id: 1, text: "first" }, + { id: 2, text: "second" }, + ]); + const blob = new Blob("data.json", Buffer.from(json)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(2); + expect(docs[0]!.metadata).toMatchObject({ id: 1, arrayIndex: 0 }); + 
expect(docs[1]!.metadata).toMatchObject({ id: 2, arrayIndex: 1 }); + }); + + it("handles string JSON values", async () => { + const json = JSON.stringify("just a string"); + const blob = new Blob("data.json", Buffer.from(json)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("just a string"); + }); + + it("pretty-prints object content", async () => { + const obj = { key: "value" }; + const blob = new Blob("data.json", Buffer.from(JSON.stringify(obj))); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs[0]!.content).toBe(JSON.stringify(obj, null, 2)); + }); + + it("derives documents from blob", async () => { + const json = JSON.stringify([{ a: 1 }, { b: 2 }]); + const blob = new Blob("data.json", Buffer.from(json)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + for (const doc of docs) { + expect(doc.parentId).toBe(blob.id); + } + }); + }); + + describe("JSONL files", () => { + it("creates one document per line", async () => { + const jsonl = '{"id":1}\n{"id":2}\n{"id":3}'; + const blob = new Blob("data.jsonl", Buffer.from(jsonl)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(3); + expect(docs[0]!.metadata).toMatchObject({ id: 1, lineIndex: 0 }); + expect(docs[2]!.metadata).toMatchObject({ id: 3, lineIndex: 2 }); + }); + + it("skips empty lines", async () => { + const jsonl = '{"a":1}\n\n{"b":2}\n'; + const blob = new Blob("data.jsonl", Buffer.from(jsonl)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(2); + }); + + it("handles .ndjson extension", async () => { + const ndjson = '{"x":1}\n{"x":2}'; + const blob = new Blob("data.ndjson", Buffer.from(ndjson)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + expect(docs).toHaveLength(2); + }); + + it("derives documents from blob", async () => { + const jsonl = '{"a":1}\n{"b":2}'; + const blob = new Blob("data.jsonl", Buffer.from(jsonl)); + const docs = await collectDocs( + jsonLoader.load(blob, { encoding: "utf-8" }), + ); + + for (const doc of docs) { + expect(doc.parentId).toBe(blob.id); + } + }); + }); + + it("uses defaults for optional params", async () => { + const json = JSON.stringify({ hello: "world" }); + const blob = new Blob("data.json", Buffer.from(json)); + const params = jsonLoader.schema.parse({}); + const docs = await collectDocs(jsonLoader.load(blob, params)); + + expect(docs).toHaveLength(1); + }); + + it("schema rejects unknown properties", () => { + expect(() => jsonLoader.schema.parse({ extra: "field" })).toThrow(); + }); +}); diff --git a/packages/nvisy-plugin-core/src/loaders/json.ts b/packages/nvisy-plugin-core/src/loaders/json.ts new file mode 100644 index 0000000..5b65259 --- /dev/null +++ b/packages/nvisy-plugin-core/src/loaders/json.ts @@ -0,0 +1,103 @@ +/** + * JSON / JSON Lines loader. + * + * Converts `.json` and `.jsonl` blobs into Documents. + * + * - **`.json`** — if the root value is an array, each element becomes + * a Document; otherwise the entire file becomes a single Document. + * - **`.jsonl`** — each non-empty line is parsed as a separate JSON + * object and becomes its own Document. 
+ * + * @module + */ + +import type { Blob, Metadata } from "@nvisy/core"; +import { Document, Loader } from "@nvisy/core"; +import { z } from "zod"; + +/** Schema for JSON loader parameters. */ +export const jsonParamsSchema = z + .object({ + /** Character encoding of the blob data. Defaults to `"utf-8"`. */ + encoding: z + .enum(["utf-8", "ascii", "latin1", "utf16le"]) + .optional() + .default("utf-8"), + }) + .strict(); + +export type JsonParams = z.infer<typeof jsonParamsSchema>; + +/** + * Loader that converts JSON / JSONL blobs into Documents. + * + * Each JSON value is stringified as the Document's content, with + * scalar fields promoted to metadata when the value is an object. + */ +export const jsonLoader = Loader.define<JsonParams>("json", { + extensions: [".json", ".jsonl", ".ndjson"], + contentTypes: ["application/json", "application/x-ndjson"], + params: jsonParamsSchema, + async *load(blob, params) { + const text = blob.data.toString(params.encoding); + const isJsonLines = + blob.path.endsWith(".jsonl") || blob.path.endsWith(".ndjson"); + + if (isJsonLines) { + yield* loadJsonLines(text, blob); + } else { + yield* loadJson(text, blob); + } + }, +}); + +/** Parse a single JSON file. Arrays are exploded into one Document per element. */ +function* loadJson(text: string, blob: Blob): Generator<Document> { + const parsed: unknown = JSON.parse(text); + + if (Array.isArray(parsed)) { + for (let i = 0; i < parsed.length; i++) { + yield toDocument(parsed[i], blob, { arrayIndex: i }); + } + } else { + yield toDocument(parsed, blob, {}); + } +} + +/** Parse newline-delimited JSON (one object per line). */ +function* loadJsonLines(text: string, blob: Blob): Generator<Document> { + const lines = text.split(/\r?\n/); + let index = 0; + + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.length === 0) continue; + + const parsed: unknown = JSON.parse(trimmed); + yield toDocument(parsed, blob, { lineIndex: index }); + index++; + } +} + +/** Convert a parsed JSON value into a Document with metadata. */ +function toDocument(value: unknown, blob: Blob, baseMeta: Metadata): Document { + const content = + typeof value === "string" ? 
value : JSON.stringify(value, null, 2); + const metadata: Metadata = { ...baseMeta }; + + if (typeof value === "object" && value !== null && !Array.isArray(value)) { + for (const [k, v] of Object.entries(value)) { + if ( + typeof v === "string" || + typeof v === "number" || + typeof v === "boolean" + ) { + metadata[k] = v; + } + } + } + + return new Document(content, { sourceType: "json" }) + .deriveFrom(blob) + .withMetadata(metadata); +} diff --git a/packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts similarity index 97% rename from packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts rename to packages/nvisy-plugin-core/src/loaders/plaintext.test.ts index 7f641b9..92e8e36 100644 --- a/packages/nvisy-plugin-markup/src/loaders/plaintext.test.ts +++ b/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts @@ -1,5 +1,4 @@ -import type { Document } from "@nvisy/core"; -import { Blob } from "@nvisy/core"; +import { Blob, type Document } from "@nvisy/core"; import { describe, expect, it } from "vitest"; import { plaintextLoader } from "./plaintext.js"; diff --git a/packages/nvisy-plugin-markup/src/loaders/plaintext.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.ts similarity index 86% rename from packages/nvisy-plugin-markup/src/loaders/plaintext.ts rename to packages/nvisy-plugin-core/src/loaders/plaintext.ts index eb8cea4..a06c1fc 100644 --- a/packages/nvisy-plugin-markup/src/loaders/plaintext.ts +++ b/packages/nvisy-plugin-core/src/loaders/plaintext.ts @@ -1,3 +1,12 @@ +/** + * Plaintext loader. + * + * Converts `.txt` blobs into Documents by decoding the raw bytes + * with a configurable character encoding. + * + * @module + */ + import { Document, Loader } from "@nvisy/core"; import { z } from "zod"; diff --git a/packages/nvisy-plugin-markup/tsconfig.json b/packages/nvisy-plugin-core/tsconfig.json similarity index 74% rename from packages/nvisy-plugin-markup/tsconfig.json rename to packages/nvisy-plugin-core/tsconfig.json index c91a2dd..67241bf 100644 --- a/packages/nvisy-plugin-markup/tsconfig.json +++ b/packages/nvisy-plugin-core/tsconfig.json @@ -8,6 +8,6 @@ }, /* Scope */ "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], + "exclude": ["node_modules", "dist", "src/**/*.test.ts", "src/**/*.spec.ts"], "references": [{ "path": "../nvisy-core" }] } diff --git a/packages/nvisy-plugin-markup/tsup.config.ts b/packages/nvisy-plugin-core/tsup.config.ts similarity index 100% rename from packages/nvisy-plugin-markup/tsup.config.ts rename to packages/nvisy-plugin-core/tsup.config.ts diff --git a/packages/nvisy-plugin-markup/src/index.ts b/packages/nvisy-plugin-markup/src/index.ts deleted file mode 100644 index c02494e..0000000 --- a/packages/nvisy-plugin-markup/src/index.ts +++ /dev/null @@ -1,18 +0,0 @@ -/** - * @module @nvisy/plugin-markup - * - * Markup and text format parsing plugin for the Nvisy runtime. - * - * Provides actions for parsing and extracting structured data from - * HTML, XML, JSON, CSV, TSV, and plain text formats. - */ - -import { Plugin } from "@nvisy/core"; -import { plaintextLoader } from "./loaders/index.js"; - -export type { PlaintextParams } from "./loaders/index.js"; -export { plaintextLoader, plaintextParamsSchema } from "./loaders/index.js"; - -/** Markup parsing plugin instance. 
*/ -export const markupPlugin = - Plugin.define("markup").withLoaders(plaintextLoader); diff --git a/packages/nvisy-plugin-markup/src/loaders/index.ts b/packages/nvisy-plugin-markup/src/loaders/index.ts deleted file mode 100644 index 1688cb8..0000000 --- a/packages/nvisy-plugin-markup/src/loaders/index.ts +++ /dev/null @@ -1,2 +0,0 @@ -export type { PlaintextParams } from "./plaintext.js"; -export { plaintextLoader, plaintextParamsSchema } from "./plaintext.js"; diff --git a/packages/nvisy-plugin-sql/src/streams/read.ts b/packages/nvisy-plugin-sql/src/streams/read.ts index a3a8e32..3f93395 100644 --- a/packages/nvisy-plugin-sql/src/streams/read.ts +++ b/packages/nvisy-plugin-sql/src/streams/read.ts @@ -13,7 +13,8 @@ const logger = getLogger(["nvisy", "sql"]); * * Pages are fetched using a composite `(idColumn, tiebreaker)` cursor * for stable ordering across batches. The stream terminates when a - * batch returns fewer rows than `batchSize`. + * batch returns fewer rows than `batchSize`, or when `limit` rows + * have been yielded. */ export const read = Stream.createSource("read", KyselyClient, { types: [Row, SqlCursor, SqlParams], @@ -25,7 +26,7 @@ async function* readStream( cursor: SqlCursor, params: SqlParams, ): AsyncIterable<Resumable<Row, SqlCursor>> { - const { table, columns, idColumn, tiebreaker, batchSize } = params; + const { table, columns, idColumn, tiebreaker, batchSize, limit } = params; const { ref } = client.db.dynamic; logger.debug("Read stream opened on {table}", { @@ -33,6 +34,7 @@ async function* readStream( idColumn, tiebreaker, batchSize, + ...(limit != null ? { limit } : {}), }); let lastId = cursor.lastId; @@ -82,8 +84,10 @@ async function* readStream( data: new Row(row as Record<string, JsonValue>), context: { lastId, lastTiebreaker } as SqlCursor, }; + if (limit != null && totalRows >= limit) break; } + if (limit != null && totalRows >= limit) break; if (rows.length < batchSize) break; } diff --git a/packages/nvisy-plugin-sql/src/streams/schemas.ts b/packages/nvisy-plugin-sql/src/streams/schemas.ts index faed9df..c19f68c 100644 --- a/packages/nvisy-plugin-sql/src/streams/schemas.ts +++ b/packages/nvisy-plugin-sql/src/streams/schemas.ts @@ -16,6 +16,8 @@ export const SqlParams = z.object({ tiebreaker: z.string(), /** Maximum rows per page during keyset pagination. */ batchSize: z.number(), + /** Maximum total rows to yield. When omitted, all rows are read. */ + limit: z.number().int().positive().optional(), }); export type SqlParams = z.infer<typeof SqlParams>; diff --git a/packages/nvisy-runtime/package.json b/packages/nvisy-runtime/package.json index f45d89a..4395af2 100644 --- a/packages/nvisy-runtime/package.json +++ b/packages/nvisy-runtime/package.json @@ -22,6 +22,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", + "@nvisy/plugin-core": "*", "effection": "^4.0.2", "graphology": "^0.26.0", "graphology-dag": "^0.4.1", diff --git a/packages/nvisy-runtime/src/engine/bridge.ts b/packages/nvisy-runtime/src/engine/bridge.ts index de52ce8..fd2f22a 100644 --- a/packages/nvisy-runtime/src/engine/bridge.ts +++ b/packages/nvisy-runtime/src/engine/bridge.ts @@ -1,10 +1,15 @@ /** * Loader bridge for automatic Blob → Document conversion. * - * When a source node produces Blobs but downstream nodes expect Documents, - * this bridge automatically detects and applies the appropriate loader. - * Converted documents are cached by blob ID to avoid duplicate conversions - * when a source has multiple downstream consumers. 
+ * When a source node produces {@link Blob}s but downstream action or + * target nodes expect {@link Document}s, the bridge transparently + * selects a matching loader from the registry (by file extension and + * magic-byte content type), converts each blob, and yields the + * resulting documents. Converted documents are cached by blob ID in a + * per-run {@link LoaderCache} so the same blob is never loaded twice + * even when consumed by multiple downstream branches. + * + * @module */ import { getLogger } from "@logtape/logtape"; @@ -21,22 +26,26 @@ export function createLoaderCache(): LoaderCache { return new Map(); } -/** - * Wraps an async iterable to automatically convert Blobs to Documents. - * - * The bridge inspects each data item: - * - If it's a Blob, looks up the appropriate loader and converts it - * - If it's already a Document (or other type), passes it through unchanged - * - * Converted documents are cached by blob.id using the shared cache - * to avoid redundant conversions when the same blob is consumed - * by multiple downstream nodes. - */ +/** Options for the loader bridge. */ export interface BridgeOptions { /** When true, skip blobs with no matching loader instead of throwing. */ readonly ignoreUnsupported?: boolean; } +/** + * Wrap an async iterable to automatically convert Blobs to Documents. + * + * Non-Blob items pass through unchanged. For each Blob the registry + * is queried for a loader that matches the file's extension / content + * type. If no loader is found, behaviour depends on + * {@link BridgeOptions.ignoreUnsupported}: when true the blob is + * silently dropped; otherwise a {@link RuntimeError} is thrown. + * + * @param stream - Upstream data items (may contain a mix of Blobs and other types). + * @param registry - Used to look up loaders by extension / magic bytes. + * @param cache - Per-run cache; blobs already converted are yielded from cache. + * @param options - Optional bridge configuration. + */ export async function* applyLoaderBridge( stream: AsyncIterable<Data>, registry: Registry, diff --git a/packages/nvisy-runtime/src/engine/connections.ts b/packages/nvisy-runtime/src/engine/connections.ts index 7eb8086..45d3baf 100644 --- a/packages/nvisy-runtime/src/engine/connections.ts +++ b/packages/nvisy-runtime/src/engine/connections.ts @@ -1,8 +1,14 @@ /** * Connection validation and types. * - * Validates connection credentials against provider schemas before - * execution begins, ensuring all connections are valid upfront. + * A "connection" pairs a provider type with its credentials (and an + * optional resumption context). Before graph execution, every + * connection referenced by the plan is validated upfront against its + * provider's Zod credential schema via {@link validateConnections}, + * ensuring misconfigured credentials surface early rather than + * mid-pipeline. + * + * @module */ import type { AnyProviderFactory } from "@nvisy/core"; @@ -42,14 +48,18 @@ export type Connection = z.infer<typeof ConnectionSchema>; export type Connections = z.infer<typeof ConnectionsSchema>; /** - * A connection with validated credentials. + * A connection whose credentials have passed provider-schema validation. * - * Created during upfront validation, credentials have been parsed - * against the provider's schema and are ready for use. + * Created by {@link validateConnections} before execution starts. + * `credentials` is the Zod-parsed output (defaults applied, types + * narrowed), ready to be passed directly to `provider.connect()`. 
*/ export interface ValidatedConnection { + /** The provider factory that owns this connection's credential schema. */ readonly provider: AnyProviderFactory; + /** Parsed credentials (output of `provider.credentialSchema.parse`). */ readonly credentials: unknown; + /** Optional resumption context carried from a previous run. */ readonly context: unknown; } @@ -66,11 +76,16 @@ function hasConnection( } /** - * Validate all connections referenced by the execution plan. + * Validate every connection referenced by the execution plan. + * + * Iterates through each plan node that has an associated connection, + * resolves the connection entry from the `connections` map, and parses + * its credentials against the provider's Zod schema. Missing or + * invalid entries are collected and thrown as a single + * {@link ValidationError} so callers see all problems at once. * - * Performs upfront validation of credentials against provider schemas. - * This ensures all connections are valid before execution begins, - * avoiding partial execution failures due to credential issues. + * @returns Map of connection ID → validated connection, ready for execution. + * @throws {ValidationError} If any connection is missing or has invalid credentials. */ export function validateConnections( plan: ExecutionPlan, diff --git a/packages/nvisy-runtime/src/engine/context.ts b/packages/nvisy-runtime/src/engine/context.ts index 3520d36..361960a 100644 --- a/packages/nvisy-runtime/src/engine/context.ts +++ b/packages/nvisy-runtime/src/engine/context.ts @@ -1,8 +1,16 @@ /** * Execution context and edge graph construction. * - * Provides the runtime context that carries validated state through - * execution, and builds edge queues for data flow between nodes. + * The {@link ExecutionContext} is the single object threaded through + * every node executor during a run. It carries the compiled plan, + * validated connections, Effection edge queues, and convenience + * accessors for looking up nodes and connections by ID. + * + * {@link buildEdges} converts the plan's DAG edges into pairs of + * Effection {@link Queue}s (one per edge direction) that enable + * backpressure-aware streaming between producer and consumer nodes. + * + * @module */ import type { Data } from "@nvisy/core"; @@ -44,8 +52,11 @@ export interface ExecutionContext { readonly registry: Registry; readonly loaderCache: LoaderCache; + /** Look up a node's raw graph schema. Throws {@link ValidationError} if missing. */ getNode(nodeId: string): GraphNode; + /** Look up a node's compiler-resolved metadata. Throws {@link ValidationError} if missing. */ getResolved(nodeId: string): ResolvedNode; + /** Look up the validated connection for a provider-backed node. Throws {@link ValidationError} if the node has no connection or the connection is missing. */ getConnection(nodeId: string): ValidatedConnection; } diff --git a/packages/nvisy-runtime/src/engine/engine.ts b/packages/nvisy-runtime/src/engine/engine.ts index 19b857d..3f0b678 100644 --- a/packages/nvisy-runtime/src/engine/engine.ts +++ b/packages/nvisy-runtime/src/engine/engine.ts @@ -2,20 +2,33 @@ * Primary runtime entry point. * * Coordinates plugin registration, graph validation, and execution. - * Delegates actual graph execution to the executor module and run - * tracking to the RunManager. + * The Engine auto-loads {@link corePlugin} (Document, Blob, Chunk, + * Embedding datatypes plus chunk/partition actions) at construction. + * Additional plugins are registered via {@link Engine.register}. 
+ * + * Delegates graph execution to the {@link execute executor} and run + * tracking to the {@link RunManager}. * * @example * ```ts - * const engine = new Engine(); - * engine.register(sqlPlugin); + * const engine = new Engine() + * .register(sqlPlugin) + * .register(aiPlugin); + * + * // Background execution with run tracking * const runId = engine.execute(graph, connections); * const state = engine.getRun(runId); + * + * // Synchronous execution (blocks until completion) + * const result = await engine.executeSync(graph, connections); * ``` + * + * @module */ import type { PluginInstance } from "@nvisy/core"; -import { corePlugin, ValidationError } from "@nvisy/core"; +import { ValidationError } from "@nvisy/core"; +import { corePlugin } from "@nvisy/plugin-core"; import { compile, type ExecutionPlan } from "../compiler/index.js"; import { Registry, type RegistrySchema } from "../registry.js"; import { @@ -31,26 +44,56 @@ import { type RunSummary, } from "./runs.js"; -/** Result of graph validation. */ +/** + * Result of graph validation. + * + * Returned by {@link Engine.validate}. When `valid` is false, `errors` + * contains human-readable descriptions of every issue found (graph + * structure problems, missing connections, credential schema mismatches). + */ export interface ValidationResult { + /** Whether the graph and its connections passed all checks. */ readonly valid: boolean; + /** Validation error messages (empty when valid). */ readonly errors: ReadonlyArray<string>; } +/** + * Central orchestrator for pipeline registration, validation, and execution. + * + * The constructor pre-loads {@link corePlugin} so the built-in datatypes + * (Document, Blob, Chunk, Embedding) and actions (chunk, partition) are + * always available. Call {@link register} to add provider and action + * plugins before executing graphs. + * + * Execution modes: + * - {@link execute} — fire-and-forget; returns a `runId` for polling via + * {@link getRun}, {@link listRuns}, and {@link cancelRun}. + * - {@link executeSync} — awaitable; resolves with the full + * {@link RunResult} when the graph finishes. + */ export class Engine { readonly #registry = new Registry(); readonly #runs = new RunManager(); + /** Pre-loads {@link corePlugin} so built-in datatypes and actions are always available. */ constructor() { this.#registry.load(corePlugin); } - /** Snapshot of all registered actions and providers with their schemas. */ + /** Snapshot of every registered action, provider, stream, loader, and datatype. */ get schema(): RegistrySchema { return this.#registry.schema; } - /** Register a plugin's providers, actions, and streams. */ + /** + * Register a plugin's providers, actions, streams, loaders, and datatypes. + * + * Plugins are registered under their `id`; duplicate IDs throw a + * {@link ValidationError}. Returns `this` to allow fluent chaining. + * + * @param plugin - Plugin instance produced by `Plugin.define(…)`. + */ register(plugin: PluginInstance): this { this.#registry.load(plugin); return this; @@ -59,8 +102,14 @@ export class Engine { /** * Validate a graph definition and connections without executing. * - * Checks graph structure (parse, cycles, dangling edges, name resolution) - * and validates each connection's credentials against its provider schema. + * Performs three layers of validation: + * 1. **Connection shape** — each entry matches {@link ConnectionSchema}. + * 2. **Graph structure** — JSON parsing, cycle detection, dangling + * edges, and name resolution against the registry. + * 3. 
**Credential validation** — each connection's credentials are + * checked against the provider's Zod schema. + * + * All errors are collected; the method never throws. */ validate(graph: unknown, connections: Connections): ValidationResult { const errors: string[] = []; @@ -97,8 +146,16 @@ export class Engine { /** * Execute a graph in the background. * - * Returns immediately with a runId for tracking progress, - * retrieving results, or cancelling execution. + * Compiles and validates the graph, then hands it to the + * {@link RunManager} for asynchronous execution. Returns + * immediately with a `runId` for polling progress via + * {@link getRun} or cancelling via {@link cancelRun}. + * + * @param graph - Raw graph definition (validated and compiled internally). + * @param connections - Connection credentials keyed by UUID. + * @param options - Optional abort signal and context-update callback. + * @returns Unique run ID (UUID). + * @throws {ValidationError} If graph or connections fail validation. */ execute( graph: unknown, @@ -117,9 +174,17 @@ export class Engine { } /** - * Execute a graph synchronously. + * Execute a graph and await the result. + * + * Unlike {@link execute}, this method resolves only when the entire + * graph has finished (or an abort signal fires). Use this for + * scripting, tests, or any context where you need the result inline. * - * Blocks until execution completes. For background execution, use {@link execute}. + * @param graph - Raw graph definition (validated and compiled internally). + * @param connections - Connection credentials keyed by UUID. + * @param options - Optional abort signal and context-update callback. + * @throws {ValidationError} If graph or connections fail validation. + * @throws {CancellationError} If execution is aborted. */ async executeSync( graph: unknown, @@ -130,17 +195,33 @@ export class Engine { return execute(plan, connections, this.#registry, options); } - /** Get the current state of a run by its ID. */ + /** + * Get the current state of a run. + * + * Returns per-node progress, overall status, and (once finished) the + * final {@link RunResult}. Returns `undefined` if the run ID is unknown + * or has already been cleaned up (see {@link RunManager} TTL). + */ getRun(runId: string): RunState | undefined { return this.#runs.get(runId); } - /** List all runs, optionally filtered by status. */ + /** + * List all tracked runs, optionally filtered by status. + * + * @param status - If provided, only runs in this lifecycle phase are returned. + */ listRuns(status?: RunStatus): RunSummary[] { return this.#runs.list(status); } - /** Cancel a running execution. */ + /** + * Request cancellation of a running or pending execution. + * + * Signals the run's internal {@link AbortController}; nodes that + * have already completed are unaffected. Returns `false` if the + * run was not found or already finished. + */ cancelRun(runId: string): boolean { return this.#runs.cancel(runId); } diff --git a/packages/nvisy-runtime/src/engine/executor.ts b/packages/nvisy-runtime/src/engine/executor.ts index 4256bf3..3d9332c 100644 --- a/packages/nvisy-runtime/src/engine/executor.ts +++ b/packages/nvisy-runtime/src/engine/executor.ts @@ -30,9 +30,14 @@ import { executeNode, type NodeResult } from "./nodes.js"; const logger = getLogger(["nvisy", "executor"]); -/** Options for graph execution. */ +/** Options for controlling graph execution behaviour. 
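+ *
+ * @example
+ * ```ts
+ * // Minimal sketch: abort on an external signal and persist resumption
+ * // contexts as they arrive (`saveContext` is a hypothetical helper).
+ * const controller = new AbortController();
+ * const options: ExecuteOptions = {
+ *   signal: controller.signal,
+ *   onContextUpdate: (nodeId, connectionId, context) =>
+ *     saveContext(nodeId, connectionId, context),
+ * };
+ * ```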
*/ export interface ExecuteOptions { + /** External abort signal; when fired, Effection halts all spawned tasks. */ readonly signal?: AbortSignal; + /** + * Callback invoked after each source item is read, providing the + * resumption context for crash-recovery persistence. + */ readonly onContextUpdate?: ( nodeId: string, connectionId: string, @@ -40,7 +45,14 @@ export interface ExecuteOptions { ) => void; } -/** Result of executing a complete graph. */ +/** + * Result of executing a complete graph. + * + * `status` is derived from per-node outcomes: + * - `"success"` — every node succeeded. + * - `"partial_failure"` — at least one node failed but others succeeded. + * - `"failure"` — every node failed. + */ export interface RunResult { readonly runId: string; readonly status: "success" | "partial_failure" | "failure"; diff --git a/packages/nvisy-runtime/src/engine/index.ts b/packages/nvisy-runtime/src/engine/index.ts index e4455fb..c044c87 100644 --- a/packages/nvisy-runtime/src/engine/index.ts +++ b/packages/nvisy-runtime/src/engine/index.ts @@ -1,3 +1,13 @@ +/** + * Engine module public API. + * + * Re-exports the {@link Engine} class and all supporting types that + * consumers need for graph registration, validation, execution, + * and run monitoring. + * + * @module + */ + export type { ActionDescriptor, ProviderDescriptor, diff --git a/packages/nvisy-runtime/src/engine/nodes.ts b/packages/nvisy-runtime/src/engine/nodes.ts index 18f82c0..41eb358 100644 --- a/packages/nvisy-runtime/src/engine/nodes.ts +++ b/packages/nvisy-runtime/src/engine/nodes.ts @@ -1,10 +1,22 @@ /** * Node execution logic for source, action, and target nodes. * - * Each node type has a dedicated executor that handles: - * - Provider connection management - * - Parameter validation - * - Data streaming through edges + * Each of the three node types has a dedicated executor: + * + * - **Source** — connects to a provider, reads items via a stream + * source, and pushes each item to all outgoing edges. Emits + * resumption-context callbacks after every item for crash recovery. + * - **Action** — drains incoming edge queues (with automatic + * Blob → Document bridging), pipes them through the action's + * transform, and writes results to outgoing edges. Optionally + * connects to a provider when the action requires a client. + * - **Target** — connects to a provider and writes each incoming + * item via the stream's writer function. + * + * All executors are wrapped by {@link withRetry} and + * {@link withTimeout} policies before being spawned by the executor. + * + * @module */ import { getLogger } from "@logtape/logtape"; @@ -24,11 +36,18 @@ import { withRetry, withTimeout } from "./policies.js"; const logger = getLogger(["nvisy", "nodes"]); -/** Result of executing a single node. */ +/** + * Result of executing a single node. + * + * Collected by the executor after each node completes (or fails) + * and aggregated into the overall {@link RunResult}. + */ export interface NodeResult { readonly nodeId: string; readonly status: "success" | "failure" | "skipped"; + /** Present only when `status` is `"failure"`. */ readonly error?: Error; + /** Number of data items that flowed through this node. 
*/ readonly itemsProcessed: number; } diff --git a/packages/nvisy-runtime/src/engine/policies.ts b/packages/nvisy-runtime/src/engine/policies.ts index 6279092..fe4a0ed 100644 --- a/packages/nvisy-runtime/src/engine/policies.ts +++ b/packages/nvisy-runtime/src/engine/policies.ts @@ -1,11 +1,16 @@ /** * Execution policies for retry and timeout handling. * - * These policies wrap Effection operations to provide: - * - Retry with configurable backoff strategies - * - Timeout with fallback values + * Both policies wrap Effection {@link Operation}s and compose freely: * - * Policies are composable and respect Effection structured concurrency. + * - {@link withRetry} — retries an operation up to `maxRetries` times + * using fixed, exponential, or jittered backoff. Non-retryable + * {@link RuntimeError}s bypass retry and propagate immediately. + * - {@link withTimeout} — races an operation against an Effection + * `sleep` timer; if the timer wins, the operation is cancelled and + * a caller-supplied fallback value is returned. + * + * @module */ import { getLogger } from "@logtape/logtape"; diff --git a/packages/nvisy-runtime/src/engine/runs.ts b/packages/nvisy-runtime/src/engine/runs.ts index 4328124..05201ac 100644 --- a/packages/nvisy-runtime/src/engine/runs.ts +++ b/packages/nvisy-runtime/src/engine/runs.ts @@ -1,11 +1,14 @@ /** * Run management for background graph executions. * - * Provides: - * - Tracking of in-flight and completed runs - * - Progress monitoring at the node level - * - Cancellation support via AbortController - * - Automatic cleanup of completed runs after TTL + * A "run" is a single execution of a compiled graph. The + * {@link RunManager} tracks every run through its lifecycle + * (`pending → running → completed | failed | cancelled`), + * exposes per-node progress for monitoring, supports mid-flight + * cancellation via {@link AbortController}, and automatically + * evicts finished runs after a configurable TTL. + * + * @module */ import { getLogger } from "@logtape/logtape"; @@ -29,11 +32,18 @@ export type RunStatus = | "failed" | "cancelled"; -/** Progress of a single node within a run. */ +/** + * Progress of a single node within a run. + * + * Updated as items flow through the node; `itemsProcessed` is + * incremented each time a context-update callback fires. + */ export interface NodeProgress { readonly nodeId: string; readonly status: "pending" | "running" | "completed" | "failed"; + /** Number of data items the node has processed so far. */ readonly itemsProcessed: number; + /** Present only when `status` is `"failed"`. */ readonly error?: Error; } @@ -60,7 +70,7 @@ export interface RunSummary { readonly completedAt?: Date; } -/** Function signature for executing a plan. */ +/** Function signature for executing a compiled plan (injected into {@link SubmitConfig}). */ export type PlanExecutor = ( plan: ExecutionPlan, connections: Connections, @@ -68,7 +78,7 @@ export type PlanExecutor = ( options?: ExecuteOptions, ) => Promise<RunResult>; -/** Configuration for submitting a graph execution. */ +/** Configuration for submitting a graph execution to the {@link RunManager}. 
*/ export interface SubmitConfig { readonly runId: string; readonly plan: ExecutionPlan; diff --git a/packages/nvisy-runtime/tsconfig.json b/packages/nvisy-runtime/tsconfig.json index c91a2dd..8b06e27 100644 --- a/packages/nvisy-runtime/tsconfig.json +++ b/packages/nvisy-runtime/tsconfig.json @@ -9,5 +9,8 @@ /* Scope */ "include": ["src/**/*"], "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] + "references": [ + { "path": "../nvisy-core" }, + { "path": "../nvisy-plugin-core" } + ] } diff --git a/packages/nvisy-server/package.json b/packages/nvisy-server/package.json index eb8ed30..945067c 100644 --- a/packages/nvisy-server/package.json +++ b/packages/nvisy-server/package.json @@ -25,7 +25,6 @@ "@logtape/redaction": "^2.0.2", "@nvisy/core": "*", "@nvisy/plugin-ai": "*", - "@nvisy/plugin-markup": "*", "@nvisy/plugin-nosql": "*", "@nvisy/plugin-object": "*", "@nvisy/plugin-tesseract": "*", diff --git a/packages/nvisy-server/src/service/engine-factory.ts b/packages/nvisy-server/src/service/engine-factory.ts index 087e67a..cc9ea3c 100644 --- a/packages/nvisy-server/src/service/engine-factory.ts +++ b/packages/nvisy-server/src/service/engine-factory.ts @@ -1,6 +1,5 @@ import { getLogger } from "@logtape/logtape"; import { aiPlugin } from "@nvisy/plugin-ai"; -import { markupPlugin } from "@nvisy/plugin-markup"; import { nosqlPlugin } from "@nvisy/plugin-nosql"; import { objectPlugin } from "@nvisy/plugin-object"; import { pandocPlugin } from "@nvisy/plugin-pandoc"; @@ -19,7 +18,6 @@ export function createEngine(): Engine { try { const engine = new Engine() .register(aiPlugin) - .register(markupPlugin) .register(nosqlPlugin) .register(objectPlugin) .register(tesseractPlugin) From 0f90f4abc0cef530504c66d164d8f7a5bcaf39a8 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Mon, 9 Feb 2026 11:59:46 +0100 Subject: [PATCH 05/17] refactor(core): separate stream config types, simplify loaders and datatypes, bump pinecone - Replace SourceConfig/TargetConfig `types` tuple with separate `type`, `context`, `params` fields for consistency with Action configs - Simplify Blob and Document datatypes, update loader implementations - Streamline registry and bridge internals - Bump @pinecone-database/pinecone from 4.x to 7.x, update upsert call - Update README examples and add plugin-core README Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- package-lock.json | 29 ++-- packages/nvisy-core/README.md | 122 +++++++++++---- packages/nvisy-core/package.json | 1 + .../nvisy-core/src/datatypes/blob.test.ts | 79 ++++++++-- packages/nvisy-core/src/datatypes/blob.ts | 99 ++++++++++-- .../nvisy-core/src/datatypes/document.test.ts | 12 -- packages/nvisy-core/src/datatypes/document.ts | 44 +----- packages/nvisy-core/src/datatypes/index.ts | 49 +----- packages/nvisy-core/src/index.ts | 26 ++-- packages/nvisy-core/src/plugin.ts | 8 +- packages/nvisy-core/src/stream.ts | 26 ++-- packages/nvisy-core/test/provider.fixtures.ts | 7 +- .../nvisy-plugin-ai/src/actions/enrich.ts | 1 - .../nvisy-plugin-ai/src/datatypes/index.ts | 2 - packages/nvisy-plugin-core/README.md | 141 ++++++++++++++++++ packages/nvisy-plugin-core/package.json | 1 + packages/nvisy-plugin-core/src/index.ts | 15 +- .../nvisy-plugin-core/src/loaders/csv.test.ts | 61 ++------ packages/nvisy-plugin-core/src/loaders/csv.ts | 116 +++++--------- .../src/loaders/json.test.ts | 47 +++--- .../nvisy-plugin-core/src/loaders/json.ts | 95 +++++------- .../src/loaders/plaintext.test.ts | 1 - 
.../src/loaders/plaintext.ts | 19 ++- .../src/splitter/delimiter.test.ts | 90 +++++++++++ .../src/splitter/delimiter.ts | 31 ++++ .../nvisy-plugin-core/src/splitter/index.ts | 4 + .../src/splitter/regex.test.ts | 90 +++++++++++ .../nvisy-plugin-core/src/splitter/regex.ts | 59 ++++++++ .../nvisy-plugin-object/src/streams/read.ts | 6 +- .../nvisy-plugin-object/src/streams/write.ts | 5 +- packages/nvisy-plugin-sql/src/index.ts | 4 +- packages/nvisy-plugin-sql/src/streams/read.ts | 4 +- .../nvisy-plugin-sql/src/streams/write.ts | 3 +- packages/nvisy-plugin-vector/package.json | 2 +- .../src/providers/pinecone.ts | 6 +- .../nvisy-plugin-vector/src/streams/upsert.ts | 3 +- packages/nvisy-runtime/package.json | 2 +- packages/nvisy-runtime/src/engine/bridge.ts | 8 +- packages/nvisy-runtime/src/registry.ts | 55 +++---- packages/nvisy-runtime/test/engine.test.ts | 22 ++- packages/nvisy-runtime/test/fixtures.ts | 7 +- packages/nvisy-runtime/test/registry.test.ts | 4 +- 42 files changed, 932 insertions(+), 474 deletions(-) create mode 100644 packages/nvisy-plugin-core/README.md create mode 100644 packages/nvisy-plugin-core/src/splitter/delimiter.test.ts create mode 100644 packages/nvisy-plugin-core/src/splitter/delimiter.ts create mode 100644 packages/nvisy-plugin-core/src/splitter/index.ts create mode 100644 packages/nvisy-plugin-core/src/splitter/regex.test.ts create mode 100644 packages/nvisy-plugin-core/src/splitter/regex.ts diff --git a/package-lock.json b/package-lock.json index 42536f5..2076e00 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2474,15 +2474,6 @@ "integrity": "sha512-8awtpHXCx/bNpFt4mt2xdkgtgVvKqty8VbjHI/WWWQuEw+KLzFot3f4+LkQY9YmOtq7A5GdOnqoIC8Pdygjk2g==", "license": "MIT" }, - "node_modules/@pinecone-database/pinecone": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-4.1.0.tgz", - "integrity": "sha512-WoVsbvmCgvZfjm/nCasJXuQ/tw0es5BpedLHvRScAm6xJ/nL07s3B0TrsM8m8rACTiUgbdYsdLY1W6cEBhS9xA==", - "license": "Apache-2.0", - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@protobufjs/aspromise": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", @@ -4475,6 +4466,12 @@ "node-fetch": "^2.7.0" } }, + "node_modules/csv-parse": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz", + "integrity": "sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==", + "license": "MIT" + }, "node_modules/dayjs": { "version": "1.11.19", "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.19.tgz", @@ -7403,6 +7400,7 @@ "version": "0.1.0", "dependencies": { "@logtape/logtape": "^2.0.2", + "magic-bytes.js": "^1.13.0", "zod": "^4.3.6" }, "engines": { @@ -7431,6 +7429,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", + "csv-parse": "^6.1.0", "zod": "^4.3.6" }, "engines": { @@ -7553,7 +7552,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", - "@pinecone-database/pinecone": "^4.0.0", + "@pinecone-database/pinecone": "^7.0.0", "@qdrant/js-client-rest": "^1.13.0", "@zilliz/milvus2-sdk-node": "^2.5.0", "pg": "^8.13.0", @@ -7565,6 +7564,15 @@ "node": ">=22.0.0" } }, + "packages/nvisy-plugin-vector/node_modules/@pinecone-database/pinecone": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-7.0.0.tgz", + "integrity": 
"sha512-/+SzpIJPXhrwv27CCz+sFw/r22Sjk9s+i8nFTryPiQAcwgWmWRWFqT1/LGkTdd9NRhuEA48yaCx/HgM6ugLNJA==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.0.0" + } + }, "packages/nvisy-runtime": { "name": "@nvisy/runtime", "version": "0.1.0", @@ -7576,7 +7584,6 @@ "graphology": "^0.26.0", "graphology-dag": "^0.4.1", "graphology-types": "^0.24.8", - "magic-bytes.js": "^1.13.0", "zod": "^4.3.6" }, "engines": { diff --git a/packages/nvisy-core/README.md b/packages/nvisy-core/README.md index 29464c2..02407e7 100644 --- a/packages/nvisy-core/README.md +++ b/packages/nvisy-core/README.md @@ -6,22 +6,26 @@ Core primitives and abstractions for the Nvisy runtime platform. ## Features -- **Data types**: `Document`, `Embedding`, `Blob`, and `Entry` for pipeline data -- **Module system**: bundle providers, streams, and actions under a namespace +- **Data types**: `Document`, `Chunk`, `Embedding`, and `Blob` for pipeline data, with lineage tracking via `Data` base class +- **Document model**: structured `Element` hierarchy with typed subclasses (`ImageElement`, `TableElement`, `FormElement`, `EmailElement`, `CompositeElement`) and provenance metadata +- **Plugin system**: bundle providers, streams, actions, loaders, and custom datatypes under a namespace - **Provider abstraction**: connection lifecycle management with credential validation - **Stream contracts**: resumable sources and sinks for external systems - **Action contracts**: stream transforms with optional client dependencies -- **Error taxonomy**: `RuntimeError`, `ValidationError`, `ConnectionError`, `CancellationError` +- **Loader contracts**: `Blob` to `Document` transforms with file extension and MIME type matching +- **Error taxonomy**: `RuntimeError`, `ValidationError`, `ConnectionError`, `CancellationError`, `TimeoutError` ## Overview -This package defines the foundational abstractions that all Nvisy modules implement: +This package defines the foundational abstractions that all Nvisy plugins implement: -- **Data types** (`Data`, `Document`, `Embedding`, `Blob`, `Entry`): immutable data containers that flow through pipelines. -- **Modules** (`Module.define`): namespace for grouping providers, streams, and actions. +- **Data types** (`Data`, `Document`, `Chunk`, `Embedding`, `Blob`): immutable data containers that flow through pipelines. All extend `Data`, which provides `id`, `parentId`, `metadata`, and lineage methods (`deriveFrom`, `withParent`). +- **Elements** (`Element`, `ImageElement`, `TableElement`, etc.): structured content within documents, with typed subclasses for images, tables, forms, and emails. Includes provenance metadata, coordinate systems, and an element type ontology. +- **Plugins** (`Plugin.define`): namespace for grouping providers, streams, actions, loaders, and custom datatypes. - **Providers** (`Provider.withAuthentication`, `Provider.withoutAuthentication`): external client lifecycle management. - **Streams** (`Stream.createSource`, `Stream.createTarget`): data I/O layer for reading from and writing to external systems. - **Actions** (`Action.withClient`, `Action.withoutClient`): stream transforms that process data between sources and targets. +- **Loaders** (`Loader.define`): specialized transforms that convert `Blob` objects into `Document` instances, matched by file extension and MIME type. 
## Usage @@ -31,11 +35,13 @@ This package defines the foundational abstractions that all Nvisy modules implem import { Provider } from "@nvisy/core"; import { z } from "zod"; +const credentialSchema = z.object({ + apiKey: z.string(), + endpoint: z.string().url(), +}); + const myProvider = Provider.withAuthentication("my-provider", { - credentials: z.object({ - apiKey: z.string(), - endpoint: z.string().url(), - }), + credentials: credentialSchema, connect: async (creds) => { const client = await createClient(creds); return { @@ -49,14 +55,17 @@ const myProvider = Provider.withAuthentication("my-provider", { ### Defining a Stream Source ```ts -import { Stream, Entry } from "@nvisy/core"; +import { Stream, Document } from "@nvisy/core"; import { z } from "zod"; +const contextSchema = z.object({ cursor: z.string().optional() }); +const sourceParamSchema = z.object({ limit: z.number() }); + const mySource = Stream.createSource("my-source", MyClient, { - types: [Entry, z.object({ cursor: z.string().optional() }), z.object({ limit: z.number() })], + type: Document, context: contextSchema, params: sourceParamSchema, reader: async function* (client, ctx, params) { for await (const item of client.list({ cursor: ctx.cursor, limit: params.limit })) { - yield { data: new Entry(item), context: { cursor: item.id } }; + yield { data: new Document(item.text), context: { cursor: item.id } }; } }, }); @@ -65,13 +74,15 @@ const mySource = Stream.createSource("my-source", MyClient, { ### Defining a Stream Target ```ts -import { Stream, Entry } from "@nvisy/core"; +import { Stream, Embedding } from "@nvisy/core"; import { z } from "zod"; +const targetParamSchema = z.object({ collection: z.string() }); + const myTarget = Stream.createTarget("my-target", MyClient, { - types: [Entry, z.object({ collection: z.string() })], + type: Embedding, params: targetParamSchema, writer: (client, params) => async (item) => { - await client.insert(params.collection, item.fields); + await client.insert(params.collection, item); }, }); ``` @@ -79,34 +90,85 @@ const myTarget = Stream.createTarget("my-target", MyClient, { ### Defining an Action ```ts -import { Action, Entry } from "@nvisy/core"; +import { Action, Document, Chunk } from "@nvisy/core"; import { z } from "zod"; -const myFilter = Action.withoutClient("my-filter", { - types: [Entry], - params: z.object({ minValue: z.number() }), +const chunkerParamSchema = z.object({ maxLength: z.number() }); + +const myChunker = Action.withoutClient("my-chunker", { + types: [Document, Chunk], + params: chunkerParamSchema, transform: async function* (stream, params) { - for await (const entry of stream) { - if ((entry.get("value") as number) >= params.minValue) { - yield entry; + for await (const doc of stream) { + for (let i = 0; i < doc.content.length; i += params.maxLength) { + yield new Chunk(doc.content.slice(i, i + params.maxLength)).deriveFrom(doc); } } }, }); ``` -### Bundling into a Module +### Defining a Loader + +```ts +import { Loader, Document } from "@nvisy/core"; +import { z } from "zod"; + +const loaderParamSchema = z.object({ + encoding: z.enum(["utf-8", "ascii"]).default("utf-8"), +}); + +const myLoader = Loader.define("markdown", { + extensions: [".md", ".markdown"], + contentTypes: ["text/markdown"], + params: loaderParamSchema, + load: async function* (blob, params) { + const text = blob.data.toString(params.encoding); + yield new Document(text).deriveFrom(blob); + }, +}); +``` + +### Defining a Datatype + +Custom data types extend the `Data` base class and are 
registered with `Datatype.define`. All `Data` subclasses get a unique `id`, optional `metadata`, and lineage tracking via `deriveFrom` / `withParent`. ```ts -import { Module } from "@nvisy/core"; +import { Data, Datatype } from "@nvisy/core"; + +class Audio extends Data { + readonly #duration: number; + readonly #sampleRate: number; + + constructor(duration: number, sampleRate: number) { + super(); + this.#duration = duration; + this.#sampleRate = sampleRate; + } + + get duration(): number { + return this.#duration; + } + + get sampleRate(): number { + return this.#sampleRate; + } +} -const myModule = Module.define("my-module") +const audioDatatype = Datatype.define("audio", Audio); +``` + +### Bundling into a Plugin + +```ts +import { Plugin, Datatype, Document, Chunk } from "@nvisy/core"; + +const myPlugin = Plugin.define("my-plugin") + .withDatatypes(audioDatatype) .withProviders(myProvider) .withStreams(mySource, myTarget) - .withActions(myFilter); - -// Register with the engine -registry.load(myModule); + .withActions(myChunker) + .withLoaders(myLoader); ``` ## Changelog diff --git a/packages/nvisy-core/package.json b/packages/nvisy-core/package.json index 565b7b6..bc36220 100644 --- a/packages/nvisy-core/package.json +++ b/packages/nvisy-core/package.json @@ -21,6 +21,7 @@ }, "dependencies": { "@logtape/logtape": "^2.0.2", + "magic-bytes.js": "^1.13.0", "zod": "^4.3.6" }, "engines": { diff --git a/packages/nvisy-core/src/datatypes/blob.test.ts b/packages/nvisy-core/src/datatypes/blob.test.ts index da330ec..85ceef4 100644 --- a/packages/nvisy-core/src/datatypes/blob.test.ts +++ b/packages/nvisy-core/src/datatypes/blob.test.ts @@ -10,18 +10,16 @@ describe("Blob", () => { expect(blob.data.toString()).toBe("hello world"); }); - it("contentType is optional and defaults to undefined", () => { + it("provided.mime is undefined when no contentType given", () => { const blob = new Blob("file.bin", Buffer.from([0x00, 0x01])); - expect(blob.contentType).toBeUndefined(); + expect(blob.provided.mime).toBeUndefined(); }); - it("accepts contentType in constructor", () => { - const blob = new Blob( - "report.pdf", - Buffer.from("pdf content"), - "application/pdf", - ); - expect(blob.contentType).toBe("application/pdf"); + it("provided.mime reflects constructor contentType", () => { + const blob = new Blob("report.pdf", Buffer.from("pdf content"), { + contentType: "application/pdf", + }); + expect(blob.provided.mime).toBe("application/pdf"); }); it("size returns byte length of data", () => { @@ -66,6 +64,69 @@ describe("Blob", () => { expect(blob.metadata).toEqual({ source: "s3", bucket: "my-bucket" }); }); + describe("createdAt / updatedAt", () => { + it("defaults to undefined when not provided", () => { + const blob = new Blob("file.txt", Buffer.from("")); + expect(blob.createdAt).toBeUndefined(); + expect(blob.updatedAt).toBeUndefined(); + }); + + it("stores and returns the dates when provided", () => { + const created = new Date("2025-01-01T00:00:00Z"); + const updated = new Date("2025-06-15T12:00:00Z"); + const blob = new Blob("file.txt", Buffer.from(""), { + createdAt: created, + updatedAt: updated, + }); + expect(blob.createdAt).toBe(created); + expect(blob.updatedAt).toBe(updated); + }); + }); + + describe("provided", () => { + it("extracts extension from path", () => { + const blob = new Blob("report.pdf", Buffer.from("")); + expect(blob.provided.extension).toBe(".pdf"); + }); + + it("includes mime from contentType", () => { + const blob = new Blob("report.pdf", Buffer.from(""), { + 
contentType: "application/pdf", + }); + expect(blob.provided.mime).toBe("application/pdf"); + }); + + it("omits extension for extensionless path", () => { + const blob = new Blob("Makefile", Buffer.from("")); + expect(blob.provided.extension).toBeUndefined(); + }); + + it("lowercases the extension", () => { + const blob = new Blob("photo.JPG", Buffer.from("")); + expect(blob.provided.extension).toBe(".jpg"); + }); + + it("handles paths with multiple dots", () => { + const blob = new Blob("archive.tar.gz", Buffer.from("")); + expect(blob.provided.extension).toBe(".gz"); + }); + }); + + describe("identified", () => { + it("detects PDF from magic bytes", () => { + const pdfHeader = Buffer.from("%PDF-1.4 ..."); + const blob = new Blob("mystery.bin", pdfHeader); + expect(blob.identified.extension).toBe(".pdf"); + expect(blob.identified.mime).toBe("application/pdf"); + }); + + it("returns empty filetype for unrecognizable bytes (e.g. CSV)", () => { + const blob = new Blob("data.csv", Buffer.from("a,b\n1,2")); + expect(blob.identified.extension).toBeUndefined(); + expect(blob.identified.mime).toBeUndefined(); + }); + }); + it("handles various path formats", () => { const s3Blob = new Blob("s3://bucket/key/file.pdf", Buffer.from("")); expect(s3Blob.path).toBe("s3://bucket/key/file.pdf"); diff --git a/packages/nvisy-core/src/datatypes/blob.ts b/packages/nvisy-core/src/datatypes/blob.ts index 6a57ab6..be03d4c 100644 --- a/packages/nvisy-core/src/datatypes/blob.ts +++ b/packages/nvisy-core/src/datatypes/blob.ts @@ -4,30 +4,70 @@ * @module */ +import { filetypeinfo } from "magic-bytes.js"; import { Data } from "./data.js"; +/** Extension and MIME type pair describing a file type. */ +export interface Filetype { + /** File extension including the dot (e.g. `".pdf"`). */ + readonly extension?: string; + /** MIME type (e.g. `"application/pdf"`). */ + readonly mime?: string; +} + +/** Options for constructing a {@link Blob}. */ +export interface BlobOptions { + /** MIME type declared by the source (e.g. `"application/pdf"`). */ + readonly contentType?: string; + /** Timestamp when the object was created in the source store. */ + readonly createdAt?: Date; + /** Timestamp when the object was last modified in the source store. */ + readonly updatedAt?: Date; +} + /** * A file or binary blob retrieved from object storage (S3, GCS, Dropbox, etc.). * - * Wraps raw bytes together with their storage path and MIME type so - * downstream processors can decide how to parse the content. + * Wraps raw bytes together with their storage path so downstream + * processors can decide how to parse the content. File-type information + * is available via two {@link Filetype} getters: + * + * - {@link provided} — declared type from the path extension and the + * cloud-provider / caller-supplied `contentType`. + * - {@link identified} — detected type from the actual bytes via + * magic-bytes signatures (lazy, cached on first access). 
* * @example * ```ts - * const obj = new Blob("uploads/report.pdf", Buffer.from(pdfBytes)); - * console.log(obj.size); // byte length + * const blob = new Blob("uploads/report.pdf", pdfBytes, { + * contentType: "application/pdf", + * }); + * blob.provided; // { extension: ".pdf", mime: "application/pdf" } + * blob.identified; // { extension: ".pdf", mime: "application/pdf" } * ``` */ export class Blob extends Data { readonly #path: string; readonly #data: Buffer; - readonly #contentType: string | undefined; + readonly #filetype: Filetype; + readonly #createdAt?: Date | undefined; + readonly #updatedAt?: Date | undefined; - constructor(path: string, data: Buffer, contentType?: string) { + // Lazy magic-bytes cache — `false` means "not yet computed" + #identified: false | Filetype = false; + + constructor(path: string, data: Buffer, options?: BlobOptions) { super(); this.#path = path; this.#data = data; - this.#contentType = contentType; + this.#createdAt = options?.createdAt; + this.#updatedAt = options?.updatedAt; + + const ext = Blob.#parseExtension(path); + this.#filetype = { + ...(ext && { extension: ext }), + ...(options?.contentType && { mime: options.contentType }), + }; } /** Storage path or key (e.g. `"s3://bucket/file.pdf"`). */ @@ -40,13 +80,48 @@ export class Blob extends Data { return this.#data; } - /** MIME type of the content (e.g. `"application/pdf"`). */ - get contentType(): string | undefined { - return this.#contentType; - } - /** Size of the raw data in bytes. */ get size(): number { return this.#data.byteLength; } + + /** Timestamp when the object was created in the source store. */ + get createdAt(): Date | undefined { + return this.#createdAt; + } + + /** Timestamp when the object was last modified in the source store. */ + get updatedAt(): Date | undefined { + return this.#updatedAt; + } + + /** Declared file type derived from path extension and constructor contentType. */ + get provided(): Filetype { + return this.#filetype; + } + + /** File type detected from magic bytes. Fields are absent when bytes are not recognizable. */ + get identified(): Filetype { + return this.#identify(); + } + + #identify(): Filetype { + if (this.#identified === false) { + const detected = filetypeinfo(this.#data); + const first = detected[0]; + this.#identified = first + ? 
{ + ...(first.extension && { extension: `.${first.extension}` }), + ...(first.mime && { mime: first.mime }), + } + : {}; + } + return this.#identified; + } + + static #parseExtension(path: string): string | undefined { + const lastDot = path.lastIndexOf("."); + if (lastDot === -1 || lastDot === path.length - 1) return undefined; + return path.slice(lastDot).toLowerCase(); + } } diff --git a/packages/nvisy-core/src/datatypes/document.test.ts b/packages/nvisy-core/src/datatypes/document.test.ts index 111293d..940b4ad 100644 --- a/packages/nvisy-core/src/datatypes/document.test.ts +++ b/packages/nvisy-core/src/datatypes/document.test.ts @@ -37,11 +37,9 @@ describe("Document", () => { text: "hi", }); const doc = Document.fromElements([el], { - sourceType: "html", title: "My Page", }); expect(doc.title).toBe("My Page"); - expect(doc.sourceType).toBe("html"); }); }); @@ -133,16 +131,6 @@ describe("Document", () => { expect(doc.content).toBe(""); expect(doc.elements).toEqual([]); }); - - it("preserves sourceType", () => { - const el = new Element({ - type: "narrative-text", - text: "text", - }); - const doc = Document.fromElements([el], { sourceType: "pdf" }); - expect(doc.sourceType).toBe("pdf"); - expect(doc.elements).toHaveLength(1); - }); }); describe("getElementsByPage", () => { diff --git a/packages/nvisy-core/src/datatypes/document.ts b/packages/nvisy-core/src/datatypes/document.ts index 8fac4d1..27998a8 100644 --- a/packages/nvisy-core/src/datatypes/document.ts +++ b/packages/nvisy-core/src/datatypes/document.ts @@ -7,31 +7,8 @@ import type { Element } from "../documents/elements.js"; import { Data } from "./data.js"; -export type { - CompositeElementOptions, - ElementOptions, - ElementProvenance, - EmailElementOptions, - EmphasizedText, - FormElementOptions, - FormKeyValuePair, - ImageElementOptions, - Link, - TableCellData, - TableElementOptions, -} from "../documents/elements.js"; -export { - CompositeElement, - Element, - EmailElement, - FormElement, - ImageElement, - TableElement, -} from "../documents/elements.js"; - /** Options for constructing a {@link Document}. */ export interface DocumentOptions { - readonly sourceType?: string; /** Document title (e.g. HTML `<title>`, PDF metadata). */ readonly title?: string; /** Pre-extracted structural elements. */ @@ -54,19 +31,17 @@ export interface DocumentOptions { * const doc = Document.fromElements([ * new Element({ type: "title", text: "Quarterly Report", pageNumber: 1 }), * new Element({ type: "narrative-text", text: "Revenue increased…", pageNumber: 1 }), - * ], { sourceType: "pdf" }); + * ]); * ``` */ export class Document extends Data { readonly #content: string; - readonly #sourceType?: string | undefined; readonly #title?: string | undefined; readonly #elements?: readonly Element[] | undefined; constructor(content: string, options?: DocumentOptions) { super(); this.#content = content; - this.#sourceType = options?.sourceType; this.#title = options?.title; this.#elements = options?.elements; } @@ -76,11 +51,6 @@ export class Document extends Data { return this.#content; } - /** Origin format (e.g. "pdf", "markdown", "docx", "html", "transcript", "database"). */ - get sourceType(): string | undefined { - return this.#sourceType; - } - /** Document title (e.g. HTML `<title>`, PDF metadata). */ get title(): string | undefined { return this.#title; @@ -89,15 +59,15 @@ export class Document extends Data { /** Unique BCP-47 language tags collected from all elements. 
*/ get languages(): readonly string[] { if (this.#elements == null) return []; - const set = new Set<string>(); - for (const el of this.#elements) { - if (el.languages != null) { - for (const lang of el.languages) { - set.add(lang); + const uniqueLanguages = new Set<string>(); + for (const element of this.#elements) { + if (element.languages != null) { + for (const language of element.languages) { + uniqueLanguages.add(language); } } } - return [...set]; + return [...uniqueLanguages]; } /** Flat ordered list of structural elements. */ diff --git a/packages/nvisy-core/src/datatypes/index.ts b/packages/nvisy-core/src/datatypes/index.ts index 7d24ef9..0573345 100644 --- a/packages/nvisy-core/src/datatypes/index.ts +++ b/packages/nvisy-core/src/datatypes/index.ts @@ -4,34 +4,13 @@ * Base data model and built-in types for the Nvisy pipeline. */ -export type { JsonValue, Metadata } from "../types.js"; +export type { BlobOptions, Filetype } from "./blob.js"; export { Blob } from "./blob.js"; export type { ChunkOptions } from "./chunk.js"; export { Chunk } from "./chunk.js"; export { Data } from "./data.js"; -export type { - CompositeElementOptions, - DocumentOptions, - ElementOptions, - ElementProvenance, - EmailElementOptions, - EmphasizedText, - FormElementOptions, - FormKeyValuePair, - ImageElementOptions, - Link, - TableCellData, - TableElementOptions, -} from "./document.js"; -export { - CompositeElement, - Document, - Element, - EmailElement, - FormElement, - ImageElement, - TableElement, -} from "./document.js"; +export type { DocumentOptions } from "./document.js"; +export { Document } from "./document.js"; export { Embedding } from "./embedding.js"; import type { ClassRef } from "../types.js"; @@ -43,7 +22,7 @@ import type { Data } from "./data.js"; * Plugins use this to extend the type system with new {@link Data} * subclasses without modifying nvisy-core. */ -export interface Datatype { +export interface DatatypeDescriptor { /** Unique identifier for this data type (e.g. "audio", "image"). */ readonly id: string; /** Class reference for the custom data type. */ @@ -51,23 +30,9 @@ export interface Datatype { } /** Factory for creating data type entries. */ -export const Datatypes = { - /** Create a Datatype for registering a custom data type with a plugin. */ - define(id: string, dataClass: ClassRef<Data>): Datatype { +export const Datatype = { + /** Create a DatatypeDescriptor for registering a custom data type with a plugin. */ + define(id: string, dataClass: ClassRef<Data>): DatatypeDescriptor { return { id, dataClass }; }, } as const; - -import { Blob } from "./blob.js"; -import { Chunk } from "./chunk.js"; -import { Document } from "./document.js"; -import { Embedding } from "./embedding.js"; - -/** Pre-defined Document datatype entry. */ -export const document = Datatypes.define("document", Document); -/** Pre-defined Chunk datatype entry. */ -export const chunk = Datatypes.define("chunk", Chunk); -/** Pre-defined Blob datatype entry. */ -export const blob = Datatypes.define("blob", Blob); -/** Pre-defined Embedding datatype entry. 
*/ -export const embedding = Datatypes.define("embedding", Embedding); diff --git a/packages/nvisy-core/src/index.ts b/packages/nvisy-core/src/index.ts index c6c3c0b..10bcd05 100644 --- a/packages/nvisy-core/src/index.ts +++ b/packages/nvisy-core/src/index.ts @@ -7,10 +7,22 @@ export type { ActionInstance } from "./action.js"; export { Action } from "./action.js"; export type { + BlobOptions, ChunkOptions, - CompositeElementOptions, - Datatype, + DatatypeDescriptor, DocumentOptions, + Filetype, +} from "./datatypes/index.js"; +export { + Blob, + Chunk, + Data, + Datatype, + Document, + Embedding, +} from "./datatypes/index.js"; +export type { + CompositeElementOptions, ElementOptions, ElementProvenance, EmailElementOptions, @@ -21,21 +33,15 @@ export type { Link, TableCellData, TableElementOptions, -} from "./datatypes/index.js"; +} from "./documents/elements.js"; export { - Blob, - Chunk, CompositeElement, - Data, - Datatypes, - Document, Element, EmailElement, - Embedding, FormElement, ImageElement, TableElement, -} from "./datatypes/index.js"; +} from "./documents/elements.js"; export type { ElementCategory, ElementCoordinates, diff --git a/packages/nvisy-core/src/plugin.ts b/packages/nvisy-core/src/plugin.ts index 940a14f..5f7cb11 100644 --- a/packages/nvisy-core/src/plugin.ts +++ b/packages/nvisy-core/src/plugin.ts @@ -10,7 +10,7 @@ */ import type { ActionInstance } from "./action.js"; -import type { Datatype } from "./datatypes/index.js"; +import type { DatatypeDescriptor } from "./datatypes/index.js"; import type { LoaderInstance } from "./loader.js"; import type { ProviderFactory } from "./provider.js"; import type { StreamSource, StreamTarget } from "./stream.js"; @@ -48,7 +48,7 @@ export interface PluginInstance { /** Loaders keyed by their ID. */ readonly loaders: Readonly<Record<string, AnyLoaderInstance>>; /** Custom data types keyed by their ID. */ - readonly datatypes: Readonly<Record<string, Datatype>>; + readonly datatypes: Readonly<Record<string, DatatypeDescriptor>>; } class PluginBuilder implements PluginInstance { @@ -59,7 +59,7 @@ class PluginBuilder implements PluginInstance { > = {}; readonly actions: Readonly<Record<string, AnyActionInstance>> = {}; readonly loaders: Readonly<Record<string, AnyLoaderInstance>> = {}; - readonly datatypes: Readonly<Record<string, Datatype>> = {}; + readonly datatypes: Readonly<Record<string, DatatypeDescriptor>> = {}; constructor(id: string) { this.id = id; @@ -98,7 +98,7 @@ class PluginBuilder implements PluginInstance { } /** Add custom data types to this plugin. */ - withDatatypes(...datatypes: Datatype[]): this { + withDatatypes(...datatypes: DatatypeDescriptor[]): this { const record = { ...this.datatypes }; for (const d of datatypes) record[d.id] = d; (this as { datatypes: typeof record }).datatypes = record; diff --git a/packages/nvisy-core/src/stream.ts b/packages/nvisy-core/src/stream.ts index a4495ff..d9a079e 100644 --- a/packages/nvisy-core/src/stream.ts +++ b/packages/nvisy-core/src/stream.ts @@ -64,12 +64,12 @@ export type WriterFn<TClient, TData extends Data, TParam> = ( * @template TParam - Configuration parameters for the source. */ export interface SourceConfig<TClient, TData extends Data, TCtx, TParam> { - /** Type information: data class, context schema, and param schema. */ - readonly types: [ - dataClass: ClassRef<TData>, - contextSchema: z.ZodType<TCtx>, - paramSchema: z.ZodType<TParam>, - ]; + /** Class reference for the data type produced. 
*/ + readonly type: ClassRef<TData>; + /** Zod schema for validating and parsing resumption context. */ + readonly context: z.ZodType<TCtx>; + /** Zod schema for validating stream parameters. */ + readonly params: z.ZodType<TParam>; /** The reader function that produces data items. */ readonly reader: ReaderFn<TClient, TData, TCtx, TParam>; } @@ -82,8 +82,10 @@ export interface SourceConfig<TClient, TData extends Data, TCtx, TParam> { * @template TParam - Configuration parameters for the target. */ export interface TargetConfig<TClient, TData extends Data, TParam> { - /** Type information: data class and param schema. */ - readonly types: [dataClass: ClassRef<TData>, paramSchema: z.ZodType<TParam>]; + /** Class reference for the data type consumed. */ + readonly type: ClassRef<TData>; + /** Zod schema for validating stream parameters. */ + readonly params: z.ZodType<TParam>; /** The writer function that persists data items. */ readonly writer: WriterFn<TClient, TData, TParam>; } @@ -229,7 +231,11 @@ export const Stream = { clientClass: ClassRef<TClient>, config: SourceConfig<TClient, TData, TCtx, TParam>, ): StreamSource<TClient, TData, TCtx, TParam> { - const [dataClass, contextSchema, paramSchema] = config.types; + const { + type: dataClass, + context: contextSchema, + params: paramSchema, + } = config; return new StreamSourceImpl({ id, clientClass, @@ -252,7 +258,7 @@ export const Stream = { clientClass: ClassRef<TClient>, config: TargetConfig<TClient, TData, TParam>, ): StreamTarget<TClient, TData, TParam> { - const [dataClass, paramSchema] = config.types; + const { type: dataClass, params: paramSchema } = config; return new StreamTargetImpl({ id, clientClass, diff --git a/packages/nvisy-core/test/provider.fixtures.ts b/packages/nvisy-core/test/provider.fixtures.ts index 4c1c299..a073848 100644 --- a/packages/nvisy-core/test/provider.fixtures.ts +++ b/packages/nvisy-core/test/provider.fixtures.ts @@ -68,11 +68,14 @@ export const ExampleProvider = Provider.withAuthentication("example", { }); export const ExampleSource = Stream.createSource("read", ExampleClient, { - types: [TestRow, Cursor, Params], + type: TestRow, + context: Cursor, + params: Params, reader: (client, ctx, params) => readStream(client, ctx, params), }); export const ExampleTarget = Stream.createTarget("write", ExampleClient, { - types: [TestRow, Params], + type: TestRow, + params: Params, writer: (_client, _params) => async (_item) => {}, }); diff --git a/packages/nvisy-plugin-ai/src/actions/enrich.ts b/packages/nvisy-plugin-ai/src/actions/enrich.ts index 06b4a0b..055bdda 100644 --- a/packages/nvisy-plugin-ai/src/actions/enrich.ts +++ b/packages/nvisy-plugin-ai/src/actions/enrich.ts @@ -121,7 +121,6 @@ async function* transformEnrich( } yield new Document(doc.content, { - ...(doc.sourceType != null ? { sourceType: doc.sourceType } : {}), ...(doc.elements != null ? 
{ elements: doc.elements } : {}), }) .deriveFrom(doc) diff --git a/packages/nvisy-plugin-ai/src/datatypes/index.ts b/packages/nvisy-plugin-ai/src/datatypes/index.ts index 4c83b6c..e69de29 100644 --- a/packages/nvisy-plugin-ai/src/datatypes/index.ts +++ b/packages/nvisy-plugin-ai/src/datatypes/index.ts @@ -1,2 +0,0 @@ -export type { ChunkOptions } from "@nvisy/core"; -export { Chunk } from "@nvisy/core"; diff --git a/packages/nvisy-plugin-core/README.md b/packages/nvisy-plugin-core/README.md new file mode 100644 index 0000000..0522bd0 --- /dev/null +++ b/packages/nvisy-plugin-core/README.md @@ -0,0 +1,141 @@ +# @nvisy/plugin-core + +Core plugin for the Nvisy runtime with built-in chunking, partitioning, loading, and text splitting. + +## Install + +```bash +npm install @nvisy/plugin-core +``` + +## Plugin Registration + +```ts +import { corePlugin } from "@nvisy/plugin-core"; + +// Register with the engine +registry.load(corePlugin); +``` + +The `corePlugin` registers: + +- **Datatype**: `Document`, `Blob`, `Chunk`, `Embedding` +- **Actions**: `chunkSimple`, `partition` +- **Loaders**: `plaintextLoader`, `csvLoader`, `jsonLoader` + +## Actions + +### `chunkSimple` + +Splits documents into smaller chunks. Accepts a `strategy` discriminator to select the splitting method. + +**Character strategy** — fixed-size windows with optional overlap: + +```ts +{ strategy: "character", maxCharacters: 500, overlap: 50 } +``` + +**Section strategy** — split on markdown headings: + +```ts +{ strategy: "section", level: 2, maxCharacters: 1000, combineUnder: 200 } +``` + +**Page strategy** — split on page boundaries (`\f`, `---`, `***`) or structured page elements: + +```ts +{ strategy: "page", maxCharacters: 2000 } +``` + +### `partition` + +Partitions documents into multiple documents with metadata tracking. + +**Auto strategy** — pass-through, preserves content as-is: + +```ts +{ strategy: "auto" } +``` + +**Rule strategy** — split on a regex pattern: + +```ts +{ strategy: "rule", pattern: "\\n{2,}", includeDelimiter: false, inferTableStructure: false } +``` + +## Loaders + +### `plaintextLoader` + +Converts `.txt` blobs into documents. + +| Parameter | Type | Default | +|-----------|------|---------| +| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | + +### `csvLoader` + +Converts `.csv` / `.tsv` blobs into documents. Rows are formatted as `column: value` when headers are present. + +| Parameter | Type | Default | +|-----------|------|---------| +| `delimiter` | `string` | `","` | +| `hasHeader` | `boolean` | `true` | +| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | + +### `jsonLoader` + +Converts `.json` / `.jsonl` / `.ndjson` blobs into documents. Scalar object fields are extracted as document metadata. + +| Parameter | Type | Default | +|-----------|------|---------| +| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | + +## Splitters + +Reusable `string → string[]` splitting utilities, usable independently of the action system. + +### `splitByDelimiter` + +Split text on a literal string delimiter. 
+ +```ts +import { splitByDelimiter } from "@nvisy/plugin-core"; + +splitByDelimiter("a---b---c", { delimiter: "---" }); +// → ["a", "b", "c"] + +splitByDelimiter("a---b---c", { delimiter: "---", keepDelimiter: true }); +// → ["a", "---b", "---c"] +``` + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `delimiter` | `string` | — | String to split on | +| `keepDelimiter` | `boolean` | `false` | Prepend delimiter to subsequent segments | +| `trimEmpty` | `boolean` | `true` | Discard empty/whitespace-only segments | + +### `splitByRegex` + +Split text on a regex pattern (compiled with `gm` flags). + +```ts +import { splitByRegex } from "@nvisy/plugin-core"; + +splitByRegex("intro\n## A\ncontent A\n## B\ncontent B", { pattern: "^## .+$", keepSeparator: true }); +// → ["intro\n", "## A\ncontent A\n", "## B\ncontent B"] +``` + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `pattern` | `string` | — | Regex pattern to split on | +| `keepSeparator` | `boolean` | `false` | Keep matched separator at start of segments | +| `trimEmpty` | `boolean` | `true` | Discard empty/whitespace-only segments | + +## Changelog + +See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. + +## License + +Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) diff --git a/packages/nvisy-plugin-core/package.json b/packages/nvisy-plugin-core/package.json index de9f34a..d69cdb3 100644 --- a/packages/nvisy-plugin-core/package.json +++ b/packages/nvisy-plugin-core/package.json @@ -22,6 +22,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", + "csv-parse": "^6.1.0", "zod": "^4.3.6" }, "engines": { diff --git a/packages/nvisy-plugin-core/src/index.ts b/packages/nvisy-plugin-core/src/index.ts index bf6e8fb..5be4e00 100644 --- a/packages/nvisy-plugin-core/src/index.ts +++ b/packages/nvisy-plugin-core/src/index.ts @@ -1,7 +1,7 @@ import { Blob, Chunk, - Datatypes, + Datatype, Document, Embedding, Plugin, @@ -11,10 +11,10 @@ import { csvLoader, jsonLoader, plaintextLoader } from "./loaders/index.js"; export const corePlugin = Plugin.define("core") .withDatatypes( - Datatypes.define("document", Document), - Datatypes.define("blob", Blob), - Datatypes.define("chunk", Chunk), - Datatypes.define("embedding", Embedding), + Datatype.define("document", Document), + Datatype.define("blob", Blob), + Datatype.define("chunk", Chunk), + Datatype.define("embedding", Embedding), ) .withActions(chunkSimple, partition) .withLoaders(plaintextLoader, csvLoader, jsonLoader); @@ -35,3 +35,8 @@ export type { JsonParams } from "./loaders/json.js"; export { jsonLoader, jsonParamsSchema } from "./loaders/json.js"; export type { PlaintextParams } from "./loaders/plaintext.js"; export { plaintextLoader, plaintextParamsSchema } from "./loaders/plaintext.js"; +export type { + DelimiterSplitOptions, + RegexSplitOptions, +} from "./splitter/index.js"; +export { splitByDelimiter, splitByRegex } from "./splitter/index.js"; diff --git a/packages/nvisy-plugin-core/src/loaders/csv.test.ts b/packages/nvisy-plugin-core/src/loaders/csv.test.ts index 9e6116e..b42cf2b 100644 --- a/packages/nvisy-plugin-core/src/loaders/csv.test.ts +++ b/packages/nvisy-plugin-core/src/loaders/csv.test.ts @@ -24,7 +24,7 @@ describe("csvLoader", () => { expect(csvLoader.contentTypes).toContain("text/csv"); }); - it("parses CSV with headers into one document per row", async () => { + it("parses CSV with headers into a single document", async () => { const csv = 
"name,age\nAlice,30\nBob,25"; const blob = new Blob("data.csv", Buffer.from(csv)); const docs = await collectDocs( @@ -35,41 +35,8 @@ describe("csvLoader", () => { }), ); - expect(docs).toHaveLength(2); - expect(docs[0]!.content).toBe("name: Alice\nage: 30"); - expect(docs[1]!.content).toBe("name: Bob\nage: 25"); - }); - - it("stores header values as metadata", async () => { - const csv = "name,age\nAlice,30"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs[0]!.metadata).toMatchObject({ - name: "Alice", - age: "30", - rowIndex: 0, - }); - }); - - it("sets sourceType to csv", async () => { - const csv = "a,b\n1,2"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs[0]!.sourceType).toBe("csv"); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("name: Alice\nage: 30\n\nname: Bob\nage: 25"); }); it("parses CSV without headers", async () => { @@ -83,9 +50,8 @@ describe("csvLoader", () => { }), ); - expect(docs).toHaveLength(2); - expect(docs[0]!.content).toBe("Alice,30"); - expect(docs[1]!.content).toBe("Bob,25"); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("Alice,30\nBob,25"); }); it("supports tab delimiter for TSV", async () => { @@ -114,9 +80,7 @@ describe("csvLoader", () => { }), ); - expect(docs[0]!.metadata).toMatchObject({ - address: "123 Main St, Apt 4", - }); + expect(docs[0]!.content).toContain("address: 123 Main St, Apt 4"); }); it("handles escaped quotes in fields", async () => { @@ -130,12 +94,10 @@ describe("csvLoader", () => { }), ); - expect(docs[0]!.metadata).toMatchObject({ - note: 'She said "hello"', - }); + expect(docs[0]!.content).toContain('note: She said "hello"'); }); - it("derives documents from blob", async () => { + it("derives document from blob", async () => { const csv = "a\n1\n2"; const blob = new Blob("data.csv", Buffer.from(csv)); const docs = await collectDocs( @@ -146,9 +108,7 @@ describe("csvLoader", () => { }), ); - for (const doc of docs) { - expect(doc.parentId).toBe(blob.id); - } + expect(docs[0]!.parentId).toBe(blob.id); }); it("handles empty file", async () => { @@ -189,7 +149,8 @@ describe("csvLoader", () => { }), ); - expect(docs).toHaveLength(2); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe("name: Alice\nage: 30\n\nname: Bob\nage: 25"); }); it("uses defaults for optional params", async () => { diff --git a/packages/nvisy-plugin-core/src/loaders/csv.ts b/packages/nvisy-plugin-core/src/loaders/csv.ts index 6d924c2..826ce9d 100644 --- a/packages/nvisy-plugin-core/src/loaders/csv.ts +++ b/packages/nvisy-plugin-core/src/loaders/csv.ts @@ -1,17 +1,16 @@ /** * CSV loader. * - * Converts `.csv` and `.tsv` blobs into Documents. Each row becomes - * a separate Document whose content is built from the cell values. - * When a header row is present, cell values are formatted as - * `"column: value"` lines; otherwise raw comma-separated values are - * used as content. + * Converts `.csv` and `.tsv` blobs into a single Document. + * When a header row is present the content is formatted as + * `"column: value"` blocks separated by blank lines; otherwise + * raw delimited rows are used. 
* * @module */ -import type { Metadata } from "@nvisy/core"; -import { Document, Loader } from "@nvisy/core"; +import { type Blob, Document, Loader } from "@nvisy/core"; +import { parse } from "csv-parse/sync"; import { z } from "zod"; /** Schema for CSV loader parameters. */ @@ -32,89 +31,48 @@ export const csvParamsSchema = z export type CsvParams = z.infer<typeof csvParamsSchema>; /** - * Loader that converts CSV/TSV blobs into one Document per row. + * Loader that converts CSV/TSV blobs into a single Document. * - * Header columns are stored as metadata on each Document. + * Header columns are stored as metadata on the Document. */ export const csvLoader = Loader.define<CsvParams>("csv", { extensions: [".csv", ".tsv"], contentTypes: ["text/csv", "text/tab-separated-values"], params: csvParamsSchema, - async *load(blob, params) { - const text = blob.data.toString(params.encoding); - const lines = parseLines(text); - if (lines.length === 0) return; - - let headers: string[] | null = null; - let startIndex = 0; - - if (params.hasHeader && lines.length > 0) { - headers = splitRow(lines[0]!, params.delimiter); - startIndex = 1; - } - - for (let i = startIndex; i < lines.length; i++) { - const cells = splitRow(lines[i]!, params.delimiter); - const content = headers - ? headers.map((h, j) => `${h}: ${cells[j] ?? ""}`).join("\n") - : cells.join(params.delimiter); - - const metadata: Metadata = { - rowIndex: i - startIndex, - ...(headers - ? Object.fromEntries(headers.map((h, j) => [h, cells[j] ?? ""])) - : {}), - }; - - const doc = new Document(content, { sourceType: "csv" }) - .deriveFrom(blob) - .withMetadata(metadata); - yield doc; - } - }, + load: loadCsv, }); -/** Split text into non-empty lines, handling \r\n and \n. */ -function parseLines(text: string): string[] { - return text.split(/\r?\n/).filter((line) => line.length > 0); -} +async function* loadCsv( + blob: Blob, + params: CsvParams, +): AsyncGenerator<Document> { + const text = blob.data.toString(params.encoding); + if (text.trim().length === 0) return; -/** Split a single CSV row on the delimiter, respecting double-quoted fields. */ -function splitRow(line: string, delimiter: string): string[] { - const fields: string[] = []; - let current = ""; - let inQuotes = false; - let i = 0; + const records: string[][] = parse(text, { + delimiter: params.delimiter, + relax_column_count: true, + skip_empty_lines: true, + }); + if (records.length === 0) return; - while (i < line.length) { - const char = line[i]!; + let headers: string[] | null = null; + let dataRows: string[][] = records; - if (inQuotes) { - if (char === '"') { - if (i + 1 < line.length && line[i + 1] === '"') { - current += '"'; - i += 2; - } else { - inQuotes = false; - i++; - } - } else { - current += char; - i++; - } - } else if (char === '"') { - inQuotes = true; - i++; - } else if (line.startsWith(delimiter, i)) { - fields.push(current); - current = ""; - i += delimiter.length; - } else { - current += char; - i++; - } + if (params.hasHeader) { + headers = records[0]!; + dataRows = records.slice(1); } - fields.push(current); - return fields; + if (dataRows.length === 0) return; + + const content = headers + ? dataRows + .map((row) => headers.map((h, j) => `${h}: ${row[j] ?? 
""}`).join("\n")) + .join("\n\n") + : dataRows.map((row) => row.join(params.delimiter)).join("\n"); + + const doc = new Document(content); + doc.deriveFrom(blob); + yield doc; } diff --git a/packages/nvisy-plugin-core/src/loaders/json.test.ts b/packages/nvisy-plugin-core/src/loaders/json.test.ts index c9607e0..5250931 100644 --- a/packages/nvisy-plugin-core/src/loaders/json.test.ts +++ b/packages/nvisy-plugin-core/src/loaders/json.test.ts @@ -34,7 +34,6 @@ describe("jsonLoader", () => { ); expect(docs).toHaveLength(1); - expect(docs[0]!.sourceType).toBe("json"); }); it("promotes scalar fields to metadata", async () => { @@ -51,7 +50,7 @@ describe("jsonLoader", () => { }); }); - it("explodes JSON arrays into one document per element", async () => { + it("creates one document from a JSON array", async () => { const json = JSON.stringify([ { id: 1, text: "first" }, { id: 2, text: "second" }, @@ -61,9 +60,17 @@ describe("jsonLoader", () => { jsonLoader.load(blob, { encoding: "utf-8" }), ); - expect(docs).toHaveLength(2); - expect(docs[0]!.metadata).toMatchObject({ id: 1, arrayIndex: 0 }); - expect(docs[1]!.metadata).toMatchObject({ id: 2, arrayIndex: 1 }); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe( + JSON.stringify( + [ + { id: 1, text: "first" }, + { id: 2, text: "second" }, + ], + null, + 2, + ), + ); }); it("handles string JSON values", async () => { @@ -87,30 +94,29 @@ describe("jsonLoader", () => { expect(docs[0]!.content).toBe(JSON.stringify(obj, null, 2)); }); - it("derives documents from blob", async () => { - const json = JSON.stringify([{ a: 1 }, { b: 2 }]); + it("derives document from blob", async () => { + const json = JSON.stringify({ a: 1 }); const blob = new Blob("data.json", Buffer.from(json)); const docs = await collectDocs( jsonLoader.load(blob, { encoding: "utf-8" }), ); - for (const doc of docs) { - expect(doc.parentId).toBe(blob.id); - } + expect(docs[0]!.parentId).toBe(blob.id); }); }); describe("JSONL files", () => { - it("creates one document per line", async () => { + it("creates one document from JSONL", async () => { const jsonl = '{"id":1}\n{"id":2}\n{"id":3}'; const blob = new Blob("data.jsonl", Buffer.from(jsonl)); const docs = await collectDocs( jsonLoader.load(blob, { encoding: "utf-8" }), ); - expect(docs).toHaveLength(3); - expect(docs[0]!.metadata).toMatchObject({ id: 1, lineIndex: 0 }); - expect(docs[2]!.metadata).toMatchObject({ id: 3, lineIndex: 2 }); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe( + JSON.stringify([{ id: 1 }, { id: 2 }, { id: 3 }], null, 2), + ); }); it("skips empty lines", async () => { @@ -120,7 +126,10 @@ describe("jsonLoader", () => { jsonLoader.load(blob, { encoding: "utf-8" }), ); - expect(docs).toHaveLength(2); + expect(docs).toHaveLength(1); + expect(docs[0]!.content).toBe( + JSON.stringify([{ a: 1 }, { b: 2 }], null, 2), + ); }); it("handles .ndjson extension", async () => { @@ -130,19 +139,17 @@ describe("jsonLoader", () => { jsonLoader.load(blob, { encoding: "utf-8" }), ); - expect(docs).toHaveLength(2); + expect(docs).toHaveLength(1); }); - it("derives documents from blob", async () => { + it("derives document from blob", async () => { const jsonl = '{"a":1}\n{"b":2}'; const blob = new Blob("data.jsonl", Buffer.from(jsonl)); const docs = await collectDocs( jsonLoader.load(blob, { encoding: "utf-8" }), ); - for (const doc of docs) { - expect(doc.parentId).toBe(blob.id); - } + expect(docs[0]!.parentId).toBe(blob.id); }); }); diff --git a/packages/nvisy-plugin-core/src/loaders/json.ts 
b/packages/nvisy-plugin-core/src/loaders/json.ts index 5b65259..43c33c8 100644 --- a/packages/nvisy-plugin-core/src/loaders/json.ts +++ b/packages/nvisy-plugin-core/src/loaders/json.ts @@ -1,18 +1,14 @@ /** * JSON / JSON Lines loader. * - * Converts `.json` and `.jsonl` blobs into Documents. - * - * - **`.json`** — if the root value is an array, each element becomes - * a Document; otherwise the entire file becomes a single Document. - * - **`.jsonl`** — each non-empty line is parsed as a separate JSON - * object and becomes its own Document. + * Converts `.json`, `.jsonl`, and `.ndjson` blobs into a single + * Document whose content is the pretty-printed JSON text. + * For JSONL/NDJSON files the lines are collected into an array first. * * @module */ -import type { Blob, Metadata } from "@nvisy/core"; -import { Document, Loader } from "@nvisy/core"; +import { type Blob, Document, Loader } from "@nvisy/core"; import { z } from "zod"; /** Schema for JSON loader parameters. */ @@ -29,64 +25,35 @@ export const jsonParamsSchema = z export type JsonParams = z.infer<typeof jsonParamsSchema>; /** - * Loader that converts JSON / JSONL blobs into Documents. + * Loader that converts JSON / JSONL blobs into a single Document. * - * Each JSON value is stringified as the Document's content, with - * scalar fields promoted to metadata when the value is an object. + * Scalar object fields are promoted to metadata. */ export const jsonLoader = Loader.define<JsonParams>("json", { extensions: [".json", ".jsonl", ".ndjson"], contentTypes: ["application/json", "application/x-ndjson"], params: jsonParamsSchema, - async *load(blob, params) { - const text = blob.data.toString(params.encoding); - const isJsonLines = - blob.path.endsWith(".jsonl") || blob.path.endsWith(".ndjson"); - - if (isJsonLines) { - yield* loadJsonLines(text, blob); - } else { - yield* loadJson(text, blob); - } - }, + load: loadJson, }); -/** Parse a single JSON file. Arrays are exploded into one Document per element. */ -function* loadJson(text: string, blob: Blob): Generator<Document> { - const parsed: unknown = JSON.parse(text); - - if (Array.isArray(parsed)) { - for (let i = 0; i < parsed.length; i++) { - yield toDocument(parsed[i], blob, { arrayIndex: i }); - } - } else { - yield toDocument(parsed, blob, {}); - } -} - -/** Parse newline-delimited JSON (one object per line). */ -function* loadJsonLines(text: string, blob: Blob): Generator<Document> { - const lines = text.split(/\r?\n/); - let index = 0; - - for (const line of lines) { - const trimmed = line.trim(); - if (trimmed.length === 0) continue; +async function* loadJson( + blob: Blob, + params: JsonParams, +): AsyncGenerator<Document> { + const text = blob.data.toString(params.encoding); + const isJsonLines = + blob.path.endsWith(".jsonl") || blob.path.endsWith(".ndjson"); - const parsed: unknown = JSON.parse(trimmed); - yield toDocument(parsed, blob, { lineIndex: index }); - index++; - } -} - -/** Convert a parsed JSON value into a Document with metadata. */ -function toDocument(value: unknown, blob: Blob, baseMeta: Metadata): Document { + const parsed: unknown = isJsonLines ? parseJsonLines(text) : JSON.parse(text); const content = - typeof value === "string" ? value : JSON.stringify(value, null, 2); - const metadata: Metadata = { ...baseMeta }; + typeof parsed === "string" ? 
parsed : JSON.stringify(parsed, null, 2); - if (typeof value === "object" && value !== null && !Array.isArray(value)) { - for (const [k, v] of Object.entries(value)) { + const doc = new Document(content); + doc.deriveFrom(blob); + + if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) { + const metadata: Record<string, string | number | boolean> = {}; + for (const [k, v] of Object.entries(parsed)) { if ( typeof v === "string" || typeof v === "number" || @@ -95,9 +62,21 @@ function toDocument(value: unknown, blob: Blob, baseMeta: Metadata): Document { metadata[k] = v; } } + if (Object.keys(metadata).length > 0) { + doc.withMetadata(metadata); + } } - return new Document(content, { sourceType: "json" }) - .deriveFrom(blob) - .withMetadata(metadata); + yield doc; +} + +/** Parse newline-delimited JSON into an array of values. */ +function parseJsonLines(text: string): unknown[] { + const results: unknown[] = []; + for (const line of text.split(/\r?\n/)) { + const trimmed = line.trim(); + if (trimmed.length === 0) continue; + results.push(JSON.parse(trimmed)); + } + return results; } diff --git a/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts index 92e8e36..03507ee 100644 --- a/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts +++ b/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts @@ -31,7 +31,6 @@ describe("plaintextLoader", () => { expect(docs).toHaveLength(1); expect(docs[0]!.content).toBe("Hello, world!"); - expect(docs[0]!.sourceType).toBe("text"); }); it("derives document from blob (sets parentId)", async () => { diff --git a/packages/nvisy-plugin-core/src/loaders/plaintext.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.ts index a06c1fc..5fe2202 100644 --- a/packages/nvisy-plugin-core/src/loaders/plaintext.ts +++ b/packages/nvisy-plugin-core/src/loaders/plaintext.ts @@ -7,7 +7,7 @@ * @module */ -import { Document, Loader } from "@nvisy/core"; +import { type Blob, Document, Loader } from "@nvisy/core"; import { z } from "zod"; /** Schema for plaintext loader parameters. 
*/ @@ -33,10 +33,15 @@ export const plaintextLoader = Loader.define<PlaintextParams>("plaintext", { extensions: [".txt"], contentTypes: ["text/plain"], params: plaintextParamsSchema, - async *load(blob, params) { - const content = blob.data.toString(params.encoding); - const doc = new Document(content, { sourceType: "text" }); - doc.deriveFrom(blob); - yield doc; - }, + load: loadPlaintext, }); + +async function* loadPlaintext( + blob: Blob, + params: PlaintextParams, +): AsyncGenerator<Document> { + const content = blob.data.toString(params.encoding); + const doc = new Document(content); + doc.deriveFrom(blob); + yield doc; +} diff --git a/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts b/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts new file mode 100644 index 0000000..91c76d2 --- /dev/null +++ b/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts @@ -0,0 +1,90 @@ +import { describe, expect, it } from "vitest"; +import { splitByDelimiter } from "./delimiter.js"; + +describe("splitByDelimiter", () => { + it("splits on a simple delimiter", () => { + const result = splitByDelimiter("a,b,c", { delimiter: "," }); + expect(result).toEqual(["a", "b", "c"]); + }); + + it("splits on a multi-character delimiter", () => { + const result = splitByDelimiter("a---b---c", { delimiter: "---" }); + expect(result).toEqual(["a", "b", "c"]); + }); + + it("splits on newline delimiter", () => { + const result = splitByDelimiter("line1\nline2\nline3", { + delimiter: "\n", + }); + expect(result).toEqual(["line1", "line2", "line3"]); + }); + + it("returns [text] when delimiter is not found", () => { + const result = splitByDelimiter("no match here", { delimiter: "," }); + expect(result).toEqual(["no match here"]); + }); + + it("returns empty array for empty input (trimEmpty=true)", () => { + const result = splitByDelimiter("", { delimiter: "," }); + expect(result).toEqual([]); + }); + + it("returns [''] for empty input when trimEmpty=false", () => { + const result = splitByDelimiter("", { + delimiter: ",", + trimEmpty: false, + }); + expect(result).toEqual([""]); + }); + + describe("keepDelimiter", () => { + it("prepends delimiter to subsequent segments", () => { + const result = splitByDelimiter("a,b,c", { + delimiter: ",", + keepDelimiter: true, + }); + expect(result).toEqual(["a", ",b", ",c"]); + }); + + it("prepends multi-character delimiter", () => { + const result = splitByDelimiter("a---b---c", { + delimiter: "---", + keepDelimiter: true, + }); + expect(result).toEqual(["a", "---b", "---c"]); + }); + }); + + describe("trimEmpty", () => { + it("filters whitespace-only segments by default", () => { + const result = splitByDelimiter("a,, ,b", { delimiter: "," }); + expect(result).toEqual(["a", "b"]); + }); + + it("keeps whitespace-only segments when trimEmpty=false", () => { + const result = splitByDelimiter("a,, ,b", { + delimiter: ",", + trimEmpty: false, + }); + expect(result).toEqual(["a", "", " ", "b"]); + }); + }); + + it("handles consecutive delimiters", () => { + const result = splitByDelimiter("a,,b", { delimiter: "," }); + expect(result).toEqual(["a", "b"]); + }); + + it("handles delimiter at start and end", () => { + const result = splitByDelimiter(",a,b,", { delimiter: "," }); + expect(result).toEqual(["a", "b"]); + }); + + it("handles delimiter at start and end with trimEmpty=false", () => { + const result = splitByDelimiter(",a,b,", { + delimiter: ",", + trimEmpty: false, + }); + expect(result).toEqual(["", "a", "b", ""]); + }); +}); diff --git 
a/packages/nvisy-plugin-core/src/splitter/delimiter.ts b/packages/nvisy-plugin-core/src/splitter/delimiter.ts new file mode 100644 index 0000000..9251fdf --- /dev/null +++ b/packages/nvisy-plugin-core/src/splitter/delimiter.ts @@ -0,0 +1,31 @@ +export interface DelimiterSplitOptions { + /** String to split on (e.g. `"\n"`, `"---"`). */ + readonly delimiter: string; + /** If true, keep the delimiter at the start of each subsequent segment. Default: false. */ + readonly keepDelimiter?: boolean; + /** Discard segments that are empty or whitespace-only after splitting. Default: true. */ + readonly trimEmpty?: boolean; +} + +/** Split `text` on a literal delimiter string. */ +export function splitByDelimiter( + text: string, + options: DelimiterSplitOptions, +): string[] { + const { delimiter, keepDelimiter = false, trimEmpty = true } = options; + + const raw = text.split(delimiter); + + let segments: string[]; + if (keepDelimiter) { + segments = raw.map((seg, i) => (i === 0 ? seg : `${delimiter}${seg}`)); + } else { + segments = raw; + } + + if (trimEmpty) { + segments = segments.filter((s) => s.trim().length > 0); + } + + return segments; +} diff --git a/packages/nvisy-plugin-core/src/splitter/index.ts b/packages/nvisy-plugin-core/src/splitter/index.ts new file mode 100644 index 0000000..a9345a4 --- /dev/null +++ b/packages/nvisy-plugin-core/src/splitter/index.ts @@ -0,0 +1,4 @@ +export type { DelimiterSplitOptions } from "./delimiter.js"; +export { splitByDelimiter } from "./delimiter.js"; +export type { RegexSplitOptions } from "./regex.js"; +export { splitByRegex } from "./regex.js"; diff --git a/packages/nvisy-plugin-core/src/splitter/regex.test.ts b/packages/nvisy-plugin-core/src/splitter/regex.test.ts new file mode 100644 index 0000000..54c53c1 --- /dev/null +++ b/packages/nvisy-plugin-core/src/splitter/regex.test.ts @@ -0,0 +1,90 @@ +import { describe, expect, it } from "vitest"; +import { splitByRegex } from "./regex.js"; + +describe("splitByRegex", () => { + it("splits on a simple pattern", () => { + const result = splitByRegex("a1b2c", { pattern: "\\d" }); + expect(result).toEqual(["a", "b", "c"]); + }); + + it("splits on a multi-character pattern", () => { + const result = splitByRegex("hello---world---end", { pattern: "-+" }); + expect(result).toEqual(["hello", "world", "end"]); + }); + + it("splits on newline patterns", () => { + const result = splitByRegex("line1\n\nline2\n\nline3", { + pattern: "\\n{2,}", + }); + expect(result).toEqual(["line1", "line2", "line3"]); + }); + + it("returns [text] when pattern does not match", () => { + const result = splitByRegex("no match here", { pattern: "\\d+" }); + expect(result).toEqual(["no match here"]); + }); + + it("returns empty array for empty input (trimEmpty=true)", () => { + const result = splitByRegex("", { pattern: "\\d" }); + expect(result).toEqual([]); + }); + + it("returns [''] for empty input when trimEmpty=false", () => { + const result = splitByRegex("", { pattern: "\\d", trimEmpty: false }); + expect(result).toEqual([""]); + }); + + describe("keepSeparator", () => { + it("prepends matched separator to subsequent segments", () => { + const result = splitByRegex("intro\n## A\ncontent A\n## B\ncontent B", { + pattern: "^## .+$", + keepSeparator: true, + }); + expect(result).toEqual([ + "intro\n", + "## A\ncontent A\n", + "## B\ncontent B", + ]); + }); + + it("keeps separator with simple pattern", () => { + const result = splitByRegex("a1b2c", { + pattern: "\\d", + keepSeparator: true, + }); + expect(result).toEqual(["a", "1b", 
"2c"]); + }); + }); + + describe("trimEmpty", () => { + it("filters whitespace-only segments by default", () => { + const result = splitByRegex("a,,b", { pattern: "," }); + expect(result).toEqual(["a", "b"]); + }); + + it("keeps whitespace-only segments when trimEmpty=false", () => { + const result = splitByRegex("a,,b", { + pattern: ",", + trimEmpty: false, + }); + expect(result).toEqual(["a", "", "b"]); + }); + }); + + it("handles consecutive separators", () => { + const result = splitByRegex("a--b--c", { pattern: "-" }); + expect(result).toEqual(["a", "b", "c"]); + }); + + it("handles pattern at start and end", () => { + const result = splitByRegex("1a1b1", { pattern: "\\d" }); + expect(result).toEqual(["a", "b"]); + }); + + it("uses multiline flag so ^ matches line starts", () => { + const result = splitByRegex("line1\nline2\nline3", { + pattern: "^line2$", + }); + expect(result).toEqual(["line1\n", "\nline3"]); + }); +}); diff --git a/packages/nvisy-plugin-core/src/splitter/regex.ts b/packages/nvisy-plugin-core/src/splitter/regex.ts new file mode 100644 index 0000000..c39e71a --- /dev/null +++ b/packages/nvisy-plugin-core/src/splitter/regex.ts @@ -0,0 +1,59 @@ +export interface RegexSplitOptions { + /** Pattern to split on. Compiled to a RegExp with the `gm` flags. */ + readonly pattern: string; + /** If true, keep the matched separator at the start of each subsequent segment. Default: false. */ + readonly keepSeparator?: boolean; + /** Discard segments that are empty or whitespace-only after splitting. Default: true. */ + readonly trimEmpty?: boolean; +} + +/** Split `text` on a regex pattern. */ +export function splitByRegex( + text: string, + options: RegexSplitOptions, +): string[] { + const { pattern, keepSeparator = false, trimEmpty = true } = options; + + const re = new RegExp(pattern, "gm"); + + // Collect all match boundaries + const boundaries: { start: number; end: number }[] = []; + for (let match = re.exec(text); match !== null; match = re.exec(text)) { + if (match[0].length === 0) { + re.lastIndex++; + continue; + } + boundaries.push({ start: match.index, end: match.index + match[0].length }); + } + + if (boundaries.length === 0) { + const result = trimEmpty && text.trim().length === 0 ? [] : [text]; + return result; + } + + const segments: string[] = []; + + if (keepSeparator) { + // First segment: everything before the first match + segments.push(text.slice(0, boundaries[0]!.start)); + // Subsequent segments start at each match start, end at next match start (or end of text) + for (let i = 0; i < boundaries.length; i++) { + const segStart = boundaries[i]!.start; + const segEnd = + i + 1 < boundaries.length ? boundaries[i + 1]!.start : text.length; + segments.push(text.slice(segStart, segEnd)); + } + } else { + let cursor = 0; + for (const b of boundaries) { + segments.push(text.slice(cursor, b.start)); + cursor = b.end; + } + segments.push(text.slice(cursor)); + } + + if (trimEmpty) { + return segments.filter((s) => s.trim().length > 0); + } + return segments; +} diff --git a/packages/nvisy-plugin-object/src/streams/read.ts b/packages/nvisy-plugin-object/src/streams/read.ts index 48a6181..73cdd60 100644 --- a/packages/nvisy-plugin-object/src/streams/read.ts +++ b/packages/nvisy-plugin-object/src/streams/read.ts @@ -33,7 +33,9 @@ export type ObjectCursor = z.infer<typeof ObjectCursor>; * list API. 
*/ export const read = Stream.createSource("read", ObjectStoreClient, { - types: [Blob, ObjectCursor, ObjectParams], + type: Blob, + context: ObjectCursor, + params: ObjectParams, reader: (client, cursor, params) => readStream(client, cursor, params), }); @@ -71,7 +73,7 @@ async function* readStream( const { data, contentType } = await client.get(key); totalObjects++; yield { - data: new Blob(key, data, contentType), + data: new Blob(key, data, { contentType }), context: { lastKey: key } as ObjectCursor, }; } catch (error) { diff --git a/packages/nvisy-plugin-object/src/streams/write.ts b/packages/nvisy-plugin-object/src/streams/write.ts index 2553438..0fb199f 100644 --- a/packages/nvisy-plugin-object/src/streams/write.ts +++ b/packages/nvisy-plugin-object/src/streams/write.ts @@ -19,11 +19,12 @@ export type WriteParams = z.infer<typeof WriteParams>; * via the provider client's `put` method. */ export const write = Stream.createTarget("write", ObjectStoreClient, { - types: [Blob, WriteParams], + type: Blob, + params: WriteParams, writer: (client, params) => async (item: Blob) => { const key = params.prefix ? `${params.prefix}${item.path}` : item.path; try { - await client.put(key, item.data, item.contentType); + await client.put(key, item.data, item.provided.mime); logger.debug("Put object {key} ({size} bytes)", { key, size: item.size, diff --git a/packages/nvisy-plugin-sql/src/index.ts b/packages/nvisy-plugin-sql/src/index.ts index 52749e2..7eff668 100644 --- a/packages/nvisy-plugin-sql/src/index.ts +++ b/packages/nvisy-plugin-sql/src/index.ts @@ -16,7 +16,7 @@ * ``` */ -import { Datatypes, Plugin } from "@nvisy/core"; +import { Datatype, Plugin } from "@nvisy/core"; import { coerce, filter, project, rename } from "./actions/index.js"; import { Row } from "./datatypes/index.js"; import { mssql, mysql, postgres } from "./providers/index.js"; @@ -27,6 +27,6 @@ export const sqlPlugin = Plugin.define("sql") .withProviders(postgres, mysql, mssql) .withStreams(read, write) .withActions(filter, project, rename, coerce) - .withDatatypes(Datatypes.define("row", Row)); + .withDatatypes(Datatype.define("row", Row)); export { Row } from "./datatypes/index.js"; diff --git a/packages/nvisy-plugin-sql/src/streams/read.ts b/packages/nvisy-plugin-sql/src/streams/read.ts index 3f93395..9d886e8 100644 --- a/packages/nvisy-plugin-sql/src/streams/read.ts +++ b/packages/nvisy-plugin-sql/src/streams/read.ts @@ -17,7 +17,9 @@ const logger = getLogger(["nvisy", "sql"]); * have been yielded. */ export const read = Stream.createSource("read", KyselyClient, { - types: [Row, SqlCursor, SqlParams], + type: Row, + context: SqlCursor, + params: SqlParams, reader: (client, cursor, params) => readStream(client, cursor, params), }); diff --git a/packages/nvisy-plugin-sql/src/streams/write.ts b/packages/nvisy-plugin-sql/src/streams/write.ts index accd9d9..9d63f46 100644 --- a/packages/nvisy-plugin-sql/src/streams/write.ts +++ b/packages/nvisy-plugin-sql/src/streams/write.ts @@ -14,7 +14,8 @@ const logger = getLogger(["nvisy", "sql"]); * an individual INSERT statement. 
*/ export const write = Stream.createTarget("write", KyselyClient, { - types: [Row, SqlParams], + type: Row, + params: SqlParams, writer: (client, params) => async (item: Row) => { const record = item.columns as Record<string, unknown>; if (Object.keys(record).length === 0) return; diff --git a/packages/nvisy-plugin-vector/package.json b/packages/nvisy-plugin-vector/package.json index abfe938..1787197 100644 --- a/packages/nvisy-plugin-vector/package.json +++ b/packages/nvisy-plugin-vector/package.json @@ -22,7 +22,7 @@ "dependencies": { "@logtape/logtape": "^2.0.2", "@nvisy/core": "*", - "@pinecone-database/pinecone": "^4.0.0", + "@pinecone-database/pinecone": "^7.0.0", "@qdrant/js-client-rest": "^1.13.0", "@zilliz/milvus2-sdk-node": "^2.5.0", "pg": "^8.13.0", diff --git a/packages/nvisy-plugin-vector/src/providers/pinecone.ts b/packages/nvisy-plugin-vector/src/providers/pinecone.ts index 76e90f7..baba0a1 100644 --- a/packages/nvisy-plugin-vector/src/providers/pinecone.ts +++ b/packages/nvisy-plugin-vector/src/providers/pinecone.ts @@ -30,13 +30,13 @@ class PineconeVectorClient extends VectorClient { } async upsert(vectors: UpsertVector[]): Promise<void> { - await this.#index.upsert( - vectors.map((v) => ({ + await this.#index.upsert({ + records: vectors.map((v) => ({ id: v.id, values: [...v.vector], metadata: v.metadata as Record<string, string>, })), - ); + }); } } diff --git a/packages/nvisy-plugin-vector/src/streams/upsert.ts b/packages/nvisy-plugin-vector/src/streams/upsert.ts index e12da23..4cb4f7e 100644 --- a/packages/nvisy-plugin-vector/src/streams/upsert.ts +++ b/packages/nvisy-plugin-vector/src/streams/upsert.ts @@ -16,7 +16,8 @@ export type UpsertParams = z.infer<typeof UpsertParams>; * via the provider client's `upsert` method. */ export const upsert = Stream.createTarget("upsert", VectorClient, { - types: [Embedding, UpsertParams], + type: Embedding, + params: UpsertParams, writer: (client: VectorClient, _params: UpsertParams) => async (item: Embedding) => { diff --git a/packages/nvisy-runtime/package.json b/packages/nvisy-runtime/package.json index 4395af2..b74d8cd 100644 --- a/packages/nvisy-runtime/package.json +++ b/packages/nvisy-runtime/package.json @@ -27,7 +27,7 @@ "graphology": "^0.26.0", "graphology-dag": "^0.4.1", "graphology-types": "^0.24.8", - "magic-bytes.js": "^1.13.0", + "zod": "^4.3.6" }, "engines": { diff --git a/packages/nvisy-runtime/src/engine/bridge.ts b/packages/nvisy-runtime/src/engine/bridge.ts index fd2f22a..cc195f4 100644 --- a/packages/nvisy-runtime/src/engine/bridge.ts +++ b/packages/nvisy-runtime/src/engine/bridge.ts @@ -67,11 +67,7 @@ export async function* applyLoaderBridge( continue; } - const loader = registry.findLoaderForBlob({ - path: item.path, - data: item.data, - ...(item.contentType && { contentType: item.contentType }), - }); + const loader = registry.findLoaderForBlob(item); if (!loader) { if (options?.ignoreUnsupported) { @@ -81,7 +77,7 @@ export async function* applyLoaderBridge( continue; } throw new RuntimeError( - `No loader found for blob: ${item.path} (contentType: ${item.contentType ?? "unknown"})`, + `No loader found for blob: ${item.path} (mime: ${item.provided.mime ?? 
"unknown"})`, { source: "bridge", retryable: false }, ); } diff --git a/packages/nvisy-runtime/src/registry.ts b/packages/nvisy-runtime/src/registry.ts index b5fbd22..08ffe14 100644 --- a/packages/nvisy-runtime/src/registry.ts +++ b/packages/nvisy-runtime/src/registry.ts @@ -5,11 +5,10 @@ import type { AnyProviderFactory, AnyStreamSource, AnyStreamTarget, - Datatype, + DatatypeDescriptor, PluginInstance, } from "@nvisy/core"; -import { ValidationError } from "@nvisy/core"; -import { filetypeinfo } from "magic-bytes.js"; +import { type Blob, ValidationError } from "@nvisy/core"; const logger = getLogger(["nvisy", "registry"]); @@ -49,7 +48,7 @@ export class Registry { readonly #loaders = new Map<string, AnyLoaderInstance>(); readonly #providers = new Map<string, AnyProviderFactory>(); readonly #streams = new Map<string, AnyStreamSource | AnyStreamTarget>(); - readonly #datatypes = new Map<string, Datatype>(); + readonly #datatypes = new Map<string, DatatypeDescriptor>(); readonly #plugins = new Set<string>(); /** Snapshot of all registered actions and providers with their schemas. */ @@ -159,7 +158,7 @@ export class Registry { } /** Look up a data type by name. */ - getDataType(name: string): Datatype { + getDataType(name: string): DatatypeDescriptor { return this.#getOrThrow(this.#datatypes, name, "datatype"); } @@ -184,7 +183,7 @@ export class Registry { } /** Look up a data type by name, returning undefined if not found. */ - findDataType(name: string): Datatype | undefined { + findDataType(name: string): DatatypeDescriptor | undefined { return this.#datatypes.get(name); } @@ -192,40 +191,32 @@ export class Registry { * Find a loader that matches the given blob by content type, magic bytes, or extension. * * Matching priority: - * 1. If blob has contentType, match by contentType first - * 2. Detect file type from magic bytes and match by extension - * 3. Fall back to file extension from blob.path + * 1. If blob has provided MIME (contentType), match by contentType first + * 2. Match by identified (magic bytes) extension + * 3. 
Fall back to provided extension from blob path */ - findLoaderForBlob(blob: { - path: string; - contentType?: string; - data?: Uint8Array; - }): AnyLoaderInstance | undefined { - if (blob.contentType) { + findLoaderForBlob(blob: Blob): AnyLoaderInstance | undefined { + const provided = blob.provided; + if (provided.mime) { for (const loader of this.#loaders.values()) { - if (loader.contentTypes.includes(blob.contentType)) { + if (loader.contentTypes.includes(provided.mime)) { return loader; } } } - if (blob.data) { - const detected = filetypeinfo(blob.data); - const first = detected[0]; - if (first?.extension) { - const ext = `.${first.extension}`; - for (const loader of this.#loaders.values()) { - if (loader.extensions.includes(ext)) { - return loader; - } + const identified = blob.identified; + if (identified.extension) { + for (const loader of this.#loaders.values()) { + if (loader.extensions.includes(identified.extension)) { + return loader; } } } - const ext = this.#getExtension(blob.path); - if (ext) { + if (provided.extension) { for (const loader of this.#loaders.values()) { - if (loader.extensions.includes(ext)) { + if (loader.extensions.includes(provided.extension)) { return loader; } } @@ -242,12 +233,4 @@ export class Registry { } return entry; } - - #getExtension(path: string): string | undefined { - const lastDot = path.lastIndexOf("."); - if (lastDot === -1 || lastDot === path.length - 1) { - return undefined; - } - return path.slice(lastDot).toLowerCase(); - } } diff --git a/packages/nvisy-runtime/test/engine.test.ts b/packages/nvisy-runtime/test/engine.test.ts index 885b14d..6ea0060 100644 --- a/packages/nvisy-runtime/test/engine.test.ts +++ b/packages/nvisy-runtime/test/engine.test.ts @@ -222,11 +222,9 @@ describe("execute", () => { }); const failSource = Stream.createSource("read", FailClient, { - types: [ - Document, - z.object({}).default({}), - z.record(z.string(), z.unknown()), - ], + type: Document, + context: z.object({}).default({}), + params: z.record(z.string(), z.unknown()), // biome-ignore lint/correctness/useYield: intentionally throws before yielding to test error handling reader: async function* () { throw new RuntimeError("Non-retryable failure", { @@ -236,7 +234,8 @@ describe("execute", () => { }); const failTarget = Stream.createTarget("write", FailClient, { - types: [Document, z.record(z.string(), z.unknown())], + type: Document, + params: z.record(z.string(), z.unknown()), writer: () => async () => {}, }); @@ -326,11 +325,9 @@ describe("execute", () => { }); const retrySource = Stream.createSource("read", RetryClient, { - types: [ - Document, - z.object({}).default({}), - z.record(z.string(), z.unknown()), - ], + type: Document, + context: z.object({}).default({}), + params: z.record(z.string(), z.unknown()), reader: async function* () { attempts++; if (attempts < 3) { @@ -344,7 +341,8 @@ describe("execute", () => { }); const retryTarget = Stream.createTarget("write", RetryClient, { - types: [Document, z.record(z.string(), z.unknown())], + type: Document, + params: z.record(z.string(), z.unknown()), writer: () => async () => {}, }); diff --git a/packages/nvisy-runtime/test/fixtures.ts b/packages/nvisy-runtime/test/fixtures.ts index 6b09fac..d42b087 100644 --- a/packages/nvisy-runtime/test/fixtures.ts +++ b/packages/nvisy-runtime/test/fixtures.ts @@ -66,7 +66,9 @@ export const sourceEntries: TestRow[] = [ ]; export const testSourceStream = Stream.createSource("read", TestClient, { - types: [TestRow, TestContext, TestParams], + type: TestRow, + context: 
TestContext, + params: TestParams, reader: async function* (_client, _ctx, _params) { for (const row of sourceEntries) { yield { data: row, context: { cursor: row.id } } as Resumable< @@ -84,7 +86,8 @@ export const testSourceStream = Stream.createSource("read", TestClient, { export const writtenItems: Data[] = []; export const testTargetStream = Stream.createTarget("write", TestClient, { - types: [TestRow, TestParams], + type: TestRow, + params: TestParams, writer: (_client, _params) => { return async (item: TestRow) => { writtenItems.push(item); diff --git a/packages/nvisy-runtime/test/registry.test.ts b/packages/nvisy-runtime/test/registry.test.ts index 8f4f114..5252715 100644 --- a/packages/nvisy-runtime/test/registry.test.ts +++ b/packages/nvisy-runtime/test/registry.test.ts @@ -1,4 +1,4 @@ -import { Datatypes, Document, Plugin, ValidationError } from "@nvisy/core"; +import { Datatype, Document, Plugin, ValidationError } from "@nvisy/core"; import { describe, expect, it } from "vitest"; import { Registry } from "../src/registry.js"; import { @@ -49,7 +49,7 @@ describe("Registry", () => { it("loads datatypes and resolves them", () => { const registry = new Registry(); const plugin = Plugin.define("dt").withDatatypes( - Datatypes.define("document", Document), + Datatype.define("document", Document), ); registry.load(plugin); From d5e7d199d66c9d20d6d0289c59acc5262b0793fe Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Tue, 10 Feb 2026 08:56:56 +0100 Subject: [PATCH 06/17] feat: rewrite runtime from TypeScript to Rust + Python Replace the entire TypeScript implementation with a Rust workspace (6 crates) and a Python AI package, porting all domain types, DAG engine, detection patterns, object store, and HTTP server. Rust crates: - nvisy-core: domain types, traits (Action, Provider, Stream, Loader), plugin registry - nvisy-detect: 10 regex patterns, 6 actions (detect/evaluate/redact/classify/audit), 3 loaders - nvisy-engine: DAG compiler (petgraph), executor (tokio tasks + mpsc), retry/timeout, run tracking - nvisy-object: ObjectStoreClient trait, S3 provider (aws-sdk-s3), read/write streams - nvisy-python: PyO3 bridge for AI NER (GIL + spawn_blocking), action wrappers - nvisy-server: Axum HTTP server with health, graphs, redact, policies, audit routes Python package (packages/nvisy-ai): - NER detection for text and images via OpenAI, Anthropic, and Gemini providers Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- .cargo/config.toml | 5 + .gitignore | 59 +- Cargo.lock | 3402 ++++++++ Cargo.toml | 89 + biome.json | 42 - crates/nvisy-core/Cargo.toml | 46 + crates/nvisy-core/src/data.rs | 84 + crates/nvisy-core/src/datatypes/audit.rs | 70 + crates/nvisy-core/src/datatypes/blob.rs | 84 + crates/nvisy-core/src/datatypes/document.rs | 95 + crates/nvisy-core/src/datatypes/entity.rs | 68 + crates/nvisy-core/src/datatypes/image.rs | 51 + crates/nvisy-core/src/datatypes/mod.rs | 7 + crates/nvisy-core/src/datatypes/policy.rs | 87 + crates/nvisy-core/src/datatypes/redaction.rs | 47 + crates/nvisy-core/src/documents/elements.rs | 177 + crates/nvisy-core/src/documents/mod.rs | 2 + crates/nvisy-core/src/documents/ontology.rs | 76 + crates/nvisy-core/src/errors/mod.rs | 109 + crates/nvisy-core/src/lib.rs | 8 + crates/nvisy-core/src/plugin.rs | 52 + crates/nvisy-core/src/registry.rs | 149 + crates/nvisy-core/src/traits/action.rs | 43 + crates/nvisy-core/src/traits/loader.rs | 26 + crates/nvisy-core/src/traits/mod.rs | 4 + crates/nvisy-core/src/traits/provider.rs | 29 + 
crates/nvisy-core/src/traits/stream.rs | 39 + crates/nvisy-core/src/types.rs | 51 + crates/nvisy-detect/Cargo.toml | 43 + .../src/actions/apply_redaction.rs | 138 + crates/nvisy-detect/src/actions/classify.rs | 114 + .../src/actions/detect_checksum.rs | 99 + .../nvisy-detect/src/actions/detect_regex.rs | 115 + crates/nvisy-detect/src/actions/emit_audit.rs | 90 + .../src/actions/evaluate_policy.rs | 137 + crates/nvisy-detect/src/actions/mod.rs | 6 + crates/nvisy-detect/src/lib.rs | 29 + crates/nvisy-detect/src/loaders/csv_loader.rs | 37 + .../nvisy-detect/src/loaders/json_loader.rs | 41 + crates/nvisy-detect/src/loaders/mod.rs | 3 + crates/nvisy-detect/src/loaders/plaintext.rs | 40 + crates/nvisy-detect/src/patterns/api_key.rs | 39 + .../nvisy-detect/src/patterns/credit_card.rs | 38 + crates/nvisy-detect/src/patterns/email.rs | 12 + .../nvisy-detect/src/patterns/ip_address.rs | 21 + crates/nvisy-detect/src/patterns/mod.rs | 55 + crates/nvisy-detect/src/patterns/phone.rs | 12 + crates/nvisy-detect/src/patterns/ssn.rs | 32 + crates/nvisy-engine/Cargo.toml | 51 + crates/nvisy-engine/src/compiler/mod.rs | 5 + crates/nvisy-engine/src/compiler/parse.rs | 44 + crates/nvisy-engine/src/compiler/plan.rs | 110 + crates/nvisy-engine/src/connections.rs | 15 + crates/nvisy-engine/src/executor/context.rs | 43 + crates/nvisy-engine/src/executor/mod.rs | 5 + crates/nvisy-engine/src/executor/nodes.rs | 63 + crates/nvisy-engine/src/executor/runner.rs | 152 + crates/nvisy-engine/src/lib.rs | 6 + crates/nvisy-engine/src/policies.rs | 62 + crates/nvisy-engine/src/runs.rs | 168 + crates/nvisy-engine/src/schema.rs | 120 + crates/nvisy-object/Cargo.toml | 48 + crates/nvisy-object/src/client.rs | 32 + crates/nvisy-object/src/lib.rs | 16 + crates/nvisy-object/src/providers/mod.rs | 1 + crates/nvisy-object/src/providers/s3.rs | 156 + crates/nvisy-object/src/streams/mod.rs | 2 + crates/nvisy-object/src/streams/read.rs | 73 + crates/nvisy-object/src/streams/write.rs | 55 + crates/nvisy-python/Cargo.toml | 46 + crates/nvisy-python/src/actions.rs | 141 + crates/nvisy-python/src/bridge.rs | 38 + crates/nvisy-python/src/error.rs | 13 + crates/nvisy-python/src/lib.rs | 17 + crates/nvisy-python/src/ner.rs | 171 + crates/nvisy-python/src/provider.rs | 34 + crates/nvisy-server/Cargo.toml | 62 + crates/nvisy-server/src/app.rs | 41 + crates/nvisy-server/src/config.rs | 19 + crates/nvisy-server/src/main.rs | 29 + crates/nvisy-server/src/middleware/mod.rs | 2 + crates/nvisy-server/src/routes/audit.rs | 47 + crates/nvisy-server/src/routes/graphs.rs | 71 + crates/nvisy-server/src/routes/health.rs | 16 + crates/nvisy-server/src/routes/mod.rs | 5 + crates/nvisy-server/src/routes/policies.rs | 100 + crates/nvisy-server/src/routes/redact.rs | 42 + crates/nvisy-server/src/schemas/mod.rs | 1 + .../nvisy-server/src/service/audit_store.rs | 72 + .../src/service/engine_factory.rs | 16 + crates/nvisy-server/src/service/mod.rs | 3 + .../nvisy-server/src/service/policy_store.rs | 78 + crates/nvisy-server/src/state.rs | 14 + package-lock.json | 7640 ----------------- package.json | 38 - packages/README.md | 59 - packages/nvisy-ai/pyproject.toml | 16 + packages/nvisy-ai/src/nvisy_ai/__init__.py | 5 + packages/nvisy-ai/src/nvisy_ai/ner.py | 145 + packages/nvisy-ai/src/nvisy_ai/prompts.py | 37 + .../src/nvisy_ai/providers/__init__.py | 1 + .../src/nvisy_ai/providers/anthropic.py | 61 + .../nvisy-ai/src/nvisy_ai/providers/base.py | 30 + .../nvisy-ai/src/nvisy_ai/providers/gemini.py | 39 + .../nvisy-ai/src/nvisy_ai/providers/openai.py | 59 + 
packages/nvisy-core/README.md | 186 - packages/nvisy-core/package.json | 30 - packages/nvisy-core/src/action.ts | 246 - .../nvisy-core/src/datatypes/blob.test.ts | 140 - packages/nvisy-core/src/datatypes/blob.ts | 127 - .../nvisy-core/src/datatypes/chunk.test.ts | 77 - packages/nvisy-core/src/datatypes/chunk.ts | 57 - .../nvisy-core/src/datatypes/data.test.ts | 88 - packages/nvisy-core/src/datatypes/data.ts | 66 - .../nvisy-core/src/datatypes/document.test.ts | 242 - packages/nvisy-core/src/datatypes/document.ts | 112 - .../src/datatypes/embedding.test.ts | 50 - .../nvisy-core/src/datatypes/embedding.ts | 40 - packages/nvisy-core/src/datatypes/index.ts | 38 - .../src/documents/coordinates.test.ts | 148 - .../nvisy-core/src/documents/coordinates.ts | 174 - .../nvisy-core/src/documents/elements.test.ts | 310 - packages/nvisy-core/src/documents/elements.ts | 374 - packages/nvisy-core/src/documents/index.ts | 69 - .../nvisy-core/src/documents/ontology.test.ts | 137 - packages/nvisy-core/src/documents/ontology.ts | 125 - .../nvisy-core/src/errors/cancellation.ts | 26 - packages/nvisy-core/src/errors/connection.ts | 63 - packages/nvisy-core/src/errors/index.ts | 24 - packages/nvisy-core/src/errors/runtime.ts | 88 - packages/nvisy-core/src/errors/timeout.ts | 21 - packages/nvisy-core/src/errors/validation.ts | 57 - packages/nvisy-core/src/index.ts | 98 - packages/nvisy-core/src/loader.ts | 109 - packages/nvisy-core/src/plugin.ts | 115 - packages/nvisy-core/src/provider.ts | 197 - packages/nvisy-core/src/stream.ts | 270 - packages/nvisy-core/src/types.ts | 31 - packages/nvisy-core/test/action.fixtures.ts | 64 - packages/nvisy-core/test/action.test.ts | 84 - packages/nvisy-core/test/provider.fixtures.ts | 81 - packages/nvisy-core/test/provider.test.ts | 83 - packages/nvisy-core/tsconfig.json | 12 - packages/nvisy-core/tsup.config.ts | 22 - packages/nvisy-plugin-ai/README.md | 53 - packages/nvisy-plugin-ai/package.json | 34 - .../src/actions/chunk-contextual.ts | 71 - .../src/actions/chunk-similarity.ts | 90 - .../src/actions/enrich-by-description.ts | 35 - .../src/actions/enrich-by-metadata.ts | 35 - .../src/actions/enrich-by-ner.ts | 39 - .../src/actions/enrich-by-table-html.ts | 33 - .../nvisy-plugin-ai/src/actions/enrich.ts | 129 - .../src/actions/generate-embedding.ts | 60 - packages/nvisy-plugin-ai/src/actions/index.ts | 5 - .../src/actions/partition-contextual.ts | 33 - .../nvisy-plugin-ai/src/datatypes/index.ts | 0 packages/nvisy-plugin-ai/src/index.ts | 53 - .../src/providers/anthropic.ts | 31 - .../nvisy-plugin-ai/src/providers/client.ts | 230 - .../nvisy-plugin-ai/src/providers/gemini.ts | 48 - .../nvisy-plugin-ai/src/providers/index.ts | 18 - .../nvisy-plugin-ai/src/providers/openai.ts | 48 - .../nvisy-plugin-ai/src/providers/schemas.ts | 16 - packages/nvisy-plugin-ai/tsconfig.json | 13 - packages/nvisy-plugin-ai/tsup.config.ts | 22 - packages/nvisy-plugin-core/README.md | 141 - packages/nvisy-plugin-core/package.json | 31 - .../src/actions/chunk-by-character.test.ts | 85 - .../src/actions/chunk-by-character.ts | 40 - .../src/actions/chunk-by-page.test.ts | 52 - .../src/actions/chunk-by-page.ts | 70 - .../src/actions/chunk-by-section.test.ts | 142 - .../src/actions/chunk-by-section.ts | 125 - .../nvisy-plugin-core/src/actions/chunk.ts | 87 - .../nvisy-plugin-core/src/actions/index.ts | 8 - .../src/actions/partition-by-auto.test.ts | 24 - .../src/actions/partition-by-auto.ts | 21 - .../src/actions/partition-by-rule.test.ts | 113 - .../src/actions/partition-by-rule.ts | 80 - 
.../src/actions/partition.ts | 87 - packages/nvisy-plugin-core/src/index.ts | 42 - .../nvisy-plugin-core/src/loaders/csv.test.ts | 169 - packages/nvisy-plugin-core/src/loaders/csv.ts | 78 - .../nvisy-plugin-core/src/loaders/index.ts | 12 - .../src/loaders/json.test.ts | 168 - .../nvisy-plugin-core/src/loaders/json.ts | 82 - .../src/loaders/plaintext.test.ts | 103 - .../src/loaders/plaintext.ts | 47 - .../src/splitter/delimiter.test.ts | 90 - .../src/splitter/delimiter.ts | 31 - .../nvisy-plugin-core/src/splitter/index.ts | 4 - .../src/splitter/regex.test.ts | 90 - .../nvisy-plugin-core/src/splitter/regex.ts | 59 - packages/nvisy-plugin-core/tsconfig.json | 13 - packages/nvisy-plugin-core/tsup.config.ts | 22 - packages/nvisy-plugin-nosql/package.json | 30 - packages/nvisy-plugin-nosql/src/index.ts | 13 - packages/nvisy-plugin-nosql/tsconfig.json | 13 - packages/nvisy-plugin-nosql/tsup.config.ts | 22 - packages/nvisy-plugin-object/package.json | 33 - packages/nvisy-plugin-object/src/index.ts | 28 - .../src/providers/azure.ts | 130 - .../src/providers/client.ts | 83 - .../nvisy-plugin-object/src/providers/gcs.ts | 86 - .../src/providers/index.ts | 9 - .../nvisy-plugin-object/src/providers/s3.ts | 115 - .../nvisy-plugin-object/src/streams/index.ts | 2 - .../nvisy-plugin-object/src/streams/read.ts | 95 - .../nvisy-plugin-object/src/streams/write.ts | 40 - packages/nvisy-plugin-object/tsconfig.json | 13 - packages/nvisy-plugin-object/tsup.config.ts | 22 - packages/nvisy-plugin-pandoc/package.json | 30 - packages/nvisy-plugin-pandoc/src/index.ts | 12 - packages/nvisy-plugin-pandoc/tsconfig.json | 13 - packages/nvisy-plugin-pandoc/tsup.config.ts | 22 - packages/nvisy-plugin-queue/package.json | 30 - packages/nvisy-plugin-queue/src/index.ts | 13 - packages/nvisy-plugin-queue/tsconfig.json | 13 - packages/nvisy-plugin-queue/tsup.config.ts | 22 - packages/nvisy-plugin-sql/README.md | 42 - packages/nvisy-plugin-sql/package.json | 38 - .../nvisy-plugin-sql/src/actions/coerce.ts | 63 - .../nvisy-plugin-sql/src/actions/filter.ts | 111 - .../nvisy-plugin-sql/src/actions/index.ts | 4 - .../nvisy-plugin-sql/src/actions/project.ts | 52 - .../nvisy-plugin-sql/src/actions/rename.ts | 37 - .../nvisy-plugin-sql/src/datatypes/index.ts | 1 - .../nvisy-plugin-sql/src/datatypes/row.ts | 35 - packages/nvisy-plugin-sql/src/index.ts | 32 - .../nvisy-plugin-sql/src/providers/client.ts | 140 - .../nvisy-plugin-sql/src/providers/index.ts | 6 - .../nvisy-plugin-sql/src/providers/mssql.ts | 41 - .../nvisy-plugin-sql/src/providers/mysql.ts | 20 - .../src/providers/postgres.ts | 20 - .../nvisy-plugin-sql/src/providers/schemas.ts | 20 - .../nvisy-plugin-sql/src/streams/index.ts | 3 - packages/nvisy-plugin-sql/src/streams/read.ts | 100 - .../nvisy-plugin-sql/src/streams/schemas.ts | 36 - .../nvisy-plugin-sql/src/streams/write.ts | 34 - packages/nvisy-plugin-sql/tsconfig.json | 13 - packages/nvisy-plugin-sql/tsup.config.ts | 22 - packages/nvisy-plugin-tesseract/package.json | 30 - packages/nvisy-plugin-tesseract/src/index.ts | 13 - packages/nvisy-plugin-tesseract/tsconfig.json | 13 - .../nvisy-plugin-tesseract/tsup.config.ts | 22 - packages/nvisy-plugin-vector/README.md | 42 - packages/nvisy-plugin-vector/package.json | 36 - packages/nvisy-plugin-vector/src/index.ts | 33 - .../src/providers/client.ts | 73 - .../src/providers/index.ts | 11 - .../src/providers/milvus.ts | 75 - .../src/providers/pgvector.ts | 73 - .../src/providers/pinecone.ts | 57 - .../src/providers/qdrant.ts | 71 - .../src/providers/weaviate.ts | 77 - 
.../nvisy-plugin-vector/src/streams/index.ts | 1 - .../nvisy-plugin-vector/src/streams/upsert.ts | 44 - packages/nvisy-plugin-vector/tsconfig.json | 13 - packages/nvisy-plugin-vector/tsup.config.ts | 22 - packages/nvisy-runtime/README.md | 69 - packages/nvisy-runtime/package.json | 36 - packages/nvisy-runtime/src/compiler/index.ts | 26 - packages/nvisy-runtime/src/compiler/parse.ts | 61 - packages/nvisy-runtime/src/compiler/plan.ts | 196 - packages/nvisy-runtime/src/engine/bridge.ts | 98 - .../nvisy-runtime/src/engine/connections.ts | 136 - packages/nvisy-runtime/src/engine/context.ts | 169 - packages/nvisy-runtime/src/engine/engine.ts | 243 - packages/nvisy-runtime/src/engine/executor.ts | 202 - packages/nvisy-runtime/src/engine/index.ts | 20 - packages/nvisy-runtime/src/engine/nodes.ts | 325 - packages/nvisy-runtime/src/engine/policies.ts | 150 - packages/nvisy-runtime/src/engine/runs.ts | 305 - packages/nvisy-runtime/src/index.ts | 37 - packages/nvisy-runtime/src/registry.ts | 236 - packages/nvisy-runtime/src/schema.ts | 140 - packages/nvisy-runtime/test/compile.test.ts | 126 - packages/nvisy-runtime/test/engine.test.ts | 493 -- packages/nvisy-runtime/test/fixtures.ts | 236 - packages/nvisy-runtime/test/parse.test.ts | 175 - packages/nvisy-runtime/test/plan.test.ts | 179 - packages/nvisy-runtime/test/registry.test.ts | 84 - packages/nvisy-runtime/tsconfig.json | 16 - packages/nvisy-runtime/tsup.config.ts | 22 - packages/nvisy-server/README.md | 43 - packages/nvisy-server/package.json | 43 - packages/nvisy-server/src/app.ts | 74 - packages/nvisy-server/src/config.ts | 89 - .../nvisy-server/src/handler/graphs-routes.ts | 106 - .../nvisy-server/src/handler/graphs-schema.ts | 90 - packages/nvisy-server/src/handler/graphs.ts | 146 - .../nvisy-server/src/handler/health-routes.ts | 32 - .../nvisy-server/src/handler/health-schema.ts | 9 - packages/nvisy-server/src/handler/health.ts | 25 - packages/nvisy-server/src/handler/index.ts | 23 - packages/nvisy-server/src/handler/openapi.ts | 45 - packages/nvisy-server/src/main.ts | 36 - .../src/middleware/error-handler.ts | 96 - .../src/middleware/hono-context.ts | 23 - packages/nvisy-server/src/middleware/index.ts | 54 - .../src/middleware/request-logger.ts | 36 - .../src/service/engine-factory.ts | 43 - packages/nvisy-server/src/service/index.ts | 1 - packages/nvisy-server/tsconfig.json | 19 - packages/nvisy-server/tsup.config.ts | 22 - pyproject.toml | 7 + rust-toolchain.toml | 3 + tsconfig.json | 36 - vitest.config.ts | 48 - 310 files changed, 8742 insertions(+), 22682 deletions(-) create mode 100644 .cargo/config.toml create mode 100644 Cargo.lock create mode 100644 Cargo.toml delete mode 100644 biome.json create mode 100644 crates/nvisy-core/Cargo.toml create mode 100644 crates/nvisy-core/src/data.rs create mode 100644 crates/nvisy-core/src/datatypes/audit.rs create mode 100644 crates/nvisy-core/src/datatypes/blob.rs create mode 100644 crates/nvisy-core/src/datatypes/document.rs create mode 100644 crates/nvisy-core/src/datatypes/entity.rs create mode 100644 crates/nvisy-core/src/datatypes/image.rs create mode 100644 crates/nvisy-core/src/datatypes/mod.rs create mode 100644 crates/nvisy-core/src/datatypes/policy.rs create mode 100644 crates/nvisy-core/src/datatypes/redaction.rs create mode 100644 crates/nvisy-core/src/documents/elements.rs create mode 100644 crates/nvisy-core/src/documents/mod.rs create mode 100644 crates/nvisy-core/src/documents/ontology.rs create mode 100644 crates/nvisy-core/src/errors/mod.rs create mode 100644 
crates/nvisy-core/src/lib.rs create mode 100644 crates/nvisy-core/src/plugin.rs create mode 100644 crates/nvisy-core/src/registry.rs create mode 100644 crates/nvisy-core/src/traits/action.rs create mode 100644 crates/nvisy-core/src/traits/loader.rs create mode 100644 crates/nvisy-core/src/traits/mod.rs create mode 100644 crates/nvisy-core/src/traits/provider.rs create mode 100644 crates/nvisy-core/src/traits/stream.rs create mode 100644 crates/nvisy-core/src/types.rs create mode 100644 crates/nvisy-detect/Cargo.toml create mode 100644 crates/nvisy-detect/src/actions/apply_redaction.rs create mode 100644 crates/nvisy-detect/src/actions/classify.rs create mode 100644 crates/nvisy-detect/src/actions/detect_checksum.rs create mode 100644 crates/nvisy-detect/src/actions/detect_regex.rs create mode 100644 crates/nvisy-detect/src/actions/emit_audit.rs create mode 100644 crates/nvisy-detect/src/actions/evaluate_policy.rs create mode 100644 crates/nvisy-detect/src/actions/mod.rs create mode 100644 crates/nvisy-detect/src/lib.rs create mode 100644 crates/nvisy-detect/src/loaders/csv_loader.rs create mode 100644 crates/nvisy-detect/src/loaders/json_loader.rs create mode 100644 crates/nvisy-detect/src/loaders/mod.rs create mode 100644 crates/nvisy-detect/src/loaders/plaintext.rs create mode 100644 crates/nvisy-detect/src/patterns/api_key.rs create mode 100644 crates/nvisy-detect/src/patterns/credit_card.rs create mode 100644 crates/nvisy-detect/src/patterns/email.rs create mode 100644 crates/nvisy-detect/src/patterns/ip_address.rs create mode 100644 crates/nvisy-detect/src/patterns/mod.rs create mode 100644 crates/nvisy-detect/src/patterns/phone.rs create mode 100644 crates/nvisy-detect/src/patterns/ssn.rs create mode 100644 crates/nvisy-engine/Cargo.toml create mode 100644 crates/nvisy-engine/src/compiler/mod.rs create mode 100644 crates/nvisy-engine/src/compiler/parse.rs create mode 100644 crates/nvisy-engine/src/compiler/plan.rs create mode 100644 crates/nvisy-engine/src/connections.rs create mode 100644 crates/nvisy-engine/src/executor/context.rs create mode 100644 crates/nvisy-engine/src/executor/mod.rs create mode 100644 crates/nvisy-engine/src/executor/nodes.rs create mode 100644 crates/nvisy-engine/src/executor/runner.rs create mode 100644 crates/nvisy-engine/src/lib.rs create mode 100644 crates/nvisy-engine/src/policies.rs create mode 100644 crates/nvisy-engine/src/runs.rs create mode 100644 crates/nvisy-engine/src/schema.rs create mode 100644 crates/nvisy-object/Cargo.toml create mode 100644 crates/nvisy-object/src/client.rs create mode 100644 crates/nvisy-object/src/lib.rs create mode 100644 crates/nvisy-object/src/providers/mod.rs create mode 100644 crates/nvisy-object/src/providers/s3.rs create mode 100644 crates/nvisy-object/src/streams/mod.rs create mode 100644 crates/nvisy-object/src/streams/read.rs create mode 100644 crates/nvisy-object/src/streams/write.rs create mode 100644 crates/nvisy-python/Cargo.toml create mode 100644 crates/nvisy-python/src/actions.rs create mode 100644 crates/nvisy-python/src/bridge.rs create mode 100644 crates/nvisy-python/src/error.rs create mode 100644 crates/nvisy-python/src/lib.rs create mode 100644 crates/nvisy-python/src/ner.rs create mode 100644 crates/nvisy-python/src/provider.rs create mode 100644 crates/nvisy-server/Cargo.toml create mode 100644 crates/nvisy-server/src/app.rs create mode 100644 crates/nvisy-server/src/config.rs create mode 100644 crates/nvisy-server/src/main.rs create mode 100644 crates/nvisy-server/src/middleware/mod.rs create 
mode 100644 crates/nvisy-server/src/routes/audit.rs create mode 100644 crates/nvisy-server/src/routes/graphs.rs create mode 100644 crates/nvisy-server/src/routes/health.rs create mode 100644 crates/nvisy-server/src/routes/mod.rs create mode 100644 crates/nvisy-server/src/routes/policies.rs create mode 100644 crates/nvisy-server/src/routes/redact.rs create mode 100644 crates/nvisy-server/src/schemas/mod.rs create mode 100644 crates/nvisy-server/src/service/audit_store.rs create mode 100644 crates/nvisy-server/src/service/engine_factory.rs create mode 100644 crates/nvisy-server/src/service/mod.rs create mode 100644 crates/nvisy-server/src/service/policy_store.rs create mode 100644 crates/nvisy-server/src/state.rs delete mode 100644 package-lock.json delete mode 100644 package.json delete mode 100644 packages/README.md create mode 100644 packages/nvisy-ai/pyproject.toml create mode 100644 packages/nvisy-ai/src/nvisy_ai/__init__.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/ner.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/prompts.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/providers/__init__.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/providers/anthropic.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/providers/base.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/providers/gemini.py create mode 100644 packages/nvisy-ai/src/nvisy_ai/providers/openai.py delete mode 100644 packages/nvisy-core/README.md delete mode 100644 packages/nvisy-core/package.json delete mode 100644 packages/nvisy-core/src/action.ts delete mode 100644 packages/nvisy-core/src/datatypes/blob.test.ts delete mode 100644 packages/nvisy-core/src/datatypes/blob.ts delete mode 100644 packages/nvisy-core/src/datatypes/chunk.test.ts delete mode 100644 packages/nvisy-core/src/datatypes/chunk.ts delete mode 100644 packages/nvisy-core/src/datatypes/data.test.ts delete mode 100644 packages/nvisy-core/src/datatypes/data.ts delete mode 100644 packages/nvisy-core/src/datatypes/document.test.ts delete mode 100644 packages/nvisy-core/src/datatypes/document.ts delete mode 100644 packages/nvisy-core/src/datatypes/embedding.test.ts delete mode 100644 packages/nvisy-core/src/datatypes/embedding.ts delete mode 100644 packages/nvisy-core/src/datatypes/index.ts delete mode 100644 packages/nvisy-core/src/documents/coordinates.test.ts delete mode 100644 packages/nvisy-core/src/documents/coordinates.ts delete mode 100644 packages/nvisy-core/src/documents/elements.test.ts delete mode 100644 packages/nvisy-core/src/documents/elements.ts delete mode 100644 packages/nvisy-core/src/documents/index.ts delete mode 100644 packages/nvisy-core/src/documents/ontology.test.ts delete mode 100644 packages/nvisy-core/src/documents/ontology.ts delete mode 100644 packages/nvisy-core/src/errors/cancellation.ts delete mode 100644 packages/nvisy-core/src/errors/connection.ts delete mode 100644 packages/nvisy-core/src/errors/index.ts delete mode 100644 packages/nvisy-core/src/errors/runtime.ts delete mode 100644 packages/nvisy-core/src/errors/timeout.ts delete mode 100644 packages/nvisy-core/src/errors/validation.ts delete mode 100644 packages/nvisy-core/src/index.ts delete mode 100644 packages/nvisy-core/src/loader.ts delete mode 100644 packages/nvisy-core/src/plugin.ts delete mode 100644 packages/nvisy-core/src/provider.ts delete mode 100644 packages/nvisy-core/src/stream.ts delete mode 100644 packages/nvisy-core/src/types.ts delete mode 100644 packages/nvisy-core/test/action.fixtures.ts delete mode 100644 
packages/nvisy-core/test/action.test.ts delete mode 100644 packages/nvisy-core/test/provider.fixtures.ts delete mode 100644 packages/nvisy-core/test/provider.test.ts delete mode 100644 packages/nvisy-core/tsconfig.json delete mode 100644 packages/nvisy-core/tsup.config.ts delete mode 100644 packages/nvisy-plugin-ai/README.md delete mode 100644 packages/nvisy-plugin-ai/package.json delete mode 100644 packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/enrich.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/generate-embedding.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/index.ts delete mode 100644 packages/nvisy-plugin-ai/src/actions/partition-contextual.ts delete mode 100644 packages/nvisy-plugin-ai/src/datatypes/index.ts delete mode 100644 packages/nvisy-plugin-ai/src/index.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/anthropic.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/client.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/gemini.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/index.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/openai.ts delete mode 100644 packages/nvisy-plugin-ai/src/providers/schemas.ts delete mode 100644 packages/nvisy-plugin-ai/tsconfig.json delete mode 100644 packages/nvisy-plugin-ai/tsup.config.ts delete mode 100644 packages/nvisy-plugin-core/README.md delete mode 100644 packages/nvisy-plugin-core/package.json delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-character.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-page.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk-by-section.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/chunk.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/index.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/partition-by-auto.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/partition-by-rule.ts delete mode 100644 packages/nvisy-plugin-core/src/actions/partition.ts delete mode 100644 packages/nvisy-plugin-core/src/index.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/csv.test.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/csv.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/index.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/json.test.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/json.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/plaintext.test.ts delete mode 100644 packages/nvisy-plugin-core/src/loaders/plaintext.ts delete mode 100644 packages/nvisy-plugin-core/src/splitter/delimiter.test.ts delete mode 
100644 packages/nvisy-plugin-core/src/splitter/delimiter.ts delete mode 100644 packages/nvisy-plugin-core/src/splitter/index.ts delete mode 100644 packages/nvisy-plugin-core/src/splitter/regex.test.ts delete mode 100644 packages/nvisy-plugin-core/src/splitter/regex.ts delete mode 100644 packages/nvisy-plugin-core/tsconfig.json delete mode 100644 packages/nvisy-plugin-core/tsup.config.ts delete mode 100644 packages/nvisy-plugin-nosql/package.json delete mode 100644 packages/nvisy-plugin-nosql/src/index.ts delete mode 100644 packages/nvisy-plugin-nosql/tsconfig.json delete mode 100644 packages/nvisy-plugin-nosql/tsup.config.ts delete mode 100644 packages/nvisy-plugin-object/package.json delete mode 100644 packages/nvisy-plugin-object/src/index.ts delete mode 100644 packages/nvisy-plugin-object/src/providers/azure.ts delete mode 100644 packages/nvisy-plugin-object/src/providers/client.ts delete mode 100644 packages/nvisy-plugin-object/src/providers/gcs.ts delete mode 100644 packages/nvisy-plugin-object/src/providers/index.ts delete mode 100644 packages/nvisy-plugin-object/src/providers/s3.ts delete mode 100644 packages/nvisy-plugin-object/src/streams/index.ts delete mode 100644 packages/nvisy-plugin-object/src/streams/read.ts delete mode 100644 packages/nvisy-plugin-object/src/streams/write.ts delete mode 100644 packages/nvisy-plugin-object/tsconfig.json delete mode 100644 packages/nvisy-plugin-object/tsup.config.ts delete mode 100644 packages/nvisy-plugin-pandoc/package.json delete mode 100644 packages/nvisy-plugin-pandoc/src/index.ts delete mode 100644 packages/nvisy-plugin-pandoc/tsconfig.json delete mode 100644 packages/nvisy-plugin-pandoc/tsup.config.ts delete mode 100644 packages/nvisy-plugin-queue/package.json delete mode 100644 packages/nvisy-plugin-queue/src/index.ts delete mode 100644 packages/nvisy-plugin-queue/tsconfig.json delete mode 100644 packages/nvisy-plugin-queue/tsup.config.ts delete mode 100644 packages/nvisy-plugin-sql/README.md delete mode 100644 packages/nvisy-plugin-sql/package.json delete mode 100644 packages/nvisy-plugin-sql/src/actions/coerce.ts delete mode 100644 packages/nvisy-plugin-sql/src/actions/filter.ts delete mode 100644 packages/nvisy-plugin-sql/src/actions/index.ts delete mode 100644 packages/nvisy-plugin-sql/src/actions/project.ts delete mode 100644 packages/nvisy-plugin-sql/src/actions/rename.ts delete mode 100644 packages/nvisy-plugin-sql/src/datatypes/index.ts delete mode 100644 packages/nvisy-plugin-sql/src/datatypes/row.ts delete mode 100644 packages/nvisy-plugin-sql/src/index.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/client.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/index.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/mssql.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/mysql.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/postgres.ts delete mode 100644 packages/nvisy-plugin-sql/src/providers/schemas.ts delete mode 100644 packages/nvisy-plugin-sql/src/streams/index.ts delete mode 100644 packages/nvisy-plugin-sql/src/streams/read.ts delete mode 100644 packages/nvisy-plugin-sql/src/streams/schemas.ts delete mode 100644 packages/nvisy-plugin-sql/src/streams/write.ts delete mode 100644 packages/nvisy-plugin-sql/tsconfig.json delete mode 100644 packages/nvisy-plugin-sql/tsup.config.ts delete mode 100644 packages/nvisy-plugin-tesseract/package.json delete mode 100644 packages/nvisy-plugin-tesseract/src/index.ts delete mode 100644 
packages/nvisy-plugin-tesseract/tsconfig.json delete mode 100644 packages/nvisy-plugin-tesseract/tsup.config.ts delete mode 100644 packages/nvisy-plugin-vector/README.md delete mode 100644 packages/nvisy-plugin-vector/package.json delete mode 100644 packages/nvisy-plugin-vector/src/index.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/client.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/index.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/milvus.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/pgvector.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/pinecone.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/qdrant.ts delete mode 100644 packages/nvisy-plugin-vector/src/providers/weaviate.ts delete mode 100644 packages/nvisy-plugin-vector/src/streams/index.ts delete mode 100644 packages/nvisy-plugin-vector/src/streams/upsert.ts delete mode 100644 packages/nvisy-plugin-vector/tsconfig.json delete mode 100644 packages/nvisy-plugin-vector/tsup.config.ts delete mode 100644 packages/nvisy-runtime/README.md delete mode 100644 packages/nvisy-runtime/package.json delete mode 100644 packages/nvisy-runtime/src/compiler/index.ts delete mode 100644 packages/nvisy-runtime/src/compiler/parse.ts delete mode 100644 packages/nvisy-runtime/src/compiler/plan.ts delete mode 100644 packages/nvisy-runtime/src/engine/bridge.ts delete mode 100644 packages/nvisy-runtime/src/engine/connections.ts delete mode 100644 packages/nvisy-runtime/src/engine/context.ts delete mode 100644 packages/nvisy-runtime/src/engine/engine.ts delete mode 100644 packages/nvisy-runtime/src/engine/executor.ts delete mode 100644 packages/nvisy-runtime/src/engine/index.ts delete mode 100644 packages/nvisy-runtime/src/engine/nodes.ts delete mode 100644 packages/nvisy-runtime/src/engine/policies.ts delete mode 100644 packages/nvisy-runtime/src/engine/runs.ts delete mode 100644 packages/nvisy-runtime/src/index.ts delete mode 100644 packages/nvisy-runtime/src/registry.ts delete mode 100644 packages/nvisy-runtime/src/schema.ts delete mode 100644 packages/nvisy-runtime/test/compile.test.ts delete mode 100644 packages/nvisy-runtime/test/engine.test.ts delete mode 100644 packages/nvisy-runtime/test/fixtures.ts delete mode 100644 packages/nvisy-runtime/test/parse.test.ts delete mode 100644 packages/nvisy-runtime/test/plan.test.ts delete mode 100644 packages/nvisy-runtime/test/registry.test.ts delete mode 100644 packages/nvisy-runtime/tsconfig.json delete mode 100644 packages/nvisy-runtime/tsup.config.ts delete mode 100644 packages/nvisy-server/README.md delete mode 100644 packages/nvisy-server/package.json delete mode 100644 packages/nvisy-server/src/app.ts delete mode 100644 packages/nvisy-server/src/config.ts delete mode 100644 packages/nvisy-server/src/handler/graphs-routes.ts delete mode 100644 packages/nvisy-server/src/handler/graphs-schema.ts delete mode 100644 packages/nvisy-server/src/handler/graphs.ts delete mode 100644 packages/nvisy-server/src/handler/health-routes.ts delete mode 100644 packages/nvisy-server/src/handler/health-schema.ts delete mode 100644 packages/nvisy-server/src/handler/health.ts delete mode 100644 packages/nvisy-server/src/handler/index.ts delete mode 100644 packages/nvisy-server/src/handler/openapi.ts delete mode 100644 packages/nvisy-server/src/main.ts delete mode 100644 packages/nvisy-server/src/middleware/error-handler.ts delete mode 100644 packages/nvisy-server/src/middleware/hono-context.ts delete mode 100644 
packages/nvisy-server/src/middleware/index.ts
 delete mode 100644 packages/nvisy-server/src/middleware/request-logger.ts
 delete mode 100644 packages/nvisy-server/src/service/engine-factory.ts
 delete mode 100644 packages/nvisy-server/src/service/index.ts
 delete mode 100644 packages/nvisy-server/tsconfig.json
 delete mode 100644 packages/nvisy-server/tsup.config.ts
 create mode 100644 pyproject.toml
 create mode 100644 rust-toolchain.toml
 delete mode 100644 tsconfig.json
 delete mode 100644 vitest.config.ts

diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..af95132
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,5 @@
+[target.x86_64-apple-darwin]
+rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]
+
+[target.aarch64-apple-darwin]
+rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]
diff --git a/.gitignore b/.gitignore
index 3705388..9c1daf2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,50 +2,57 @@
 Thumbs.db
 .DS_Store
 
-# Editors
+# IDE and Editors
 .vs/
 .vscode/
 .idea/
 .zed/
 
-# Node & JavaScript
-node_modules/
-.vite/
-.astro/
-.output/
-.nuxt/
-.nitro/
-.cache/
-
-# Testing
-playwright-report/
-test-results/
+# Rust
+debug/
+target/
+**/*.rs.bk
+*.pdb
+
+# Python
+__pycache__/
+*.py[cod]
+.venv/
+*.egg-info/
+.ruff_cache/
+.pytest_cache/
+
+# Generated files
+*.pem
+encryption.key
+*.backup
 coverage/
-.lighthouse/
+*.lcov
+
+# Intermediate output
+.diesel_lock
+crates/nvisy-postgres/src/migrations/
+crates/nvisy-postgres/src/schema.rs.bak
 
-# Build Output
+# Build output
 dist/
 build/
-output/
 
-# Environment
+# Environment files
 .env*
 !.env.example
 
 # Logs
+logs/
 *.log
 *.log*
 
-# Generated
-*.pem
+# Backup and temporary files
+*.bak
 *.backup
-*.tsbuildinfo
-
-# Generated config output files
-*.config.d.ts
-*.config.d.ts.map
-*.config.js
-*.config.js.map
+*.tmp
+tmp/
+temp/
 
 # Other
 .ignore*/
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..b2342d5
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,3402 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4 + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +dependencies = [ + "backtrace", +] + +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-config" +version = "1.8.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c456581cb3c77fafcc8c67204a70680d40b61112d6da78c77bd31d945b65f1b5" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cd362783681b15d136480ad555a099e82ecd8e2d10a841e14dfd0078d67fee3" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-lc-rs" +version = "1.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b7b6141e96a8c160799cc2d5adecd5cbbe5054cb8c7c4af53da0f83bb7ad256" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c34dda4df7017c8db52132f0f8a2e0f8161649d15723ed63fc00c82d0f2081a" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "aws-runtime" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c635c2dc792cb4a11ce1a4f392a925340d1bdf499289b5ec1ec6810954eb43f5" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.122.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94c2ca0cba97e8e279eb6c0b2d0aa10db5959000e602ab2b7c02de6b85d4c19b" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "http-body 1.0.1", + "lru", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.93.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dcb38bb33fc0a11f1ffc3e3e85669e0a11a37690b86f77e75306d8f369146a0" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ada8ffbea7bd1be1f53df1dadb0f8fdb04badb13185b3321b929d1ee3caad09" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.97.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6443ccadc777095d5ed13e21f5c364878c9f5bad4e35187a6cdbd863b0afcad" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efa49f3c607b92daae0c078d48a4571f599f966dce3caee5f1ea55c4d9073f99" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "crypto-bigint 0.5.5", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "p256", + 
"percent-encoding", + "ring", + "sha2", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52eec3db979d18cb807fc1070961cc51d87d069abe9ab57917769687368a8c6c" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.64.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddcf418858f9f3edd228acb8759d77394fed7531cce78d02bdda499025368439" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc-fast", + "hex", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35b9c7354a3b13c66f60fe4616d6d1969c9fd36b1b5333a5dfb3ee716b33c588" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630e67f2a31094ffa51b210ae030855cb8f3b7ee1329bdd8d085aaf61e8b97fc" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12fb0abf49ff0cab20fd31ac1215ed7ce0ea92286ba09e2854b42ba5cabe7525" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2 0.3.27", + "h2 0.4.13", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper 1.8.1", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", + "hyper-util", + "pin-project-lite", + "rustls 0.21.12", + "rustls 0.23.36", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb96aa208d62ee94104645f7b2ecaf77bf27edf161590b6224bfbac2832f979" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0a46543fbc94621080b3cf553eb4cbbdc41dd9780a30c4756400f0139440a1d" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cebbddb6f3a5bd81553643e9c7daf3cc3dc5b0b5f398ac668630e8a84e6fff0" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3df87c14f0127a0d77eb261c3bc45d5b4833e2a1f63583ebfb728e4852134ee" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + 
"pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49952c52f7eebb72ce2a754d3866cc0f87b97d2a46146b79f80f3a93fb2b3716" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3a26048eeab0ddeba4b4f9d51654c79af8c3b32357dc5f336cee85ab331c33" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11b2f670422ff42bf7065031e72b45bc52a3508bd089f743ea90731ca2b6ea57" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d980627d2dd7bfc32a3c025685a033eeab8d365cc840c631ef59d1b8f428164" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "itoa", + "matchit 0.7.3", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +dependencies = [ + "axum-core 0.5.6", + "axum-macros", + "bytes", + "form_urlencoded", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.8.1", + "hyper-util", + "itoa", + "matchit 0.8.4", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + 
"tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +dependencies = [ + "serde", +] + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + +[[package]] +name = "cc" +version = "1.2.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfb" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38f2da7a0a2c4ccf0065be06397cc26a81f4e528be095826eee9d4adbb8c60f" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chrono" +version = "0.4.43" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc-fast" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" +dependencies = [ + "crc", + "digest", + "rustversion", + "spin", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + 
"zeroize", +] + +[[package]] +name = "deranged" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der", + "elliptic-curve", + "rfc6979", + "signature", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der", + "digest", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "pin-utils", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + 
"bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.4.0", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "hdrhistogram" +version = "7.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" +dependencies = [ + "byteorder", + "num-traits", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2 0.4.13", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "tokio", + "tokio-rustls 0.24.1", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper 1.8.1", + "hyper-util", + "rustls 0.23.36", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper 1.8.1", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2 0.6.2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = 
"icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown", + "serde", + "serde_core", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "infer" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847" +dependencies = [ + "cfb", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.181" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-conv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "nvisy-core" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bytes", + "chrono", + "infer", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-detect" +version = "0.1.0" +dependencies = [ + "async-trait", + "nvisy-core", + "regex", + "serde", + "serde_json", + "tokio", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-engine" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "nvisy-core", + "petgraph", + "rand", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-object" +version = "0.1.0" +dependencies = [ + "async-trait", + "aws-config", + "aws-sdk-s3", + "bytes", + "nvisy-core", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-python" +version = "0.1.0" +dependencies = [ + "async-trait", + "nvisy-core", + "pyo3", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-server" +version = "0.1.0" +dependencies = [ + "anyhow", + "axum 0.8.8", + "chrono", + "nvisy-core", + "nvisy-detect", + "nvisy-engine", + "nvisy-object", + "nvisy-python", + "serde", + "serde_json", + "thiserror", + "tokio", + "tower", + "tower-http", + "tracing", + "tracing-subscriber", + "utoipa", + "utoipa-swagger-ui", + "uuid", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa", + "elliptic-curve", + "sha2", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" 
+version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "serde", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" + +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rust-embed" +version = "8.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27" +dependencies = [ + "rust-embed-impl", + "rust-embed-utils", + "walkdir", +] + +[[package]] +name = "rust-embed-impl" +version = "8.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa" +dependencies = [ + "proc-macro2", + "quote", + "rust-embed-utils", + "syn", + "walkdir", +] + +[[package]] +name = "rust-embed-utils" +version = "8.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1" +dependencies = [ + "sha2", + "walkdir", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki 0.101.7", + "sct", +] + +[[package]] +name = "rustls" +version = "0.23.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "rustls-pki-types", + "rustls-webpki 0.103.9", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "3.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + 
"libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + 
+[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" + +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +dependencies = [ + "bytes", + "libc", + "mio", + "pin-project-lite", + "signal-hook-registry", + "socket2 0.6.2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.12", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls 0.23.36", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "hdrhistogram", + "indexmap", + "pin-project-lite", + "slab", + "sync_wrapper", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags", + "bytes", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", + "uuid", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = 
"tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utoipa" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993" +dependencies = [ + "indexmap", + "serde", + "serde_json", + "utoipa-gen", +] + +[[package]] +name = "utoipa-gen" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d79d08d92ab8af4c5e8a6da20c47ae3f61a0f1dabc1997cdf2d082b757ca08b" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn", +] + +[[package]] +name = "utoipa-swagger-ui" +version = "8.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4b5ac679cc6dfc5ea3f2823b0291c777750ffd5e13b21137e0f7ac0e8f9617" +dependencies = [ + "axum 0.7.9", + "base64", + "mime_guess", + "regex", + "rust-embed", + "serde", + "serde_json", + "url", + "utoipa", + "zip", +] + +[[package]] +name = "uuid" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f" +dependencies = [ + "getrandom 0.3.4", + "js-sys", + "serde_core", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +dependencies = [ + "quote", + 
"wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", 
+ "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = 
"windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + 
+[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap", + "memchr", + "thiserror", + "zopfli", +] + +[[package]] +name = "zmij" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" + +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..c5191cb --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,89 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[workspace] +resolver = "2" +members = [ + "./crates/nvisy-core", + "./crates/nvisy-detect", + "./crates/nvisy-engine", + "./crates/nvisy-object", + "./crates/nvisy-python", + "./crates/nvisy-server", +] + +[workspace.package] +version = "0.1.0" +rust-version = "1.85" +edition = "2024" +license = "MIT" +publish = false + +authors = ["nvisy <contact@nvisy.com>"] +repository = "https://github.com/nvisycom/runtime" +homepage = "https://github.com/nvisycom/runtime" +documentation = "https://docs.rs/nvisy-runtime" + +[workspace.dependencies] +# Default features are disabled for certain dependencies to allow +# downstream workspaces/crates to selectively enable them as needed. +# +# See for more details: https://github.com/rust-lang/cargo/issues/11329 + +# Internal crates +nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } +nvisy-detect = { path = "./crates/nvisy-detect", version = "0.1.0" } +nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } +nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } +nvisy-python = { path = "./crates/nvisy-python", version = "0.1.0" } +nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } + +# Async runtime +tokio = { version = "1", features = [] } +tokio-util = { version = "0.7", features = [] } +futures = { version = "0.3", features = [] } +async-trait = { version = "0.1", features = [] } + +# HTTP server +axum = { version = "0.8", features = [] } +tower = { version = "0.5", features = [] } +tower-http = { version = "0.6", features = [] } + +# OpenAPI / Documentation +utoipa = { version = "5", features = [] } +utoipa-swagger-ui = { version = "8", features = [] } + +# Observability +tracing = { version = "0.1", features = [] } +tracing-subscriber = { version = "0.3", features = [] } + +# (De)serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = { version = "1.0", features = [] } + +# Error handling +thiserror = { version = "2.0", features = [] } +anyhow = { version = "1.0", features = [] } + +# Primitive datatypes +uuid = { version = "1", features = ["serde", "v4"] } +chrono = { version = "0.4", features = ["serde"] } +bytes = { version = "1", features = ["serde"] } + +# Text processing +regex = { version = "1.0", features = [] } + +# Graph data structures +petgraph = { version = "0.7", features = [] } + +# File type detection +infer = { version = "0.16", features = [] } + +# Python interop +pyo3 = { version = "0.23", features = [] } + +# AWS SDK +aws-sdk-s3 = { version = "1", features = [] } 
+aws-config = { version = "1", features = [] } + +# Randomness +rand = { version = "0.9", features = [] } diff --git a/biome.json b/biome.json deleted file mode 100644 index b8fb431..0000000 --- a/biome.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "$schema": "https://biomejs.dev/schemas/2.3.14/schema.json", - "vcs": { - "enabled": true, - "clientKind": "git", - "useIgnoreFile": true - }, - "files": { - "ignoreUnknown": false, - "includes": ["**", "!node_modules", "!dist", "!coverage"] - }, - "formatter": { - "enabled": true, - "indentStyle": "tab" - }, - "linter": { - "enabled": true, - "rules": { - "recommended": true, - "style": { - "recommended": true, - "noNonNullAssertion": "off" - }, - "correctness": { - "noUnusedImports": "warn", - "noUnusedPrivateClassMembers": "warn", - "useYield": "warn" - }, - "complexity": { - "noUselessConstructor": "warn" - }, - "suspicious": { - "noEmptyInterface": "off" - } - } - }, - "javascript": { - "formatter": { - "quoteStyle": "double" - } - } -} diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml new file mode 100644 index 0000000..4c0d3c7 --- /dev/null +++ b/crates/nvisy-core/Cargo.toml @@ -0,0 +1,46 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-core" +description = "Domain types, traits, errors, and plugin registry for the Nvisy platform" +keywords = ["nvisy", "core", "domain", "types"] +categories = ["data-structures"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["serde", "v4"] } +chrono = { workspace = true, features = ["serde"] } +bytes = { workspace = true, features = ["serde"] } + +# File type detection +infer = { workspace = true, features = [] } + +# Error handling +thiserror = { workspace = true, features = [] } +anyhow = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-core/src/data.rs b/crates/nvisy-core/src/data.rs new file mode 100644 index 0000000..3a88ae9 --- /dev/null +++ b/crates/nvisy-core/src/data.rs @@ -0,0 +1,84 @@ +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use crate::types::Metadata; +use crate::datatypes::{ + entity::Entity, redaction::Redaction, policy::Policy, audit::Audit, + document::Document, blob::Blob, image::ImageData, +}; + +/// Common fields shared by all domain data items. 
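+/// Every item carries a freshly generated UUID `id`; `parent_id` records lineage
+/// when one item is derived from another (see `derive_from`), and `metadata` holds
+/// optional free-form context. Both optional fields are omitted from serialized
+/// output when unset.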
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataItem { + pub id: Uuid, + #[serde(skip_serializing_if = "Option::is_none")] + pub parent_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Metadata>, +} + +impl DataItem { + pub fn new() -> Self { + Self { + id: Uuid::new_v4(), + parent_id: None, + metadata: None, + } + } + + pub fn with_metadata(mut self, metadata: Metadata) -> Self { + self.metadata = Some(metadata); + self + } + + pub fn derive_from(mut self, parent: &DataItem) -> Self { + self.parent_id = Some(parent.id); + self + } +} + +impl Default for DataItem { + fn default() -> Self { + Self::new() + } +} + +/// Discriminated union of all data types that flow through DAG channels. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "_type", rename_all = "snake_case")] +pub enum DataValue { + Document(Document), + Blob(Blob), + Entity(Entity), + Redaction(Redaction), + Policy(Policy), + Audit(Audit), + Image(ImageData), +} + +impl DataValue { + /// Get the type name of this data value. + pub fn type_name(&self) -> &'static str { + match self { + DataValue::Document(_) => "document", + DataValue::Blob(_) => "blob", + DataValue::Entity(_) => "entity", + DataValue::Redaction(_) => "redaction", + DataValue::Policy(_) => "policy", + DataValue::Audit(_) => "audit", + DataValue::Image(_) => "image", + } + } + + /// Get the underlying DataItem common fields. + pub fn data_item(&self) -> &DataItem { + match self { + DataValue::Document(d) => &d.data, + DataValue::Blob(b) => &b.data, + DataValue::Entity(e) => &e.data, + DataValue::Redaction(r) => &r.data, + DataValue::Policy(p) => &p.data, + DataValue::Audit(a) => &a.data, + DataValue::Image(i) => &i.data, + } + } +} diff --git a/crates/nvisy-core/src/datatypes/audit.rs b/crates/nvisy-core/src/datatypes/audit.rs new file mode 100644 index 0000000..c293603 --- /dev/null +++ b/crates/nvisy-core/src/datatypes/audit.rs @@ -0,0 +1,70 @@ +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use crate::data::DataItem; +use crate::types::{AuditAction, Metadata}; + +/// An immutable audit record tracking a data protection event. 
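+/// Constructed with `Audit::new(action)`, which stamps `timestamp` with `Utc::now()`;
+/// optional identifiers, the acting principal, and extra details are attached through
+/// the `with_*` builder methods.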
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Audit { + #[serde(flatten)] + pub data: DataItem, + pub action: AuditAction, + pub timestamp: DateTime<Utc>, + #[serde(skip_serializing_if = "Option::is_none")] + pub entity_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub redaction_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub run_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub actor: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option<Metadata>, +} + +impl Audit { + pub fn new(action: AuditAction) -> Self { + Self { + data: DataItem::new(), + action, + timestamp: Utc::now(), + entity_id: None, + redaction_id: None, + policy_id: None, + source_id: None, + run_id: None, + actor: None, + details: None, + } + } + + pub fn with_entity_id(mut self, id: Uuid) -> Self { + self.entity_id = Some(id); + self + } + + pub fn with_redaction_id(mut self, id: Uuid) -> Self { + self.redaction_id = Some(id); + self + } + + pub fn with_run_id(mut self, id: Uuid) -> Self { + self.run_id = Some(id); + self + } + + pub fn with_actor(mut self, actor: impl Into<String>) -> Self { + self.actor = Some(actor.into()); + self + } + + pub fn with_details(mut self, details: Metadata) -> Self { + self.details = Some(details); + self + } +} diff --git a/crates/nvisy-core/src/datatypes/blob.rs b/crates/nvisy-core/src/datatypes/blob.rs new file mode 100644 index 0000000..7d62bca --- /dev/null +++ b/crates/nvisy-core/src/datatypes/blob.rs @@ -0,0 +1,84 @@ +use bytes::Bytes; +use serde::{Deserialize, Serialize}; +use crate::data::DataItem; + +/// Content type information for a blob. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct BlobContentInfo { + /// MIME type provided by the caller (e.g. from HTTP Content-Type header). + #[serde(skip_serializing_if = "Option::is_none")] + pub mime: Option<String>, + /// MIME type detected from magic bytes. + #[serde(skip_serializing_if = "Option::is_none")] + pub detected_mime: Option<String>, +} + +/// A binary object from storage (file content + path + content type). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Blob { + #[serde(flatten)] + pub data: DataItem, + pub path: String, + #[serde(with = "bytes_serde")] + pub content: Bytes, + pub provided: BlobContentInfo, +} + +impl Blob { + pub fn new(path: impl Into<String>, content: impl Into<Bytes>) -> Self { + let content = content.into(); + let detected_mime = infer::get(&content).map(|t| t.mime_type().to_string()); + Self { + data: DataItem::new(), + path: path.into(), + content, + provided: BlobContentInfo { + mime: None, + detected_mime, + }, + } + } + + pub fn with_content_type(mut self, mime: impl Into<String>) -> Self { + self.provided.mime = Some(mime.into()); + self + } + + /// Get the best-available MIME type (provided takes precedence over detected). + pub fn content_type(&self) -> Option<&str> { + self.provided + .mime + .as_deref() + .or(self.provided.detected_mime.as_deref()) + } + + /// Get the file extension from the path. 
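+/// Note that `rsplit('.').next()` always yields at least one segment, so a path
+/// without a '.' is returned whole rather than as `None`.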
+ pub fn extension(&self) -> Option<&str> { + self.path.rsplit('.').next() + } +} + +pub(crate) mod bytes_serde { + use bytes::Bytes; + use serde::{self, Deserialize, Deserializer, Serializer}; + + pub fn serialize<S>(bytes: &Bytes, serializer: S) -> Result<S::Ok, S::Error> + where + S: Serializer, + { + use serde::ser::SerializeSeq; + let mut seq = serializer.serialize_seq(Some(bytes.len()))?; + for b in bytes.iter() { + seq.serialize_element(b)?; + } + seq.end() + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result<Bytes, D::Error> + where + D: Deserializer<'de>, + { + let v: Vec<u8> = Vec::deserialize(deserializer)?; + Ok(Bytes::from(v)) + } +} diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-core/src/datatypes/document.rs new file mode 100644 index 0000000..39dac9c --- /dev/null +++ b/crates/nvisy-core/src/datatypes/document.rs @@ -0,0 +1,95 @@ +use serde::{Deserialize, Serialize}; +use crate::data::DataItem; +use crate::documents::elements::Element; + +/// A parsed human-readable text representation of a document. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Document { + #[serde(flatten)] + pub data: DataItem, + pub content: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub elements: Option<Vec<Element>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_format: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_count: Option<u32>, +} + +impl Document { + pub fn new(content: impl Into<String>) -> Self { + Self { + data: DataItem::new(), + content: content.into(), + title: None, + elements: None, + source_format: None, + page_count: None, + } + } + + pub fn with_title(mut self, title: impl Into<String>) -> Self { + self.title = Some(title.into()); + self + } + + pub fn with_elements(mut self, elements: Vec<Element>) -> Self { + self.elements = Some(elements); + self + } + + pub fn with_source_format(mut self, format: impl Into<String>) -> Self { + self.source_format = Some(format.into()); + self + } + + pub fn with_page_count(mut self, count: u32) -> Self { + self.page_count = Some(count); + self + } + + /// Create a Document by deriving content from element texts joined with "\n\n". + pub fn from_elements(elements: Vec<Element>) -> Self { + let content = elements.iter().map(|e| e.text.as_str()).collect::<Vec<_>>().join("\n\n"); + Self { + data: DataItem::new(), + content, + title: None, + elements: Some(elements), + source_format: None, + page_count: None, + } + } + + /// Unique BCP-47 language tags collected from all elements. + pub fn languages(&self) -> Vec<String> { + let mut langs = Vec::new(); + if let Some(elements) = &self.elements { + for el in elements { + if let Some(ref element_langs) = el.languages { + for lang in element_langs { + if !langs.contains(lang) { + langs.push(lang.clone()); + } + } + } + } + } + langs + } + + /// Group elements by their 1-based page number. + /// Elements without a page_number are collected under key 0. 
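+    /// Returns borrowed references into `self.elements`, so the resulting map is
+    /// only valid for as long as the document itself.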
+ pub fn get_elements_by_page(&self) -> std::collections::HashMap<u32, Vec<&Element>> { + let mut map = std::collections::HashMap::new(); + if let Some(elements) = &self.elements { + for el in elements { + let page = el.page_number.unwrap_or(0); + map.entry(page).or_insert_with(Vec::new).push(el); + } + } + map + } +} diff --git a/crates/nvisy-core/src/datatypes/entity.rs b/crates/nvisy-core/src/datatypes/entity.rs new file mode 100644 index 0000000..3b9fec3 --- /dev/null +++ b/crates/nvisy-core/src/datatypes/entity.rs @@ -0,0 +1,68 @@ +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use crate::data::DataItem; +use crate::types::{DetectionMethod, EntityCategory}; + +/// Bounding box for image-based entity locations. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BoundingBox { + pub x: f64, + pub y: f64, + pub width: f64, + pub height: f64, +} + +/// Location of an entity within its source document. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EntityLocation { + pub start_offset: usize, + pub end_offset: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub element_id: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, + #[serde(skip_serializing_if = "Option::is_none")] + pub bounding_box: Option<BoundingBox>, +} + +/// A detected sensitive data occurrence within a document. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Entity { + #[serde(flatten)] + pub data: DataItem, + pub category: EntityCategory, + pub entity_type: String, + pub value: String, + pub detection_method: DetectionMethod, + pub confidence: f64, + pub location: EntityLocation, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_id: Option<Uuid>, +} + +impl Entity { + pub fn new( + category: EntityCategory, + entity_type: impl Into<String>, + value: impl Into<String>, + detection_method: DetectionMethod, + confidence: f64, + location: EntityLocation, + ) -> Self { + Self { + data: DataItem::new(), + category, + entity_type: entity_type.into(), + value: value.into(), + detection_method, + confidence, + location, + source_id: None, + } + } + + pub fn with_source_id(mut self, source_id: Uuid) -> Self { + self.source_id = Some(source_id); + self + } +} diff --git a/crates/nvisy-core/src/datatypes/image.rs b/crates/nvisy-core/src/datatypes/image.rs new file mode 100644 index 0000000..d8dc412 --- /dev/null +++ b/crates/nvisy-core/src/datatypes/image.rs @@ -0,0 +1,51 @@ +use bytes::Bytes; +use serde::{Deserialize, Serialize}; +use crate::data::DataItem; + +/// An image extracted from a document or provided directly. 
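+///
+/// A construction sketch (values and the `png_bytes` name are illustrative):
+/// an extracted page image might be built as
+/// `ImageData::new(png_bytes, "image/png").with_dimensions(640, 480).with_page_number(2)`.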
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ImageData { + #[serde(flatten)] + pub data: DataItem, + #[serde(with = "crate::datatypes::blob::bytes_serde")] + pub image_data: Bytes, + pub mime_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub width: Option<u32>, + #[serde(skip_serializing_if = "Option::is_none")] + pub height: Option<u32>, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_path: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, +} + +impl ImageData { + pub fn new(image_data: impl Into<Bytes>, mime_type: impl Into<String>) -> Self { + Self { + data: DataItem::new(), + image_data: image_data.into(), + mime_type: mime_type.into(), + width: None, + height: None, + source_path: None, + page_number: None, + } + } + + pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { + self.width = Some(width); + self.height = Some(height); + self + } + + pub fn with_source_path(mut self, path: impl Into<String>) -> Self { + self.source_path = Some(path.into()); + self + } + + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } +} diff --git a/crates/nvisy-core/src/datatypes/mod.rs b/crates/nvisy-core/src/datatypes/mod.rs new file mode 100644 index 0000000..151a49e --- /dev/null +++ b/crates/nvisy-core/src/datatypes/mod.rs @@ -0,0 +1,7 @@ +pub mod audit; +pub mod blob; +pub mod document; +pub mod entity; +pub mod image; +pub mod policy; +pub mod redaction; diff --git a/crates/nvisy-core/src/datatypes/policy.rs b/crates/nvisy-core/src/datatypes/policy.rs new file mode 100644 index 0000000..96d2f3a --- /dev/null +++ b/crates/nvisy-core/src/datatypes/policy.rs @@ -0,0 +1,87 @@ +use serde::{Deserialize, Serialize}; +use crate::data::DataItem; +use crate::types::{EntityCategory, RedactionMethod}; + +/// A single rule within a redaction policy. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyRule { + pub id: String, + pub name: String, + pub categories: Vec<EntityCategory>, + pub entity_types: Vec<String>, + pub confidence_threshold: f64, + pub method: RedactionMethod, + pub replacement_template: String, + pub enabled: bool, + pub priority: i32, +} + +/// A redaction policy containing rules. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Policy { + #[serde(flatten)] + pub data: DataItem, + pub name: String, + pub rules: Vec<PolicyRule>, + pub default_method: RedactionMethod, + pub default_confidence_threshold: f64, +} + +impl Policy { + pub fn new(name: impl Into<String>, rules: Vec<PolicyRule>) -> Self { + Self { + data: DataItem::new(), + name: name.into(), + rules, + default_method: RedactionMethod::Mask, + default_confidence_threshold: 0.5, + } + } + + pub fn with_default_method(mut self, method: RedactionMethod) -> Self { + self.default_method = method; + self + } + + pub fn with_default_confidence_threshold(mut self, threshold: f64) -> Self { + self.default_confidence_threshold = threshold; + self + } + + /// Find the first matching enabled rule for a given entity. + /// + /// Rules are sorted by priority (ascending). 
A rule matches when: + /// - It is enabled + /// - The entity's confidence meets the rule's threshold + /// - The entity's category is in the rule's categories (or categories is empty) + /// - The entity's type is in the rule's entityTypes (or entityTypes is empty) + pub fn find_matching_rule( + &self, + category: EntityCategory, + entity_type: &str, + confidence: f64, + ) -> Option<&PolicyRule> { + let mut sorted: Vec<&PolicyRule> = self.rules.iter().collect(); + sorted.sort_by_key(|r| r.priority); + + for rule in sorted { + if !rule.enabled { + continue; + } + if confidence < rule.confidence_threshold { + continue; + } + if !rule.categories.is_empty() && !rule.categories.contains(&category) { + continue; + } + if !rule.entity_types.is_empty() + && !rule.entity_types.iter().any(|t| t == entity_type) + { + continue; + } + return Some(rule); + } + + None + } +} diff --git a/crates/nvisy-core/src/datatypes/redaction.rs b/crates/nvisy-core/src/datatypes/redaction.rs new file mode 100644 index 0000000..1a6582e --- /dev/null +++ b/crates/nvisy-core/src/datatypes/redaction.rs @@ -0,0 +1,47 @@ +use serde::{Deserialize, Serialize}; +use uuid::Uuid; +use crate::data::DataItem; +use crate::types::RedactionMethod; + +/// A redaction decision for a detected entity. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Redaction { + #[serde(flatten)] + pub data: DataItem, + pub entity_id: Uuid, + pub method: RedactionMethod, + pub replacement_value: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub original_value: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_rule_id: Option<String>, + pub applied: bool, +} + +impl Redaction { + pub fn new( + entity_id: Uuid, + method: RedactionMethod, + replacement_value: impl Into<String>, + ) -> Self { + Self { + data: DataItem::new(), + entity_id, + method, + replacement_value: replacement_value.into(), + original_value: None, + policy_rule_id: None, + applied: false, + } + } + + pub fn with_original_value(mut self, value: impl Into<String>) -> Self { + self.original_value = Some(value.into()); + self + } + + pub fn with_policy_rule_id(mut self, id: impl Into<String>) -> Self { + self.policy_rule_id = Some(id.into()); + self + } +} diff --git a/crates/nvisy-core/src/documents/elements.rs b/crates/nvisy-core/src/documents/elements.rs new file mode 100644 index 0000000..1e4f4a1 --- /dev/null +++ b/crates/nvisy-core/src/documents/elements.rs @@ -0,0 +1,177 @@ +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::documents::ontology::ElementType; +use crate::types::Metadata; + +/// An inline hyperlink within element text. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Link { + pub text: String, + pub url: String, + pub start_index: usize, +} + +/// An inline formatting span within element text. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EmphasizedText { + pub text: String, + pub tag: String, +} + +/// A single cell within a table structure. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TableCellData { + pub row: usize, + pub column: usize, + pub text: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub is_header: Option<bool>, +} + +/// Extraction / OCR provenance data. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ElementProvenance { + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, + #[serde(skip_serializing_if = "Option::is_none")] + pub detection_origin: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub is_continuation: Option<bool>, + #[serde(skip_serializing_if = "Option::is_none")] + pub header_footer_type: Option<String>, +} + +/// Structured key-value pair from a form. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FormKeyValuePair { + pub key: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, +} + +/// A single structural element extracted from a document. +/// +/// Combines base element fields with optional type-specific fields +/// (image, table, form, email) in a flat struct rather than inheritance. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Element { + pub id: Uuid, + #[serde(rename = "type")] + pub element_type: ElementType, + pub text: String, + + #[serde(skip_serializing_if = "Option::is_none")] + pub parent_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_name: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub level: Option<u32>, + #[serde(skip_serializing_if = "Option::is_none")] + pub languages: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Metadata>, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_tag: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub text_as_html: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option<Vec<Link>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub emphasized_texts: Option<Vec<EmphasizedText>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub provenance: Option<ElementProvenance>, + + // Image-specific fields (when element_type is Image) + #[serde(skip_serializing_if = "Option::is_none")] + pub image_base64: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_mime_type: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_url: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub image_path: Option<String>, + + // Table-specific fields (when element_type is Table) + #[serde(skip_serializing_if = "Option::is_none")] + pub cells: Option<Vec<TableCellData>>, + + // Form-specific fields (when element_type is Checkbox/FormKeysValues) + #[serde(skip_serializing_if = "Option::is_none")] + pub checked: Option<bool>, + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub key_value_pairs: Option<Vec<FormKeyValuePair>>, + + // Email-specific fields (when element_type is EmailMessage) + #[serde(skip_serializing_if = "Option::is_none")] + pub sent_from: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub sent_to: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub cc_recipient: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub bcc_recipient: Option<Vec<String>>, + #[serde(skip_serializing_if = "Option::is_none")] + pub subject: Option<String>, + #[serde(skip_serializing_if = 
"Option::is_none")] + pub signature: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub email_message_id: Option<String>, +} + +impl Element { + pub fn new(element_type: ElementType, text: impl Into<String>) -> Self { + Self { + id: Uuid::new_v4(), + element_type, + text: text.into(), + parent_id: None, + page_number: None, + page_name: None, + level: None, + languages: None, + metadata: None, + source_tag: None, + text_as_html: None, + links: None, + emphasized_texts: None, + provenance: None, + image_base64: None, + image_mime_type: None, + image_url: None, + image_path: None, + cells: None, + checked: None, + value: None, + key_value_pairs: None, + sent_from: None, + sent_to: None, + cc_recipient: None, + bcc_recipient: None, + subject: None, + signature: None, + email_message_id: None, + } + } + + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } + + pub fn with_level(mut self, level: u32) -> Self { + self.level = Some(level); + self + } + + pub fn with_languages(mut self, langs: Vec<String>) -> Self { + self.languages = Some(langs); + self + } +} diff --git a/crates/nvisy-core/src/documents/mod.rs b/crates/nvisy-core/src/documents/mod.rs new file mode 100644 index 0000000..45c93da --- /dev/null +++ b/crates/nvisy-core/src/documents/mod.rs @@ -0,0 +1,2 @@ +pub mod elements; +pub mod ontology; diff --git a/crates/nvisy-core/src/documents/ontology.rs b/crates/nvisy-core/src/documents/ontology.rs new file mode 100644 index 0000000..75e6cfb --- /dev/null +++ b/crates/nvisy-core/src/documents/ontology.rs @@ -0,0 +1,76 @@ +use serde::{Deserialize, Serialize}; + +/// Element category — broad grouping of element types. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ElementCategory { + Text, + Table, + Media, + Code, + Math, + Form, + Layout, + Email, +} + +/// All element types across all categories. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum ElementType { + // Text + Title, + NarrativeText, + ListItem, + Header, + Footer, + FigureCaption, + Address, + UncategorizedText, + // Table + Table, + // Media + Image, + // Code + CodeSnippet, + // Math + Formula, + // Form + Checkbox, + FormKeysValues, + // Layout + PageBreak, + PageNumber, + // Email + EmailMessage, +} + +impl ElementType { + /// Return the category this element type belongs to. + pub fn category(&self) -> ElementCategory { + match self { + Self::Title + | Self::NarrativeText + | Self::ListItem + | Self::Header + | Self::Footer + | Self::FigureCaption + | Self::Address + | Self::UncategorizedText => ElementCategory::Text, + Self::Table => ElementCategory::Table, + Self::Image => ElementCategory::Media, + Self::CodeSnippet => ElementCategory::Code, + Self::Formula => ElementCategory::Math, + Self::Checkbox | Self::FormKeysValues => ElementCategory::Form, + Self::PageBreak | Self::PageNumber => ElementCategory::Layout, + Self::EmailMessage => ElementCategory::Email, + } + } +} + +/// Return the category for a given element type string. 
+pub fn category_of(type_str: &str) -> Option<ElementCategory> { + let et: ElementType = + serde_json::from_value(serde_json::Value::String(type_str.to_string())).ok()?; + Some(et.category()) +} diff --git a/crates/nvisy-core/src/errors/mod.rs b/crates/nvisy-core/src/errors/mod.rs new file mode 100644 index 0000000..35d9148 --- /dev/null +++ b/crates/nvisy-core/src/errors/mod.rs @@ -0,0 +1,109 @@ +/// Unified error type for the Nvisy platform. +#[derive(Debug, thiserror::Error)] +pub enum NvisyError { + #[error("Validation: {message}")] + Validation { + message: String, + source_component: String, + }, + + #[error("Connection: {message}")] + Connection { + message: String, + source_component: String, + retryable: bool, + }, + + #[error("Timeout: {message}")] + Timeout { message: String }, + + #[error("Cancelled: {message}")] + Cancellation { message: String }, + + #[error("Policy: {message}")] + Policy { message: String }, + + #[error("Runtime: {message}")] + Runtime { + message: String, + source_component: String, + retryable: bool, + }, + + #[error("Python: {message}")] + Python { + message: String, + traceback: Option<String>, + }, + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl NvisyError { + pub fn validation(message: impl Into<String>, source: impl Into<String>) -> Self { + Self::Validation { + message: message.into(), + source_component: source.into(), + } + } + + pub fn connection( + message: impl Into<String>, + source: impl Into<String>, + retryable: bool, + ) -> Self { + Self::Connection { + message: message.into(), + source_component: source.into(), + retryable, + } + } + + pub fn timeout(message: impl Into<String>) -> Self { + Self::Timeout { + message: message.into(), + } + } + + pub fn cancellation(message: impl Into<String>) -> Self { + Self::Cancellation { + message: message.into(), + } + } + + pub fn policy(message: impl Into<String>) -> Self { + Self::Policy { + message: message.into(), + } + } + + pub fn runtime( + message: impl Into<String>, + source: impl Into<String>, + retryable: bool, + ) -> Self { + Self::Runtime { + message: message.into(), + source_component: source.into(), + retryable, + } + } + + pub fn python(message: impl Into<String>, traceback: Option<String>) -> Self { + Self::Python { + message: message.into(), + traceback, + } + } + + /// Whether this error is retryable. + pub fn is_retryable(&self) -> bool { + match self { + Self::Connection { retryable, .. } => *retryable, + Self::Runtime { retryable, .. } => *retryable, + Self::Timeout { .. } => true, + _ => false, + } + } +} diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs new file mode 100644 index 0000000..d8b1994 --- /dev/null +++ b/crates/nvisy-core/src/lib.rs @@ -0,0 +1,8 @@ +pub mod data; +pub mod datatypes; +pub mod documents; +pub mod errors; +pub mod plugin; +pub mod registry; +pub mod traits; +pub mod types; diff --git a/crates/nvisy-core/src/plugin.rs b/crates/nvisy-core/src/plugin.rs new file mode 100644 index 0000000..a66434d --- /dev/null +++ b/crates/nvisy-core/src/plugin.rs @@ -0,0 +1,52 @@ +use crate::traits::action::Action; +use crate::traits::loader::Loader; +use crate::traits::provider::ProviderFactory; +use crate::traits::stream::{StreamSource, StreamTarget}; + +/// Describes a plugin that bundles actions, providers, streams, and loaders. 
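+///
+/// A plugin typically builds one descriptor in a factory function; the detect
+/// crate below does exactly this:
+///
+/// ```ignore
+/// PluginDescriptor::new("detect")
+///     .with_action(DetectRegexAction)
+///     .with_loader(PlaintextLoader)
+/// ```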
+pub struct PluginDescriptor { + pub id: String, + pub actions: Vec<Box<dyn Action>>, + pub providers: Vec<Box<dyn ProviderFactory>>, + pub sources: Vec<Box<dyn StreamSource>>, + pub targets: Vec<Box<dyn StreamTarget>>, + pub loaders: Vec<Box<dyn Loader>>, +} + +impl PluginDescriptor { + pub fn new(id: impl Into<String>) -> Self { + Self { + id: id.into(), + actions: Vec::new(), + providers: Vec::new(), + sources: Vec::new(), + targets: Vec::new(), + loaders: Vec::new(), + } + } + + pub fn with_action(mut self, action: impl Action) -> Self { + self.actions.push(Box::new(action)); + self + } + + pub fn with_provider(mut self, provider: impl ProviderFactory) -> Self { + self.providers.push(Box::new(provider)); + self + } + + pub fn with_source(mut self, source: impl StreamSource) -> Self { + self.sources.push(Box::new(source)); + self + } + + pub fn with_target(mut self, target: impl StreamTarget) -> Self { + self.targets.push(Box::new(target)); + self + } + + pub fn with_loader(mut self, loader: impl Loader) -> Self { + self.loaders.push(Box::new(loader)); + self + } +} diff --git a/crates/nvisy-core/src/registry.rs b/crates/nvisy-core/src/registry.rs new file mode 100644 index 0000000..52233a9 --- /dev/null +++ b/crates/nvisy-core/src/registry.rs @@ -0,0 +1,149 @@ +use std::collections::HashMap; + +use crate::datatypes::blob::Blob; +use crate::errors::NvisyError; +use crate::plugin::PluginDescriptor; +use crate::traits::action::Action; +use crate::traits::loader::Loader; +use crate::traits::provider::ProviderFactory; +use crate::traits::stream::{StreamSource, StreamTarget}; + +/// Registry of all actions, providers, streams, and loaders. +/// +/// Items are keyed by "plugin_id/item_id" (e.g. "detect/detect-regex"). +pub struct Registry { + actions: HashMap<String, Box<dyn Action>>, + providers: HashMap<String, Box<dyn ProviderFactory>>, + sources: HashMap<String, Box<dyn StreamSource>>, + targets: HashMap<String, Box<dyn StreamTarget>>, + loaders: Vec<Box<dyn Loader>>, +} + +impl Registry { + pub fn new() -> Self { + Self { + actions: HashMap::new(), + providers: HashMap::new(), + sources: HashMap::new(), + targets: HashMap::new(), + loaders: Vec::new(), + } + } + + /// Load a plugin, registering all its items under "plugin_id/item_id" keys. 
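+    ///
+    /// For example, loading the detect plugin makes its regex action
+    /// addressable as `get_action("detect/detect-regex")`.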
+ pub fn load(&mut self, plugin: PluginDescriptor) -> Result<(), NvisyError> { + let prefix = &plugin.id; + + for action in plugin.actions { + let key = format!("{}/{}", prefix, action.id()); + if self.actions.contains_key(&key) { + return Err(NvisyError::validation( + format!("Duplicate action: {}", key), + "registry", + )); + } + self.actions.insert(key, action); + } + + for provider in plugin.providers { + let key = format!("{}/{}", prefix, provider.id()); + if self.providers.contains_key(&key) { + return Err(NvisyError::validation( + format!("Duplicate provider: {}", key), + "registry", + )); + } + self.providers.insert(key, provider); + } + + for source in plugin.sources { + let key = format!("{}/{}", prefix, source.id()); + if self.sources.contains_key(&key) { + return Err(NvisyError::validation( + format!("Duplicate source: {}", key), + "registry", + )); + } + self.sources.insert(key, source); + } + + for target in plugin.targets { + let key = format!("{}/{}", prefix, target.id()); + if self.targets.contains_key(&key) { + return Err(NvisyError::validation( + format!("Duplicate target: {}", key), + "registry", + )); + } + self.targets.insert(key, target); + } + + for loader in plugin.loaders { + self.loaders.push(loader); + } + + Ok(()) + } + + pub fn get_action(&self, key: &str) -> Option<&dyn Action> { + self.actions.get(key).map(|a| a.as_ref()) + } + + pub fn get_provider(&self, key: &str) -> Option<&dyn ProviderFactory> { + self.providers.get(key).map(|p| p.as_ref()) + } + + pub fn get_source(&self, key: &str) -> Option<&dyn StreamSource> { + self.sources.get(key).map(|s| s.as_ref()) + } + + pub fn get_target(&self, key: &str) -> Option<&dyn StreamTarget> { + self.targets.get(key).map(|t| t.as_ref()) + } + + /// Find a loader that matches a blob's extension or content type. + pub fn find_loader_for_blob(&self, blob: &Blob) -> Option<&dyn Loader> { + let ext = blob.extension(); + let ct = blob.content_type(); + + for loader in &self.loaders { + if let Some(ext) = ext { + if loader.extensions().contains(&ext) { + return Some(loader.as_ref()); + } + } + if let Some(ct) = ct { + if loader.content_types().contains(&ct) { + return Some(loader.as_ref()); + } + } + } + None + } + + pub fn action_keys(&self) -> Vec<&str> { + self.actions.keys().map(|s| s.as_str()).collect() + } + + pub fn provider_keys(&self) -> Vec<&str> { + self.providers.keys().map(|s| s.as_str()).collect() + } + + pub fn source_keys(&self) -> Vec<&str> { + self.sources.keys().map(|s| s.as_str()).collect() + } + + pub fn target_keys(&self) -> Vec<&str> { + self.targets.keys().map(|s| s.as_str()).collect() + } + + pub fn loader_ids(&self) -> Vec<&str> { + self.loaders.iter().map(|l| l.id()).collect() + } +} + +impl Default for Registry { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/nvisy-core/src/traits/action.rs b/crates/nvisy-core/src/traits/action.rs new file mode 100644 index 0000000..c1fe2a3 --- /dev/null +++ b/crates/nvisy-core/src/traits/action.rs @@ -0,0 +1,43 @@ +use std::any::Any; + +use async_trait::async_trait; +use tokio::sync::mpsc; + +use crate::data::DataValue; +use crate::errors::NvisyError; + +/// Type-erased action that consumes from an input channel and produces to an output channel. +#[async_trait] +pub trait Action: Send + Sync + 'static { + /// Unique identifier for this action (e.g. "detect-regex"). + fn id(&self) -> &str; + + /// Expected input data type name (e.g. "document"). + fn input_type(&self) -> &str; + + /// Output data type name (e.g. "entity"). 
+ fn output_type(&self) -> &str; + + /// Whether this action requires a provider client. + fn requires_client(&self) -> bool { + false + } + + /// The provider ID this action requires, if any. + fn required_provider_id(&self) -> Option<&str> { + None + } + + /// Validate action parameters. + fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + + /// Execute the action, consuming items from input and sending results to output. + /// Returns the number of items processed. + async fn execute( + &self, + input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError>; +} diff --git a/crates/nvisy-core/src/traits/loader.rs b/crates/nvisy-core/src/traits/loader.rs new file mode 100644 index 0000000..ed85bd2 --- /dev/null +++ b/crates/nvisy-core/src/traits/loader.rs @@ -0,0 +1,26 @@ +use async_trait::async_trait; + +use crate::datatypes::blob::Blob; +use crate::datatypes::document::Document; +use crate::datatypes::image::ImageData; +use crate::errors::NvisyError; + +/// Output of a loader: either a Document or an ImageData. +pub enum LoaderOutput { + Document(Document), + Image(ImageData), +} + +/// A loader transforms Blobs into Documents or Images. +#[async_trait] +pub trait Loader: Send + Sync + 'static { + fn id(&self) -> &str; + fn extensions(&self) -> &[&str]; + fn content_types(&self) -> &[&str]; + + async fn load( + &self, + blob: &Blob, + params: &serde_json::Value, + ) -> Result<Vec<LoaderOutput>, NvisyError>; +} diff --git a/crates/nvisy-core/src/traits/mod.rs b/crates/nvisy-core/src/traits/mod.rs new file mode 100644 index 0000000..125eb5a --- /dev/null +++ b/crates/nvisy-core/src/traits/mod.rs @@ -0,0 +1,4 @@ +pub mod action; +pub mod loader; +pub mod provider; +pub mod stream; diff --git a/crates/nvisy-core/src/traits/provider.rs b/crates/nvisy-core/src/traits/provider.rs new file mode 100644 index 0000000..216568a --- /dev/null +++ b/crates/nvisy-core/src/traits/provider.rs @@ -0,0 +1,29 @@ +use std::any::Any; +use std::future::Future; +use std::pin::Pin; + +use async_trait::async_trait; + +use crate::errors::NvisyError; + +/// A connected provider instance with an opaque client and optional disconnect callback. +pub struct ConnectedInstance { + pub client: Box<dyn Any + Send>, + pub disconnect: Option<Box<dyn FnOnce() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send>>, +} + +/// Factory for creating connected provider instances. +#[async_trait] +pub trait ProviderFactory: Send + Sync + 'static { + /// Unique identifier (e.g. "s3", "openai"). + fn id(&self) -> &str; + + /// Validate credentials shape without connecting. + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError>; + + /// Verify credentials by attempting a lightweight connection. + async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError>; + + /// Create a connected instance. + async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError>; +} diff --git a/crates/nvisy-core/src/traits/stream.rs b/crates/nvisy-core/src/traits/stream.rs new file mode 100644 index 0000000..9f841e9 --- /dev/null +++ b/crates/nvisy-core/src/traits/stream.rs @@ -0,0 +1,39 @@ +use std::any::Any; + +use async_trait::async_trait; +use tokio::sync::mpsc; + +use crate::data::DataValue; +use crate::errors::NvisyError; + +/// A source stream that reads data from an external system into the pipeline. 
+#[async_trait] +pub trait StreamSource: Send + Sync + 'static { + fn id(&self) -> &str; + fn output_type(&self) -> &str; + fn required_provider_id(&self) -> &str; + fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + + async fn read( + &self, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + client: Box<dyn Any + Send>, + ) -> Result<u64, NvisyError>; +} + +/// A target stream that writes pipeline data to an external system. +#[async_trait] +pub trait StreamTarget: Send + Sync + 'static { + fn id(&self) -> &str; + fn input_type(&self) -> &str; + fn required_provider_id(&self) -> &str; + fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + + async fn write( + &self, + input: mpsc::Receiver<DataValue>, + params: serde_json::Value, + client: Box<dyn Any + Send>, + ) -> Result<u64, NvisyError>; +} diff --git a/crates/nvisy-core/src/types.rs b/crates/nvisy-core/src/types.rs new file mode 100644 index 0000000..e5b6f15 --- /dev/null +++ b/crates/nvisy-core/src/types.rs @@ -0,0 +1,51 @@ +use serde::{Deserialize, Serialize}; + +/// Category of sensitive data. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum EntityCategory { + Pii, + Phi, + Financial, + Credentials, + Custom, +} + +/// How the entity was detected. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum DetectionMethod { + Regex, + AiNer, + Dictionary, + Checksum, + Composite, +} + +/// Method used to redact sensitive data. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RedactionMethod { + Mask, + Replace, + Hash, + Encrypt, + Remove, + Blur, + Block, + Synthesize, +} + +/// Type of auditable action. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AuditAction { + Detection, + Redaction, + PolicyEval, + Access, + Export, +} + +/// General-purpose metadata map. 
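+///
+/// For example, the classify action below stores ad-hoc keys such as
+/// `"sensitivityLevel"` and `"totalEntities"` in a document's metadata.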
+pub type Metadata = serde_json::Map<String, serde_json::Value>; diff --git a/crates/nvisy-detect/Cargo.toml b/crates/nvisy-detect/Cargo.toml new file mode 100644 index 0000000..42facd0 --- /dev/null +++ b/crates/nvisy-detect/Cargo.toml @@ -0,0 +1,43 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-detect" +description = "Regex patterns, policy evaluation, and redaction actions for Nvisy" +keywords = ["nvisy", "detection", "regex", "redaction"] +categories = ["text-processing"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } + +# Text processing +regex = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-detect/src/actions/apply_redaction.rs b/crates/nvisy-detect/src/actions/apply_redaction.rs new file mode 100644 index 0000000..299eb83 --- /dev/null +++ b/crates/nvisy-detect/src/actions/apply_redaction.rs @@ -0,0 +1,138 @@ +use async_trait::async_trait; +use std::any::Any; +use std::collections::HashMap; +use tokio::sync::mpsc; +use uuid::Uuid; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::entity::Entity; +use nvisy_core::datatypes::redaction::Redaction; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; + +pub struct ApplyRedactionAction; + +struct PendingRedaction { + start_offset: usize, + end_offset: usize, + replacement_value: String, +} + +#[async_trait] +impl Action for ApplyRedactionAction { + fn id(&self) -> &str { + "apply-redaction" + } + + fn input_type(&self) -> &str { + "document" + } + + fn output_type(&self) -> &str { + "document" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + _params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let mut entities: HashMap<Uuid, Entity> = HashMap::new(); + let mut redactions: HashMap<Uuid, Redaction> = HashMap::new(); + let mut documents: Vec<Document> = Vec::new(); + + // Collect all items first + while let Some(item) = input.recv().await { + match item { + DataValue::Entity(e) => { + entities.insert(e.data.id, e); + } + DataValue::Redaction(r) => { + redactions.insert(r.entity_id, r); + } + DataValue::Document(d) => { + documents.push(d); + } + _ => {} + } + } + + let mut count = 0u64; + + for doc in documents { + let mut pending: Vec<PendingRedaction> = Vec::new(); + + for (entity_id, redaction) in &redactions { + let entity = match entities.get(entity_id) { + Some(e) => e, + None => continue, + }; + + // Check entity belongs to this document + let belongs = entity.data.parent_id == 
Some(doc.data.id) + || entity.source_id == Some(doc.data.id); + if !belongs { + continue; + } + + pending.push(PendingRedaction { + start_offset: entity.location.start_offset, + end_offset: entity.location.end_offset, + replacement_value: redaction.replacement_value.clone(), + }); + } + + if pending.is_empty() { + count += 1; + if output.send(DataValue::Document(doc)).await.is_err() { + return Ok(count); + } + continue; + } + + let redacted_content = apply_redactions(&doc.content, &mut pending); + let mut result = Document::new(redacted_content); + result.title = doc.title.clone(); + result.elements = doc.elements.clone(); + result.source_format = doc.source_format.clone(); + result.page_count = doc.page_count; + result.data.parent_id = Some(doc.data.id); + + count += 1; + if output.send(DataValue::Document(result)).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} + +fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { + // Sort by start offset descending (right-to-left) to preserve positions + pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); + + let mut result = text.to_string(); + for redaction in pending.iter() { + let start = redaction.start_offset.min(result.len()); + let end = redaction.end_offset.min(result.len()); + if start >= end { + continue; + } + + result = format!( + "{}{}{}", + &result[..start], + redaction.replacement_value, + &result[end..] + ); + } + result +} diff --git a/crates/nvisy-detect/src/actions/classify.rs b/crates/nvisy-detect/src/actions/classify.rs new file mode 100644 index 0000000..d72aa85 --- /dev/null +++ b/crates/nvisy-detect/src/actions/classify.rs @@ -0,0 +1,114 @@ +use async_trait::async_trait; +use std::any::Any; +use std::collections::HashMap; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::entity::Entity; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; + +pub struct ClassifyAction; + +#[async_trait] +impl Action for ClassifyAction { + fn id(&self) -> &str { + "classify" + } + + fn input_type(&self) -> &str { + "document" + } + + fn output_type(&self) -> &str { + "document" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + _params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let mut entities_by_source: HashMap<uuid::Uuid, Vec<Entity>> = HashMap::new(); + let mut documents: Vec<Document> = Vec::new(); + + while let Some(item) = input.recv().await { + match item { + DataValue::Entity(e) => { + let source_id = e.data.parent_id.unwrap_or(uuid::Uuid::nil()); + entities_by_source.entry(source_id).or_default().push(e); + } + DataValue::Document(d) => { + documents.push(d); + } + _ => {} + } + } + + let mut count = 0u64; + + for doc in documents { + let entities = entities_by_source + .get(&doc.data.id) + .map(|v| v.as_slice()) + .unwrap_or(&[]); + let sensitivity_level = compute_sensitivity_level(entities); + + let mut result = Document::new(&doc.content); + result.title = doc.title.clone(); + result.elements = doc.elements.clone(); + result.source_format = doc.source_format.clone(); + result.page_count = doc.page_count; + result.data.parent_id = Some(doc.data.id); + + let mut meta = doc.data.metadata.clone().unwrap_or_default(); + meta.insert( + "sensitivityLevel".to_string(), + 
serde_json::Value::String(sensitivity_level), + ); + meta.insert( + "totalEntities".to_string(), + serde_json::Value::Number(entities.len().into()), + ); + result.data.metadata = Some(meta); + + count += 1; + if output.send(DataValue::Document(result)).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} + +fn compute_sensitivity_level(entities: &[Entity]) -> String { + if entities.is_empty() { + return "none".to_string(); + } + + let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); + let has_critical_types = entities.iter().any(|e| { + matches!(e.category, nvisy_core::types::EntityCategory::Credentials) + || e.entity_type == "ssn" + || e.entity_type == "credit_card" + }); + + if has_critical_types && has_high_confidence { + return "critical".to_string(); + } + if has_critical_types || entities.len() > 10 { + return "high".to_string(); + } + if entities.len() > 3 { + return "medium".to_string(); + } + "low".to_string() +} diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs new file mode 100644 index 0000000..7855034 --- /dev/null +++ b/crates/nvisy-detect/src/actions/detect_checksum.rs @@ -0,0 +1,99 @@ +use async_trait::async_trait; +use std::any::Any; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::entity::Entity; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; +use nvisy_core::types::DetectionMethod; + +use crate::patterns::credit_card::luhn_check; + +pub struct DetectChecksumAction; + +#[async_trait] +impl Action for DetectChecksumAction { + fn id(&self) -> &str { + "detect-checksum" + } + + fn input_type(&self) -> &str { + "entity" + } + + fn output_type(&self) -> &str { + "entity" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let drop_invalid = params + .get("dropInvalid") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let confidence_boost = params + .get("confidenceBoost") + .and_then(|v| v.as_f64()) + .unwrap_or(0.05); + + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Entity(entity) = item { + let validator = get_validator(&entity.entity_type); + + if let Some(validate) = validator { + let is_valid = validate(&entity.value); + + if !is_valid && drop_invalid { + continue; + } + + if is_valid { + let mut boosted = Entity::new( + entity.category, + &entity.entity_type, + &entity.value, + DetectionMethod::Checksum, + (entity.confidence + confidence_boost).min(1.0), + entity.location.clone(), + ); + boosted.data.parent_id = entity.data.parent_id; + boosted.source_id = entity.source_id; + + count += 1; + if output.send(DataValue::Entity(boosted)).await.is_err() { + return Ok(count); + } + continue; + } + } + + // No validator or not valid but not dropping — pass through + count += 1; + if output.send(DataValue::Entity(entity)).await.is_err() { + return Ok(count); + } + } + } + + Ok(count) + } +} + +fn get_validator(entity_type: &str) -> Option<fn(&str) -> bool> { + match entity_type { + "credit_card" => Some(luhn_check), + _ => None, + } +} diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs new file mode 100644 index 0000000..602b1e5 --- /dev/null +++ 
b/crates/nvisy-detect/src/actions/detect_regex.rs @@ -0,0 +1,115 @@ +use async_trait::async_trait; +use regex::Regex; +use std::any::Any; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::entity::{Entity, EntityLocation}; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; +use nvisy_core::types::DetectionMethod; + +use crate::patterns::{self, PatternDefinition}; + +pub struct DetectRegexAction; + +#[async_trait] +impl Action for DetectRegexAction { + fn id(&self) -> &str { + "detect-regex" + } + + fn input_type(&self) -> &str { + "document" + } + + fn output_type(&self) -> &str { + "entity" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let confidence_threshold: f64 = params + .get("confidenceThreshold") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0); + + let requested_patterns: Option<Vec<String>> = params + .get("patterns") + .and_then(|v| serde_json::from_value(v.clone()).ok()); + + // Resolve patterns + let active_patterns = resolve_patterns(&requested_patterns); + + // Compile regexes + let compiled: Vec<(&PatternDefinition, Regex)> = active_patterns + .iter() + .filter_map(|p| Regex::new(p.pattern_str).ok().map(|r| (*p, r))) + .collect(); + + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Document(doc) = &item { + for (pattern, regex) in &compiled { + for mat in regex.find_iter(&doc.content) { + let value = mat.as_str(); + + if let Some(validate) = pattern.validate { + if !validate(value) { + continue; + } + } + + if pattern.confidence < confidence_threshold { + continue; + } + + let mut entity = Entity::new( + pattern.category, + pattern.entity_type, + value, + DetectionMethod::Regex, + pattern.confidence, + EntityLocation { + start_offset: mat.start(), + end_offset: mat.end(), + element_id: None, + page_number: None, + bounding_box: None, + }, + ); + entity.source_id = Some(doc.data.id); + entity.data.parent_id = Some(doc.data.id); + + count += 1; + if output.send(DataValue::Entity(entity)).await.is_err() { + return Ok(count); + } + } + } + } + } + + Ok(count) + } +} + +fn resolve_patterns(requested: &Option<Vec<String>>) -> Vec<&'static PatternDefinition> { + match requested { + Some(names) if !names.is_empty() => names + .iter() + .filter_map(|n| patterns::get_pattern(n)) + .collect(), + _ => patterns::get_all_patterns(), + } +} diff --git a/crates/nvisy-detect/src/actions/emit_audit.rs b/crates/nvisy-detect/src/actions/emit_audit.rs new file mode 100644 index 0000000..0d2759e --- /dev/null +++ b/crates/nvisy-detect/src/actions/emit_audit.rs @@ -0,0 +1,90 @@ +use async_trait::async_trait; +use std::any::Any; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::audit::Audit; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; +use nvisy_core::types::AuditAction; + +pub struct EmitAuditAction; + +#[async_trait] +impl Action for EmitAuditAction { + fn id(&self) -> &str { + "emit-audit" + } + + fn input_type(&self) -> &str { + "redaction" + } + + fn output_type(&self) -> &str { + "audit" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: 
mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let run_id: Option<uuid::Uuid> = params + .get("runId") + .and_then(|v| v.as_str()) + .and_then(|s| s.parse().ok()); + let actor: Option<String> = params + .get("actor") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Redaction(redaction) = item { + let mut audit = Audit::new(AuditAction::Redaction) + .with_entity_id(redaction.entity_id) + .with_redaction_id(redaction.data.id); + + if let Some(run_id) = run_id { + audit = audit.with_run_id(run_id); + } + if let Some(ref actor) = actor { + audit = audit.with_actor(actor); + } + + let mut details = serde_json::Map::new(); + details.insert( + "method".to_string(), + serde_json::to_value(redaction.method).unwrap_or_default(), + ); + details.insert( + "replacementValue".to_string(), + serde_json::Value::String(redaction.replacement_value.clone()), + ); + if let Some(ref rule_id) = redaction.policy_rule_id { + details.insert( + "policyRuleId".to_string(), + serde_json::Value::String(rule_id.clone()), + ); + } + audit = audit.with_details(details); + + audit.data.parent_id = Some(redaction.data.id); + + count += 1; + if output.send(DataValue::Audit(audit)).await.is_err() { + return Ok(count); + } + } + } + + Ok(count) + } +} diff --git a/crates/nvisy-detect/src/actions/evaluate_policy.rs b/crates/nvisy-detect/src/actions/evaluate_policy.rs new file mode 100644 index 0000000..9de8f0a --- /dev/null +++ b/crates/nvisy-detect/src/actions/evaluate_policy.rs @@ -0,0 +1,137 @@ +use async_trait::async_trait; +use std::any::Any; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::entity::Entity; +use nvisy_core::datatypes::policy::PolicyRule; +use nvisy_core::datatypes::redaction::Redaction; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; +use nvisy_core::types::RedactionMethod; + +pub struct EvaluatePolicyAction; + +#[async_trait] +impl Action for EvaluatePolicyAction { + fn id(&self) -> &str { + "evaluate-policy" + } + + fn input_type(&self) -> &str { + "entity" + } + + fn output_type(&self) -> &str { + "redaction" + } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + _client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let rules: Vec<PolicyRule> = params + .get("rules") + .and_then(|v| serde_json::from_value(v.clone()).ok()) + .unwrap_or_default(); + let default_method: RedactionMethod = params + .get("defaultMethod") + .and_then(|v| serde_json::from_value(v.clone()).ok()) + .unwrap_or(RedactionMethod::Mask); + let default_threshold: f64 = params + .get("defaultConfidenceThreshold") + .and_then(|v| v.as_f64()) + .unwrap_or(0.5); + + let mut sorted_rules = rules; + sorted_rules.sort_by_key(|r| r.priority); + + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Entity(entity) = item { + let rule = find_matching_rule(&entity, &sorted_rules); + let method = rule.map(|r| r.method).unwrap_or(default_method); + let threshold = rule + .map(|r| r.confidence_threshold) + .unwrap_or(default_threshold); + + if entity.confidence < threshold { + continue; + } + + let replacement_value = if let 
Some(r) = rule { + apply_template(&r.replacement_template, &entity) + } else { + apply_default_mask(&entity, default_method) + }; + + let mut redaction = + Redaction::new(entity.data.id, method, replacement_value); + redaction = redaction.with_original_value(&entity.value); + if let Some(r) = rule { + redaction = redaction.with_policy_rule_id(&r.id); + } + redaction.data.parent_id = Some(entity.data.id); + + count += 1; + if output.send(DataValue::Redaction(redaction)).await.is_err() { + return Ok(count); + } + } + } + + Ok(count) + } +} + +fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&'a PolicyRule> { + for rule in rules { + if !rule.enabled { + continue; + } + if entity.confidence < rule.confidence_threshold { + continue; + } + if !rule.categories.is_empty() && !rule.categories.contains(&entity.category) { + continue; + } + if !rule.entity_types.is_empty() + && !rule.entity_types.iter().any(|t| t == &entity.entity_type) + { + continue; + } + return Some(rule); + } + None +} + +fn apply_template(template: &str, entity: &Entity) -> String { + template + .replace("{entityType}", &entity.entity_type) + .replace( + "{category}", + &format!("{:?}", entity.category).to_lowercase(), + ) + .replace("{value}", &entity.value) +} + +fn apply_default_mask(entity: &Entity, method: RedactionMethod) -> String { + match method { + RedactionMethod::Mask => "*".repeat(entity.value.len()), + RedactionMethod::Replace => format!("[{}]", entity.entity_type.to_uppercase()), + RedactionMethod::Remove => String::new(), + RedactionMethod::Hash => format!("[HASH:{}]", entity.entity_type), + RedactionMethod::Encrypt => format!("[ENC:{}]", entity.entity_type), + RedactionMethod::Blur => format!("[BLURRED:{}]", entity.entity_type), + RedactionMethod::Block => "\u{2588}".repeat(entity.value.len()), + RedactionMethod::Synthesize => format!("[SYNTH:{}]", entity.entity_type), + } +} diff --git a/crates/nvisy-detect/src/actions/mod.rs b/crates/nvisy-detect/src/actions/mod.rs new file mode 100644 index 0000000..3dfdc36 --- /dev/null +++ b/crates/nvisy-detect/src/actions/mod.rs @@ -0,0 +1,6 @@ +pub mod apply_redaction; +pub mod classify; +pub mod detect_checksum; +pub mod detect_regex; +pub mod emit_audit; +pub mod evaluate_policy; diff --git a/crates/nvisy-detect/src/lib.rs b/crates/nvisy-detect/src/lib.rs new file mode 100644 index 0000000..b6a5e70 --- /dev/null +++ b/crates/nvisy-detect/src/lib.rs @@ -0,0 +1,29 @@ +pub mod actions; +pub mod loaders; +pub mod patterns; + +use nvisy_core::plugin::PluginDescriptor; + +use crate::actions::apply_redaction::ApplyRedactionAction; +use crate::actions::classify::ClassifyAction; +use crate::actions::detect_checksum::DetectChecksumAction; +use crate::actions::detect_regex::DetectRegexAction; +use crate::actions::emit_audit::EmitAuditAction; +use crate::actions::evaluate_policy::EvaluatePolicyAction; +use crate::loaders::csv_loader::CsvLoader; +use crate::loaders::json_loader::JsonLoader; +use crate::loaders::plaintext::PlaintextLoader; + +/// Create the detect plugin descriptor. 
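+///
+/// A loading sketch (assumes the `Registry` from `nvisy_core::registry`):
+///
+/// ```ignore
+/// let mut registry = Registry::new();
+/// registry.load(detect_plugin())?;
+/// assert!(registry.get_action("detect/detect-regex").is_some());
+/// ```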
+pub fn detect_plugin() -> PluginDescriptor { + PluginDescriptor::new("detect") + .with_action(DetectRegexAction) + .with_action(DetectChecksumAction) + .with_action(EvaluatePolicyAction) + .with_action(ApplyRedactionAction) + .with_action(ClassifyAction) + .with_action(EmitAuditAction) + .with_loader(PlaintextLoader) + .with_loader(CsvLoader) + .with_loader(JsonLoader) +} diff --git a/crates/nvisy-detect/src/loaders/csv_loader.rs b/crates/nvisy-detect/src/loaders/csv_loader.rs new file mode 100644 index 0000000..74b8606 --- /dev/null +++ b/crates/nvisy-detect/src/loaders/csv_loader.rs @@ -0,0 +1,37 @@ +use async_trait::async_trait; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::loader::{Loader, LoaderOutput}; + +pub struct CsvLoader; + +#[async_trait] +impl Loader for CsvLoader { + fn id(&self) -> &str { + "csv" + } + + fn extensions(&self) -> &[&str] { + &["csv"] + } + + fn content_types(&self) -> &[&str] { + &["text/csv"] + } + + async fn load( + &self, + blob: &Blob, + _params: &serde_json::Value, + ) -> Result<Vec<LoaderOutput>, NvisyError> { + let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { + NvisyError::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") + })?; + let mut doc = Document::new(content); + doc.source_format = Some("csv".to_string()); + doc.data.parent_id = Some(blob.data.id); + Ok(vec![LoaderOutput::Document(doc)]) + } +} diff --git a/crates/nvisy-detect/src/loaders/json_loader.rs b/crates/nvisy-detect/src/loaders/json_loader.rs new file mode 100644 index 0000000..6f542d0 --- /dev/null +++ b/crates/nvisy-detect/src/loaders/json_loader.rs @@ -0,0 +1,41 @@ +use async_trait::async_trait; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::loader::{Loader, LoaderOutput}; + +pub struct JsonLoader; + +#[async_trait] +impl Loader for JsonLoader { + fn id(&self) -> &str { + "json" + } + + fn extensions(&self) -> &[&str] { + &["json"] + } + + fn content_types(&self) -> &[&str] { + &["application/json"] + } + + async fn load( + &self, + blob: &Blob, + _params: &serde_json::Value, + ) -> Result<Vec<LoaderOutput>, NvisyError> { + let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { + NvisyError::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") + })?; + // Validate it's valid JSON + let _: serde_json::Value = serde_json::from_str(&content).map_err(|e| { + NvisyError::validation(format!("Invalid JSON: {}", e), "json-loader") + })?; + let mut doc = Document::new(content); + doc.source_format = Some("json".to_string()); + doc.data.parent_id = Some(blob.data.id); + Ok(vec![LoaderOutput::Document(doc)]) + } +} diff --git a/crates/nvisy-detect/src/loaders/mod.rs b/crates/nvisy-detect/src/loaders/mod.rs new file mode 100644 index 0000000..b961145 --- /dev/null +++ b/crates/nvisy-detect/src/loaders/mod.rs @@ -0,0 +1,3 @@ +pub mod csv_loader; +pub mod json_loader; +pub mod plaintext; diff --git a/crates/nvisy-detect/src/loaders/plaintext.rs b/crates/nvisy-detect/src/loaders/plaintext.rs new file mode 100644 index 0000000..2056fb4 --- /dev/null +++ b/crates/nvisy-detect/src/loaders/plaintext.rs @@ -0,0 +1,40 @@ +use async_trait::async_trait; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::loader::{Loader, LoaderOutput}; + 
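+/// Loader that decodes UTF-8 text blobs (`.txt`/`.text`, `text/plain`) into a
+/// single `Document` with `source_format` set to "txt".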
+pub struct PlaintextLoader; + +#[async_trait] +impl Loader for PlaintextLoader { + fn id(&self) -> &str { + "plaintext" + } + + fn extensions(&self) -> &[&str] { + &["txt", "text"] + } + + fn content_types(&self) -> &[&str] { + &["text/plain"] + } + + async fn load( + &self, + blob: &Blob, + _params: &serde_json::Value, + ) -> Result<Vec<LoaderOutput>, NvisyError> { + let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { + NvisyError::validation( + format!("Invalid UTF-8 in plaintext: {}", e), + "plaintext-loader", + ) + })?; + let mut doc = Document::new(content); + doc.source_format = Some("txt".to_string()); + doc.data.parent_id = Some(blob.data.id); + Ok(vec![LoaderOutput::Document(doc)]) + } +} diff --git a/crates/nvisy-detect/src/patterns/api_key.rs b/crates/nvisy-detect/src/patterns/api_key.rs new file mode 100644 index 0000000..1b7bdb8 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/api_key.rs @@ -0,0 +1,39 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +pub static AWS_KEY_PATTERN: PatternDefinition = PatternDefinition { + name: "aws-key", + category: EntityCategory::Credentials, + entity_type: "aws_access_key", + pattern_str: r"\bAKIA[0-9A-Z]{16}\b", + confidence: 0.95, + validate: None, +}; + +pub static GITHUB_TOKEN_PATTERN: PatternDefinition = PatternDefinition { + name: "github-token", + category: EntityCategory::Credentials, + entity_type: "github_token", + pattern_str: r"\bgh[pousr]_[a-zA-Z0-9]{36}\b", + confidence: 0.95, + validate: None, +}; + +pub static STRIPE_KEY_PATTERN: PatternDefinition = PatternDefinition { + name: "stripe-key", + category: EntityCategory::Credentials, + entity_type: "stripe_key", + pattern_str: r"\bsk_(live|test)_[a-zA-Z0-9]{24,}\b", + confidence: 0.95, + validate: None, +}; + +pub static GENERIC_KEY_PATTERN: PatternDefinition = PatternDefinition { + name: "generic-api-key", + category: EntityCategory::Credentials, + entity_type: "api_key", + pattern_str: r#"(?i)(?:api[_\-]?key|api[_\-]?secret|access[_\-]?token|secret[_\-]?key|bearer)\s*[:=]\s*["']?([a-zA-Z0-9_\-]{20,})["']?"#, + confidence: 0.7, + validate: None, +}; diff --git a/crates/nvisy-detect/src/patterns/credit_card.rs b/crates/nvisy-detect/src/patterns/credit_card.rs new file mode 100644 index 0000000..bb0b9c5 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/credit_card.rs @@ -0,0 +1,38 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +/// Luhn check algorithm for credit card validation. 
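+///
+/// Non-digit characters are stripped first, so formatted input is accepted:
+/// `luhn_check("4111 1111 1111 1111")` is `true` (a well-known test number),
+/// while `luhn_check("4111 1111 1111 1112")` is `false`.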
+pub fn luhn_check(num: &str) -> bool { + let digits: String = num.chars().filter(|c| c.is_ascii_digit()).collect(); + if digits.is_empty() { + return false; + } + let mut sum = 0u32; + let mut alternate = false; + for ch in digits.chars().rev() { + let mut n = ch.to_digit(10).unwrap_or(0); + if alternate { + n *= 2; + if n > 9 { + n -= 9; + } + } + sum += n; + alternate = !alternate; + } + sum % 10 == 0 +} + +fn validate_credit_card(value: &str) -> bool { + luhn_check(value) +} + +pub static CREDIT_CARD_PATTERN: PatternDefinition = PatternDefinition { + name: "credit-card", + category: EntityCategory::Financial, + entity_type: "credit_card", + pattern_str: r"\b(?:\d[ \-]*?){13,19}\b", + confidence: 0.85, + validate: Some(validate_credit_card), +}; diff --git a/crates/nvisy-detect/src/patterns/email.rs b/crates/nvisy-detect/src/patterns/email.rs new file mode 100644 index 0000000..66b6b33 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/email.rs @@ -0,0 +1,12 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +pub static EMAIL_PATTERN: PatternDefinition = PatternDefinition { + name: "email", + category: EntityCategory::Pii, + entity_type: "email", + pattern_str: r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b", + confidence: 0.95, + validate: None, +}; diff --git a/crates/nvisy-detect/src/patterns/ip_address.rs b/crates/nvisy-detect/src/patterns/ip_address.rs new file mode 100644 index 0000000..8be5ab0 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/ip_address.rs @@ -0,0 +1,21 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +pub static IPV4_PATTERN: PatternDefinition = PatternDefinition { + name: "ipv4", + category: EntityCategory::Pii, + entity_type: "ip_address", + pattern_str: r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b", + confidence: 0.75, + validate: None, +}; + +pub static IPV6_PATTERN: PatternDefinition = PatternDefinition { + name: "ipv6", + category: EntityCategory::Pii, + entity_type: "ip_address", + pattern_str: r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\b", + confidence: 0.75, + validate: None, +}; diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-detect/src/patterns/mod.rs new file mode 100644 index 0000000..8b2fd50 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/mod.rs @@ -0,0 +1,55 @@ +pub mod api_key; +pub mod credit_card; +pub mod email; +pub mod ip_address; +pub mod phone; +pub mod ssn; + +use nvisy_core::types::EntityCategory; +use std::collections::HashMap; +use std::sync::LazyLock; + +/// Definition of a regex-based detection pattern. +pub struct PatternDefinition { + pub name: &'static str, + pub category: EntityCategory, + pub entity_type: &'static str, + pub pattern_str: &'static str, + pub confidence: f64, + pub validate: Option<fn(&str) -> bool>, +} + +static REGISTRY: LazyLock<HashMap<&'static str, &'static PatternDefinition>> = LazyLock::new(|| { + let patterns: &[&'static PatternDefinition] = &[ + &ssn::SSN_PATTERN, + &email::EMAIL_PATTERN, + &phone::PHONE_PATTERN, + &credit_card::CREDIT_CARD_PATTERN, + &api_key::AWS_KEY_PATTERN, + &api_key::GITHUB_TOKEN_PATTERN, + &api_key::STRIPE_KEY_PATTERN, + &api_key::GENERIC_KEY_PATTERN, + &ip_address::IPV4_PATTERN, + &ip_address::IPV6_PATTERN, + ]; + let mut map = HashMap::new(); + for p in patterns { + map.insert(p.name, *p); + } + map +}); + +/// Look up a built-in pattern by name. 
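// Usage sketch: the registry above is keyed by each pattern's `name`, so
// lookups use those exact strings and unknown names yield `None`:
//
//     let email = get_pattern("email").expect("built-in pattern");
//     assert_eq!(email.entity_type, "email");
//     assert!(get_pattern("does-not-exist").is_none());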
+pub fn get_pattern(name: &str) -> Option<&'static PatternDefinition> { + REGISTRY.get(name).copied() +} + +/// Get all built-in patterns. +pub fn get_all_patterns() -> Vec<&'static PatternDefinition> { + REGISTRY.values().copied().collect() +} + +/// Get all built-in pattern names. +pub fn get_all_pattern_names() -> Vec<&'static str> { + REGISTRY.keys().copied().collect() +} diff --git a/crates/nvisy-detect/src/patterns/phone.rs b/crates/nvisy-detect/src/patterns/phone.rs new file mode 100644 index 0000000..fcc2858 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/phone.rs @@ -0,0 +1,12 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +pub static PHONE_PATTERN: PatternDefinition = PatternDefinition { + name: "phone", + category: EntityCategory::Pii, + entity_type: "phone", + pattern_str: r"(?:\+\d{1,3}[\s.\-]?)?\(?\d{2,4}\)?[\s.\-]?\d{3,4}[\s.\-]?\d{4}\b", + confidence: 0.8, + validate: None, +}; diff --git a/crates/nvisy-detect/src/patterns/ssn.rs b/crates/nvisy-detect/src/patterns/ssn.rs new file mode 100644 index 0000000..a5bc876 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/ssn.rs @@ -0,0 +1,32 @@ +use nvisy_core::types::EntityCategory; + +use super::PatternDefinition; + +fn validate_ssn(value: &str) -> bool { + let parts: Vec<&str> = value.split('-').collect(); + if parts.len() != 3 { + return false; + } + let area: u32 = match parts[0].parse() { + Ok(v) => v, + Err(_) => return false, + }; + let group: u32 = match parts[1].parse() { + Ok(v) => v, + Err(_) => return false, + }; + let serial: u32 = match parts[2].parse() { + Ok(v) => v, + Err(_) => return false, + }; + area > 0 && area < 900 && area != 666 && group > 0 && serial > 0 +} + +pub static SSN_PATTERN: PatternDefinition = PatternDefinition { + name: "ssn", + category: EntityCategory::Pii, + entity_type: "ssn", + pattern_str: r"\b(\d{3})-(\d{2})-(\d{4})\b", + confidence: 0.9, + validate: Some(validate_ssn), +}; diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml new file mode 100644 index 0000000..ced9a33 --- /dev/null +++ b/crates/nvisy-engine/Cargo.toml @@ -0,0 +1,51 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-engine" +description = "DAG compiler and executor for Nvisy pipeline graphs" +keywords = ["nvisy", "engine", "dag", "pipeline"] +categories = ["concurrency"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["rt", "sync", "time", "macros"] } +tokio-util = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } +chrono = { workspace = true, features = [] } + +# Graph data structures +petgraph = { workspace = true, features = [] } + +# Error handling +thiserror = { workspace = true, features = [] } +anyhow = { workspace = true, features = [] } + +# Randomness +rand = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, 
features = [] } diff --git a/crates/nvisy-engine/src/compiler/mod.rs b/crates/nvisy-engine/src/compiler/mod.rs new file mode 100644 index 0000000..1f31250 --- /dev/null +++ b/crates/nvisy-engine/src/compiler/mod.rs @@ -0,0 +1,5 @@ +pub mod parse; +pub mod plan; + +pub use parse::parse_graph; +pub use plan::{build_plan, ExecutionPlan, ResolvedNode}; diff --git a/crates/nvisy-engine/src/compiler/parse.rs b/crates/nvisy-engine/src/compiler/parse.rs new file mode 100644 index 0000000..e22b906 --- /dev/null +++ b/crates/nvisy-engine/src/compiler/parse.rs @@ -0,0 +1,44 @@ +use crate::schema::Graph; +use nvisy_core::errors::NvisyError; + +/// Parse a graph from a JSON value. +pub fn parse_graph(value: &serde_json::Value) -> Result<Graph, NvisyError> { + let graph: Graph = serde_json::from_value(value.clone()).map_err(|e| { + NvisyError::validation(format!("Invalid graph definition: {}", e), "compiler") + })?; + + // Validate: must have at least one node + if graph.nodes.is_empty() { + return Err(NvisyError::validation("Graph must have at least one node", "compiler")); + } + + // Validate: no duplicate node IDs + let mut seen = std::collections::HashSet::new(); + for node in &graph.nodes { + if !seen.insert(node.id()) { + return Err(NvisyError::validation( + format!("Duplicate node ID: {}", node.id()), + "compiler", + )); + } + } + + // Validate: all edge endpoints reference existing nodes + let node_ids: std::collections::HashSet<&str> = graph.nodes.iter().map(|n| n.id()).collect(); + for edge in &graph.edges { + if !node_ids.contains(edge.from.as_str()) { + return Err(NvisyError::validation( + format!("Edge references unknown source node: {}", edge.from), + "compiler", + )); + } + if !node_ids.contains(edge.to.as_str()) { + return Err(NvisyError::validation( + format!("Edge references unknown target node: {}", edge.to), + "compiler", + )); + } + } + + Ok(graph) +} diff --git a/crates/nvisy-engine/src/compiler/plan.rs b/crates/nvisy-engine/src/compiler/plan.rs new file mode 100644 index 0000000..3120d99 --- /dev/null +++ b/crates/nvisy-engine/src/compiler/plan.rs @@ -0,0 +1,110 @@ +use std::collections::HashMap; +use petgraph::algo::{is_cyclic_directed, toposort}; +use petgraph::graph::{DiGraph, NodeIndex}; +use crate::schema::{Graph, GraphNode}; +use nvisy_core::errors::NvisyError; +use nvisy_core::registry::Registry; + +/// A node resolved against the registry. +#[derive(Debug, Clone)] +pub struct ResolvedNode { + pub node: GraphNode, + pub topo_order: usize, + pub upstream_ids: Vec<String>, + pub downstream_ids: Vec<String>, +} + +/// A compiled execution plan ready for the executor. +pub struct ExecutionPlan { + pub nodes: Vec<ResolvedNode>, + pub topo_order: Vec<String>, +} + +/// Build an execution plan from a parsed graph and registry. 
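// Sketch of the expected output: for a linear graph wired source to action to
// target, the graph is checked for cycles, every node is resolved against the
// registry, and the returned `topo_order` lists the three ids in pipeline
// order; the action's `ResolvedNode` then carries the source id in
// `upstream_ids`, the target id in `downstream_ids`, and `topo_order` 1.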
+pub fn build_plan(graph: &Graph, registry: &Registry) -> Result<ExecutionPlan, NvisyError> { + // Build petgraph + let mut pg: DiGraph<&str, ()> = DiGraph::new(); + let mut index_map: HashMap<&str, NodeIndex> = HashMap::new(); + + for node in &graph.nodes { + let idx = pg.add_node(node.id()); + index_map.insert(node.id(), idx); + } + + for edge in &graph.edges { + let from = index_map.get(edge.from.as_str()).ok_or_else(|| { + NvisyError::validation(format!("Unknown edge source: {}", edge.from), "compiler") + })?; + let to = index_map.get(edge.to.as_str()).ok_or_else(|| { + NvisyError::validation(format!("Unknown edge target: {}", edge.to), "compiler") + })?; + pg.add_edge(*from, *to, ()); + } + + // Cycle detection + if is_cyclic_directed(&pg) { + return Err(NvisyError::validation("Graph contains a cycle", "compiler")); + } + + // Topological sort + let topo = toposort(&pg, None).map_err(|_| { + NvisyError::validation("Graph contains a cycle", "compiler") + })?; + + let topo_order: Vec<String> = topo.iter().map(|idx| pg[*idx].to_string()).collect(); + + // Resolve nodes against registry + for node in &graph.nodes { + match node { + GraphNode::Action { action, params, .. } => { + let _a = registry.get_action(action).ok_or_else(|| { + NvisyError::validation(format!("Unknown action: {}", action), "compiler") + })?; + _a.validate_params(params)?; + } + GraphNode::Source { provider, stream, .. } => { + let source_key = format!("{}/{}", provider, stream); + let _s = registry.get_source(&source_key).ok_or_else(|| { + NvisyError::validation(format!("Unknown source: {}", source_key), "compiler") + })?; + } + GraphNode::Target { provider, stream, .. } => { + let target_key = format!("{}/{}", provider, stream); + let _t = registry.get_target(&target_key).ok_or_else(|| { + NvisyError::validation(format!("Unknown target: {}", target_key), "compiler") + })?; + } + } + } + + // Build resolved nodes with adjacency info + let node_map: HashMap<&str, &GraphNode> = graph.nodes.iter().map(|n| (n.id(), n)).collect(); + let mut resolved = Vec::new(); + + for (order, node_id) in topo_order.iter().enumerate() { + let node = node_map[node_id.as_str()]; + let idx = index_map[node_id.as_str()]; + + let upstream_ids: Vec<String> = pg + .neighbors_directed(idx, petgraph::Direction::Incoming) + .map(|n| pg[n].to_string()) + .collect(); + + let downstream_ids: Vec<String> = pg + .neighbors_directed(idx, petgraph::Direction::Outgoing) + .map(|n| pg[n].to_string()) + .collect(); + + resolved.push(ResolvedNode { + node: node.clone(), + topo_order: order, + upstream_ids, + downstream_ids, + }); + } + + Ok(ExecutionPlan { + nodes: resolved, + topo_order, + }) +} diff --git a/crates/nvisy-engine/src/connections.rs b/crates/nvisy-engine/src/connections.rs new file mode 100644 index 0000000..43f986c --- /dev/null +++ b/crates/nvisy-engine/src/connections.rs @@ -0,0 +1,15 @@ +use std::collections::HashMap; +use serde::{Deserialize, Serialize}; + +/// A validated connection to an external service. 
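// Example shape (the credential field names follow the S3 provider defined
// elsewhere in this change; other providers may expect different keys):
//
//     {
//       "type": "s3",
//       "credentials": { "bucket": "my-bucket", "region": "us-east-1" },
//       "context": {}
//     }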
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Connection { + #[serde(rename = "type")] + pub provider_type: String, + pub credentials: serde_json::Value, + #[serde(default)] + pub context: serde_json::Value, +} + +/// Map of connection_id -> Connection +pub type Connections = HashMap<String, Connection>; diff --git a/crates/nvisy-engine/src/executor/context.rs b/crates/nvisy-engine/src/executor/context.rs new file mode 100644 index 0000000..2731e0c --- /dev/null +++ b/crates/nvisy-engine/src/executor/context.rs @@ -0,0 +1,43 @@ +use tokio::sync::{mpsc, watch}; +use nvisy_core::data::DataValue; + +/// Buffer size for inter-node channels. +pub const CHANNEL_BUFFER_SIZE: usize = 256; + +/// Wiring for a single edge: sender + receiver pair. +pub struct EdgeChannel { + pub sender: mpsc::Sender<DataValue>, + pub receiver: mpsc::Receiver<DataValue>, +} + +impl Default for EdgeChannel { + fn default() -> Self { + Self::new() + } +} + +impl EdgeChannel { + pub fn new() -> Self { + let (sender, receiver) = mpsc::channel(CHANNEL_BUFFER_SIZE); + Self { sender, receiver } + } +} + +/// Signals that a node has completed. +pub struct NodeSignal { + pub sender: watch::Sender<bool>, + pub receiver: watch::Receiver<bool>, +} + +impl Default for NodeSignal { + fn default() -> Self { + Self::new() + } +} + +impl NodeSignal { + pub fn new() -> Self { + let (sender, receiver) = watch::channel(false); + Self { sender, receiver } + } +} diff --git a/crates/nvisy-engine/src/executor/mod.rs b/crates/nvisy-engine/src/executor/mod.rs new file mode 100644 index 0000000..31905ca --- /dev/null +++ b/crates/nvisy-engine/src/executor/mod.rs @@ -0,0 +1,5 @@ +pub mod context; +pub mod nodes; +pub mod runner; + +pub use runner::run_graph; diff --git a/crates/nvisy-engine/src/executor/nodes.rs b/crates/nvisy-engine/src/executor/nodes.rs new file mode 100644 index 0000000..4c97921 --- /dev/null +++ b/crates/nvisy-engine/src/executor/nodes.rs @@ -0,0 +1,63 @@ +use std::any::Any; +use tokio::sync::mpsc; +use nvisy_core::data::DataValue; +use nvisy_core::errors::NvisyError; +use nvisy_core::registry::Registry; +use crate::schema::GraphNode; + +/// Execute a source node: read from external system into output channel. +pub async fn execute_source( + node: &GraphNode, + output: mpsc::Sender<DataValue>, + registry: &Registry, + client: Box<dyn Any + Send>, +) -> Result<u64, NvisyError> { + match node { + GraphNode::Source { provider, stream, params, .. } => { + let source_key = format!("{}/{}", provider, stream); + let source = registry.get_source(&source_key).ok_or_else(|| { + NvisyError::runtime(format!("Source not found: {}", source_key), "executor", false) + })?; + source.read(output, params.clone(), client).await + } + _ => Err(NvisyError::runtime("Expected source node", "executor", false)), + } +} + +/// Execute an action node: consume from input, produce to output. +pub async fn execute_action( + node: &GraphNode, + input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + registry: &Registry, + client: Option<Box<dyn Any + Send>>, +) -> Result<u64, NvisyError> { + match node { + GraphNode::Action { action, params, .. } => { + let act = registry.get_action(action).ok_or_else(|| { + NvisyError::runtime(format!("Action not found: {}", action), "executor", false) + })?; + act.execute(input, output, params.clone(), client).await + } + _ => Err(NvisyError::runtime("Expected action node", "executor", false)), + } +} + +/// Execute a target node: consume from input, write to external system. 
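// Note: source and target dispatch share the same key convention,
// `format!("{}/{}", provider, stream)`, so a node with provider "s3" and
// stream "read" is looked up as "s3/read", assuming the registry stores
// streams under that compound key when plugins register.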
+pub async fn execute_target( + node: &GraphNode, + input: mpsc::Receiver<DataValue>, + registry: &Registry, + client: Box<dyn Any + Send>, +) -> Result<u64, NvisyError> { + match node { + GraphNode::Target { provider, stream, params, .. } => { + let target_key = format!("{}/{}", provider, stream); + let target = registry.get_target(&target_key).ok_or_else(|| { + NvisyError::runtime(format!("Target not found: {}", target_key), "executor", false) + })?; + target.write(input, params.clone(), client).await + } + _ => Err(NvisyError::runtime("Expected target node", "executor", false)), + } +} diff --git a/crates/nvisy-engine/src/executor/runner.rs b/crates/nvisy-engine/src/executor/runner.rs new file mode 100644 index 0000000..4c344ad --- /dev/null +++ b/crates/nvisy-engine/src/executor/runner.rs @@ -0,0 +1,152 @@ +use std::collections::HashMap; +use tokio::sync::{mpsc, watch}; +use tokio::task::JoinSet; +use uuid::Uuid; +use nvisy_core::data::DataValue; +use nvisy_core::errors::NvisyError; +use nvisy_core::registry::Registry; +use crate::compiler::plan::ExecutionPlan; +use crate::connections::Connections; +use crate::executor::context::CHANNEL_BUFFER_SIZE; +use crate::schema::GraphNode; + +/// Result of a single node execution. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct NodeResult { + pub node_id: String, + pub items_processed: u64, + pub error: Option<String>, +} + +/// Result of an entire graph execution. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct RunResult { + pub run_id: Uuid, + pub node_results: Vec<NodeResult>, + pub success: bool, +} + +/// Execute a compiled graph plan. +pub async fn run_graph( + plan: &ExecutionPlan, + _connections: &Connections, + _registry: &Registry, +) -> Result<RunResult, NvisyError> { + let run_id = Uuid::new_v4(); + + // Create channels for each edge + // Key: "from_id -> to_id", value: (sender, receiver) + let mut senders: HashMap<String, Vec<mpsc::Sender<DataValue>>> = HashMap::new(); + let mut receivers: HashMap<String, Vec<mpsc::Receiver<DataValue>>> = HashMap::new(); + + for node in &plan.nodes { + let node_id = node.node.id(); + for downstream_id in &node.downstream_ids { + let (tx, rx) = mpsc::channel(CHANNEL_BUFFER_SIZE); + senders.entry(node_id.to_string()).or_default().push(tx); + receivers.entry(downstream_id.clone()).or_default().push(rx); + } + } + + // Create completion signals per node + let mut signal_senders: HashMap<String, watch::Sender<bool>> = HashMap::new(); + let mut signal_receivers: HashMap<String, watch::Receiver<bool>> = HashMap::new(); + + for node in &plan.nodes { + let (tx, rx) = watch::channel(false); + signal_senders.insert(node.node.id().to_string(), tx); + signal_receivers.insert(node.node.id().to_string(), rx); + } + + // Spawn tasks + let mut join_set: JoinSet<NodeResult> = JoinSet::new(); + + for resolved in &plan.nodes { + let node = resolved.node.clone(); + let node_id = node.id().to_string(); + let upstream_ids = resolved.upstream_ids.clone(); + + // Collect upstream watch receivers + let upstream_watches: Vec<watch::Receiver<bool>> = upstream_ids + .iter() + .filter_map(|id| signal_receivers.get(id).cloned()) + .collect(); + + let completion_tx = signal_senders.remove(&node_id); + let node_senders = senders.remove(&node_id).unwrap_or_default(); + let node_receivers = receivers.remove(&node_id).unwrap_or_default(); + + join_set.spawn(async move { + // Wait for upstream nodes to complete + for mut rx in upstream_watches { + let _ = rx.wait_for(|&done| 
done).await; + } + + let result = execute_node(&node, node_senders, node_receivers).await; + + // Signal completion + if let Some(tx) = completion_tx { + let _ = tx.send(true); + } + + match result { + Ok(count) => NodeResult { + node_id, + items_processed: count, + error: None, + }, + Err(e) => NodeResult { + node_id, + items_processed: 0, + error: Some(e.to_string()), + }, + } + }); + } + + // Collect results + let mut node_results = Vec::new(); + while let Some(result) = join_set.join_next().await { + match result { + Ok(nr) => node_results.push(nr), + Err(e) => node_results.push(NodeResult { + node_id: "unknown".to_string(), + items_processed: 0, + error: Some(format!("Task panicked: {}", e)), + }), + } + } + + let success = node_results.iter().all(|r| r.error.is_none()); + + Ok(RunResult { + run_id, + node_results, + success, + }) +} + +/// Execute a single node with its channels (simplified -- does not use registry directly). +async fn execute_node( + _node: &GraphNode, + senders: Vec<mpsc::Sender<DataValue>>, + mut receivers: Vec<mpsc::Receiver<DataValue>>, +) -> Result<u64, NvisyError> { + // For now, forward items from receivers to senders (passthrough behavior). + // The actual registry-based dispatch happens via the Engine wrapper. + let mut count = 0u64; + + for rx in &mut receivers { + while let Some(item) = rx.recv().await { + count += 1; + for tx in &senders { + let _ = tx.send(item.clone()).await; + } + } + } + + // Drop senders to signal downstream completion + drop(senders); + + Ok(count) +} diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs new file mode 100644 index 0000000..f051503 --- /dev/null +++ b/crates/nvisy-engine/src/lib.rs @@ -0,0 +1,6 @@ +pub mod compiler; +pub mod connections; +pub mod executor; +pub mod policies; +pub mod runs; +pub mod schema; diff --git a/crates/nvisy-engine/src/policies.rs b/crates/nvisy-engine/src/policies.rs new file mode 100644 index 0000000..04d3a94 --- /dev/null +++ b/crates/nvisy-engine/src/policies.rs @@ -0,0 +1,62 @@ +use std::time::Duration; +use tokio::time; +use nvisy_core::errors::NvisyError; +use crate::schema::{BackoffStrategy, RetryPolicy}; + +/// Compute delay for a retry attempt. +pub fn compute_delay(policy: &RetryPolicy, attempt: u32) -> Duration { + let base = Duration::from_millis(policy.delay_ms); + match policy.backoff { + BackoffStrategy::Fixed => base, + BackoffStrategy::Exponential => base * 2u32.saturating_pow(attempt), + BackoffStrategy::Jitter => { + let exp = base * 2u32.saturating_pow(attempt); + let jitter_range = exp.as_millis() as u64 + 1; + let jitter = Duration::from_millis(rand::random_range(0..jitter_range)); + exp + jitter + } + } +} + +/// Execute a future with retry logic. +pub async fn with_retry<F, Fut, T>( + policy: &RetryPolicy, + mut f: F, +) -> Result<T, NvisyError> +where + F: FnMut() -> Fut, + Fut: std::future::Future<Output = Result<T, NvisyError>>, +{ + let mut last_err = None; + for attempt in 0..=policy.max_retries { + match f().await { + Ok(v) => return Ok(v), + Err(e) => { + if !e.is_retryable() || attempt == policy.max_retries { + return Err(e); + } + last_err = Some(e); + let delay = compute_delay(policy, attempt); + time::sleep(delay).await; + } + } + } + Err(last_err.unwrap_or_else(|| NvisyError::runtime("Retry exhausted", "policies", false))) +} + +/// Execute a future with a timeout. 
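// Worked example: with `max_retries: 3`, `delay_ms: 1000` and
// `BackoffStrategy::Exponential`, `compute_delay` yields 1s, 2s and 4s for
// attempts 0, 1 and 2, so `with_retry` makes up to four attempts, sleeping
// those delays in between, and surfaces the error as soon as it is not
// retryable or the retry budget is spent.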
+pub async fn with_timeout<F, T>( + timeout_ms: u64, + f: F, +) -> Result<T, NvisyError> +where + F: std::future::Future<Output = Result<T, NvisyError>>, +{ + match time::timeout(Duration::from_millis(timeout_ms), f).await { + Ok(result) => result, + Err(_) => Err(NvisyError::timeout(format!( + "Operation timed out after {}ms", + timeout_ms + ))), + } +} diff --git a/crates/nvisy-engine/src/runs.rs b/crates/nvisy-engine/src/runs.rs new file mode 100644 index 0000000..22975a7 --- /dev/null +++ b/crates/nvisy-engine/src/runs.rs @@ -0,0 +1,168 @@ +use std::collections::HashMap; +use std::sync::Arc; +use chrono::{DateTime, Utc}; +use tokio::sync::RwLock; +use tokio_util::sync::CancellationToken; +use uuid::Uuid; +use crate::executor::runner::RunResult; + +/// Status of a pipeline run. +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum RunStatus { + Pending, + Running, + Success, + PartialFailure, + Failure, + Cancelled, +} + +/// Progress of a single node within a run. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct NodeProgress { + pub node_id: String, + pub status: RunStatus, + pub items_processed: u64, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option<String>, +} + +/// Full state of a run. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct RunState { + pub id: Uuid, + pub status: RunStatus, + pub created_at: DateTime<Utc>, + #[serde(skip_serializing_if = "Option::is_none")] + pub completed_at: Option<DateTime<Utc>>, + pub node_progress: HashMap<String, NodeProgress>, + #[serde(skip_serializing_if = "Option::is_none")] + pub result: Option<RunResult>, +} + +/// Summary of a run for listing. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct RunSummary { + pub id: Uuid, + pub status: RunStatus, + pub created_at: DateTime<Utc>, + #[serde(skip_serializing_if = "Option::is_none")] + pub completed_at: Option<DateTime<Utc>>, +} + +/// Manages all tracked runs. +pub struct RunManager { + runs: Arc<RwLock<HashMap<Uuid, RunState>>>, + cancel_tokens: Arc<RwLock<HashMap<Uuid, CancellationToken>>>, +} + +impl RunManager { + pub fn new() -> Self { + Self { + runs: Arc::new(RwLock::new(HashMap::new())), + cancel_tokens: Arc::new(RwLock::new(HashMap::new())), + } + } + + /// Create a new pending run and return its ID and cancellation token. + pub async fn create_run(&self) -> (Uuid, CancellationToken) { + let id = Uuid::new_v4(); + let token = CancellationToken::new(); + + let state = RunState { + id, + status: RunStatus::Pending, + created_at: Utc::now(), + completed_at: None, + node_progress: HashMap::new(), + result: None, + }; + + self.runs.write().await.insert(id, state); + self.cancel_tokens.write().await.insert(id, token.clone()); + + (id, token) + } + + /// Update a run to running status. + pub async fn set_running(&self, id: Uuid) { + if let Some(state) = self.runs.write().await.get_mut(&id) { + state.status = RunStatus::Running; + } + } + + /// Complete a run with a result. 
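    // Note: the intended lifecycle is `create_run` (Pending), then
    // `set_running` (Running), then `complete_run`, which maps the result to
    // Success when every node succeeded, PartialFailure when only some did,
    // and Failure when none did, and finally drops the run's cancellation
    // token.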
+ pub async fn complete_run(&self, id: Uuid, result: RunResult) { + if let Some(state) = self.runs.write().await.get_mut(&id) { + state.status = if result.success { + RunStatus::Success + } else if result.node_results.iter().any(|r| r.error.is_none()) { + RunStatus::PartialFailure + } else { + RunStatus::Failure + }; + state.completed_at = Some(Utc::now()); + + for nr in &result.node_results { + state.node_progress.insert( + nr.node_id.clone(), + NodeProgress { + node_id: nr.node_id.clone(), + status: if nr.error.is_none() { + RunStatus::Success + } else { + RunStatus::Failure + }, + items_processed: nr.items_processed, + error: nr.error.clone(), + }, + ); + } + + state.result = Some(result); + } + self.cancel_tokens.write().await.remove(&id); + } + + /// Get the current state of a run. + pub async fn get(&self, id: Uuid) -> Option<RunState> { + self.runs.read().await.get(&id).cloned() + } + + /// List all runs, optionally filtered by status. + pub async fn list(&self, status: Option<RunStatus>) -> Vec<RunSummary> { + self.runs + .read() + .await + .values() + .filter(|s| status.is_none_or(|st| s.status == st)) + .map(|s| RunSummary { + id: s.id, + status: s.status, + created_at: s.created_at, + completed_at: s.completed_at, + }) + .collect() + } + + /// Cancel a running or pending run. Returns false if not found or already finished. + pub async fn cancel(&self, id: Uuid) -> bool { + if let Some(token) = self.cancel_tokens.read().await.get(&id) { + token.cancel(); + if let Some(state) = self.runs.write().await.get_mut(&id) { + state.status = RunStatus::Cancelled; + state.completed_at = Some(Utc::now()); + } + true + } else { + false + } + } +} + +impl Default for RunManager { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/nvisy-engine/src/schema.rs b/crates/nvisy-engine/src/schema.rs new file mode 100644 index 0000000..39b1479 --- /dev/null +++ b/crates/nvisy-engine/src/schema.rs @@ -0,0 +1,120 @@ +use serde::{Deserialize, Serialize}; + +/// Retry policy for a node. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RetryPolicy { + #[serde(default = "default_max_retries")] + pub max_retries: u32, + #[serde(default = "default_delay_ms")] + pub delay_ms: u64, + #[serde(default)] + pub backoff: BackoffStrategy, +} + +fn default_max_retries() -> u32 { 3 } +fn default_delay_ms() -> u64 { 1000 } + +impl Default for RetryPolicy { + fn default() -> Self { + Self { + max_retries: 3, + delay_ms: 1000, + backoff: BackoffStrategy::default(), + } + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BackoffStrategy { + #[default] + Fixed, + Exponential, + Jitter, +} + +/// A node in the graph definition. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum GraphNode { + Source { + id: String, + provider: String, + stream: String, + #[serde(default)] + params: serde_json::Value, + #[serde(skip_serializing_if = "Option::is_none")] + retry: Option<RetryPolicy>, + #[serde(skip_serializing_if = "Option::is_none")] + timeout_ms: Option<u64>, + }, + Action { + id: String, + action: String, + #[serde(default)] + params: serde_json::Value, + #[serde(skip_serializing_if = "Option::is_none")] + retry: Option<RetryPolicy>, + #[serde(skip_serializing_if = "Option::is_none")] + timeout_ms: Option<u64>, + }, + Target { + id: String, + provider: String, + stream: String, + #[serde(default)] + params: serde_json::Value, + #[serde(skip_serializing_if = "Option::is_none")] + retry: Option<RetryPolicy>, + #[serde(skip_serializing_if = "Option::is_none")] + timeout_ms: Option<u64>, + }, +} + +impl GraphNode { + pub fn id(&self) -> &str { + match self { + GraphNode::Source { id, .. } => id, + GraphNode::Action { id, .. } => id, + GraphNode::Target { id, .. } => id, + } + } + + pub fn params(&self) -> &serde_json::Value { + match self { + GraphNode::Source { params, .. } => params, + GraphNode::Action { params, .. } => params, + GraphNode::Target { params, .. } => params, + } + } + + pub fn retry(&self) -> Option<&RetryPolicy> { + match self { + GraphNode::Source { retry, .. } => retry.as_ref(), + GraphNode::Action { retry, .. } => retry.as_ref(), + GraphNode::Target { retry, .. } => retry.as_ref(), + } + } + + pub fn timeout_ms(&self) -> Option<u64> { + match self { + GraphNode::Source { timeout_ms, .. } => *timeout_ms, + GraphNode::Action { timeout_ms, .. } => *timeout_ms, + GraphNode::Target { timeout_ms, .. } => *timeout_ms, + } + } +} + +/// An edge connecting two nodes. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GraphEdge { + pub from: String, + pub to: String, +} + +/// A complete graph definition. 
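// Example: a minimal graph in the JSON shape accepted by `parse_graph`, using
// the tagged-enum layout of `GraphNode` above (the provider, stream and
// action names are placeholders):
//
//     {
//       "nodes": [
//         { "type": "source", "id": "in",   "provider": "s3", "stream": "read" },
//         { "type": "action", "id": "scan", "action": "detect-regex" },
//         { "type": "target", "id": "out",  "provider": "s3", "stream": "write" }
//       ],
//       "edges": [
//         { "from": "in",   "to": "scan" },
//         { "from": "scan", "to": "out" }
//       ]
//     }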
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Graph { + pub nodes: Vec<GraphNode>, + pub edges: Vec<GraphEdge>, +} diff --git a/crates/nvisy-object/Cargo.toml b/crates/nvisy-object/Cargo.toml new file mode 100644 index 0000000..bd5903a --- /dev/null +++ b/crates/nvisy-object/Cargo.toml @@ -0,0 +1,48 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-object" +description = "Object store providers and streams (S3, Azure, GCS) for Nvisy" +keywords = ["nvisy", "object-store", "s3", "storage"] +categories = ["filesystem"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } +bytes = { workspace = true, features = [] } + +# AWS SDK +aws-sdk-s3 = { workspace = true, features = [] } +aws-config = { workspace = true, features = [] } + +# Error handling +thiserror = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-object/src/client.rs b/crates/nvisy-object/src/client.rs new file mode 100644 index 0000000..7479f4e --- /dev/null +++ b/crates/nvisy-object/src/client.rs @@ -0,0 +1,32 @@ +use async_trait::async_trait; +use bytes::Bytes; + +/// Result of a list operation. +pub struct ListResult { + pub keys: Vec<String>, + pub next_cursor: Option<String>, +} + +/// Abstract client for object storage operations. +#[async_trait] +pub trait ObjectStoreClient: Send + Sync + 'static { + async fn list(&self, prefix: &str, cursor: Option<&str>) -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>>; + async fn get(&self, key: &str) -> Result<GetResult, Box<dyn std::error::Error + Send + Sync>>; + async fn put(&self, key: &str, data: Bytes, content_type: Option<&str>) -> Result<(), Box<dyn std::error::Error + Send + Sync>>; + async fn delete(&self, key: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>; +} + +/// Result of a get operation. +pub struct GetResult { + pub data: Bytes, + pub content_type: Option<String>, +} + +/// A sized wrapper around a boxed ObjectStoreClient, usable with `Box<dyn Any + Send>`. +pub struct ObjectStoreBox(pub Box<dyn ObjectStoreClient>); + +impl ObjectStoreBox { + pub fn new(client: impl ObjectStoreClient) -> Self { + Self(Box::new(client)) + } +} diff --git a/crates/nvisy-object/src/lib.rs b/crates/nvisy-object/src/lib.rs new file mode 100644 index 0000000..ec6a1ba --- /dev/null +++ b/crates/nvisy-object/src/lib.rs @@ -0,0 +1,16 @@ +pub mod client; +pub mod providers; +pub mod streams; + +use nvisy_core::plugin::PluginDescriptor; +use crate::providers::s3::S3ProviderFactory; +use crate::streams::read::ObjectReadStream; +use crate::streams::write::ObjectWriteStream; + +/// Create the object store plugin descriptor. 
+pub fn object_plugin() -> PluginDescriptor { + PluginDescriptor::new("object") + .with_provider(S3ProviderFactory) + .with_source(ObjectReadStream) + .with_target(ObjectWriteStream) +} diff --git a/crates/nvisy-object/src/providers/mod.rs b/crates/nvisy-object/src/providers/mod.rs new file mode 100644 index 0000000..7dce405 --- /dev/null +++ b/crates/nvisy-object/src/providers/mod.rs @@ -0,0 +1 @@ +pub mod s3; diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs new file mode 100644 index 0000000..556cda1 --- /dev/null +++ b/crates/nvisy-object/src/providers/s3.rs @@ -0,0 +1,156 @@ +use async_trait::async_trait; +use aws_config::BehaviorVersion; +use aws_sdk_s3::Client as S3Client; +use bytes::Bytes; + +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; +use crate::client::{GetResult, ListResult, ObjectStoreClient}; + +/// S3-compatible object store client. +pub struct S3ObjectStoreClient { + client: S3Client, + bucket: String, +} + +impl S3ObjectStoreClient { + pub fn new(client: S3Client, bucket: String) -> Self { + Self { client, bucket } + } +} + +#[async_trait] +impl ObjectStoreClient for S3ObjectStoreClient { + async fn list(&self, prefix: &str, cursor: Option<&str>) -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>> { + let mut req = self.client + .list_objects_v2() + .bucket(&self.bucket) + .prefix(prefix); + + if let Some(token) = cursor { + req = req.continuation_token(token); + } + + let resp = req.send().await?; + + let keys: Vec<String> = resp + .contents() + .iter() + .filter_map(|obj| obj.key().map(|k| k.to_string())) + .collect(); + + let next_cursor = resp.next_continuation_token().map(|s| s.to_string()); + + Ok(ListResult { keys, next_cursor }) + } + + async fn get(&self, key: &str) -> Result<GetResult, Box<dyn std::error::Error + Send + Sync>> { + let resp = self.client + .get_object() + .bucket(&self.bucket) + .key(key) + .send() + .await?; + + let content_type = resp.content_type().map(|s| s.to_string()); + let body = resp.body.collect().await?; + let data = body.into_bytes(); + + Ok(GetResult { data, content_type }) + } + + async fn put(&self, key: &str, data: Bytes, content_type: Option<&str>) -> Result<(), Box<dyn std::error::Error + Send + Sync>> { + let mut req = self.client + .put_object() + .bucket(&self.bucket) + .key(key) + .body(data.into()); + + if let Some(ct) = content_type { + req = req.content_type(ct); + } + + req.send().await?; + Ok(()) + } + + async fn delete(&self, key: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> { + self.client + .delete_object() + .bucket(&self.bucket) + .key(key) + .send() + .await?; + Ok(()) + } +} + +/// S3 provider factory. 
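// Example credentials accepted by `connect` below: only "bucket" is required,
// "region" falls back to us-east-1, and a custom "endpoint" switches the
// client to path-style addressing for S3-compatible stores (all values shown
// are placeholders):
//
//     {
//       "bucket": "my-bucket",
//       "region": "eu-west-1",
//       "endpoint": "http://localhost:9000",
//       "accessKeyId": "AKIA...",
//       "secretAccessKey": "..."
//     }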
+pub struct S3ProviderFactory; + +#[async_trait] +impl ProviderFactory for S3ProviderFactory { + fn id(&self) -> &str { "s3" } + + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + let bucket = creds.get("bucket").and_then(|v| v.as_str()); + if bucket.is_none() { + return Err(NvisyError::validation("Missing 'bucket' in S3 credentials", "s3")); + } + Ok(()) + } + + async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + self.validate_credentials(creds)?; + // Could do a HeadBucket call here for verification + Ok(()) + } + + async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError> { + let bucket = creds.get("bucket") + .and_then(|v| v.as_str()) + .ok_or_else(|| NvisyError::validation("Missing 'bucket'", "s3"))? + .to_string(); + + let region = creds.get("region") + .and_then(|v| v.as_str()) + .unwrap_or("us-east-1"); + + let endpoint = creds.get("endpoint") + .and_then(|v| v.as_str()); + + let mut config_loader = aws_config::defaults(BehaviorVersion::latest()) + .region(aws_sdk_s3::config::Region::new(region.to_string())); + + // If access_key and secret_key provided, use static credentials + if let (Some(access_key), Some(secret_key)) = ( + creds.get("accessKeyId").and_then(|v| v.as_str()), + creds.get("secretAccessKey").and_then(|v| v.as_str()), + ) { + config_loader = config_loader.credentials_provider( + aws_sdk_s3::config::Credentials::new( + access_key, + secret_key, + creds.get("sessionToken").and_then(|v| v.as_str()).map(|s| s.to_string()), + None, + "nvisy-s3", + ), + ); + } + + let config = config_loader.load().await; + let mut s3_config = aws_sdk_s3::config::Builder::from(&config); + + if let Some(ep) = endpoint { + s3_config = s3_config.endpoint_url(ep).force_path_style(true); + } + + let client = S3Client::from_conf(s3_config.build()); + let store_client = S3ObjectStoreClient::new(client, bucket); + + Ok(ConnectedInstance { + client: Box::new(crate::client::ObjectStoreBox::new(store_client)), + disconnect: None, + }) + } +} diff --git a/crates/nvisy-object/src/streams/mod.rs b/crates/nvisy-object/src/streams/mod.rs new file mode 100644 index 0000000..6295529 --- /dev/null +++ b/crates/nvisy-object/src/streams/mod.rs @@ -0,0 +1,2 @@ +pub mod read; +pub mod write; diff --git a/crates/nvisy-object/src/streams/read.rs b/crates/nvisy-object/src/streams/read.rs new file mode 100644 index 0000000..dc4ec77 --- /dev/null +++ b/crates/nvisy-object/src/streams/read.rs @@ -0,0 +1,73 @@ +use std::any::Any; +use async_trait::async_trait; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::stream::StreamSource; +use crate::client::ObjectStoreBox; + +pub struct ObjectReadStream; + +#[async_trait] +impl StreamSource for ObjectReadStream { + fn id(&self) -> &str { "read" } + fn output_type(&self) -> &str { "blob" } + fn required_provider_id(&self) -> &str { "s3" } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn read( + &self, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + client: Box<dyn Any + Send>, + ) -> Result<u64, NvisyError> { + let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { + NvisyError::runtime("Invalid client type for object read stream", "object/read", false) + })?; + let store_client = &store_box.0; + + let prefix = params.get("prefix").and_then(|v| v.as_str()).unwrap_or(""); + let 
batch_size = params.get("batchSize").and_then(|v| v.as_u64()).unwrap_or(100) as usize; + + let mut cursor: Option<String> = None; + let mut total = 0u64; + + loop { + let result = store_client + .list(prefix, cursor.as_deref()) + .await + .map_err(|e| NvisyError::runtime(format!("List failed: {}", e), "object/read", true))?; + + let keys_count = result.keys.len(); + + for key in &result.keys { + let get_result = store_client + .get(key) + .await + .map_err(|e| NvisyError::runtime(format!("Get failed for {}: {}", key, e), "object/read", true))?; + + let mut blob = Blob::new(key.clone(), get_result.data); + if let Some(ct) = get_result.content_type { + blob = blob.with_content_type(ct); + } + + total += 1; + if output.send(DataValue::Blob(blob)).await.is_err() { + return Ok(total); + } + } + + if keys_count < batch_size || result.next_cursor.is_none() { + break; + } + cursor = result.next_cursor; + } + + Ok(total) + } +} diff --git a/crates/nvisy-object/src/streams/write.rs b/crates/nvisy-object/src/streams/write.rs new file mode 100644 index 0000000..fd23790 --- /dev/null +++ b/crates/nvisy-object/src/streams/write.rs @@ -0,0 +1,55 @@ +use std::any::Any; +use async_trait::async_trait; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::stream::StreamTarget; +use crate::client::ObjectStoreBox; + +pub struct ObjectWriteStream; + +#[async_trait] +impl StreamTarget for ObjectWriteStream { + fn id(&self) -> &str { "write" } + fn input_type(&self) -> &str { "blob" } + fn required_provider_id(&self) -> &str { "s3" } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn write( + &self, + mut input: mpsc::Receiver<DataValue>, + params: serde_json::Value, + client: Box<dyn Any + Send>, + ) -> Result<u64, NvisyError> { + let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { + NvisyError::runtime("Invalid client type for object write stream", "object/write", false) + })?; + let store_client = &store_box.0; + + let prefix = params.get("prefix").and_then(|v| v.as_str()).unwrap_or(""); + let mut total = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Blob(blob) = item { + let key = if prefix.is_empty() { + blob.path.clone() + } else { + format!("{}{}", prefix, blob.path) + }; + + store_client + .put(&key, blob.content.clone(), blob.content_type()) + .await + .map_err(|e| NvisyError::runtime(format!("Put failed for {}: {}", key, e), "object/write", true))?; + + total += 1; + } + } + + Ok(total) + } +} diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml new file mode 100644 index 0000000..0b1a89b --- /dev/null +++ b/crates/nvisy-python/Cargo.toml @@ -0,0 +1,46 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-python" +description = "PyO3 bridge for AI NER detection via embedded Python" +keywords = ["nvisy", "python", "pyo3", "ner"] +categories = ["api-bindings"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = 
true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync", "rt"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } + +# Python interop +pyo3 = { workspace = true, features = ["auto-initialize", "serde"] } + +# Error handling +thiserror = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-python/src/actions.rs b/crates/nvisy-python/src/actions.rs new file mode 100644 index 0000000..45c419d --- /dev/null +++ b/crates/nvisy-python/src/actions.rs @@ -0,0 +1,141 @@ +use std::any::Any; +use async_trait::async_trait; +use tokio::sync::mpsc; + +use nvisy_core::data::DataValue; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::action::Action; +use crate::bridge::PythonBridge; +use crate::ner::{self, NerConfig}; + +/// AI NER detection action for text documents. +pub struct DetectNerAction; + +#[async_trait] +impl Action for DetectNerAction { + fn id(&self) -> &str { "detect-ner" } + fn input_type(&self) -> &str { "document" } + fn output_type(&self) -> &str { "entity" } + fn requires_client(&self) -> bool { true } + fn required_provider_id(&self) -> Option<&str> { Some("ai") } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let bridge = extract_bridge(client)?; + let config = parse_ner_config(¶ms); + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Document(doc) = &item { + let entities = ner::detect_ner(&bridge, &doc.content, &config).await?; + for mut entity in entities { + entity.source_id = Some(doc.data.id); + entity.data.parent_id = Some(doc.data.id); + count += 1; + if output.send(DataValue::Entity(entity)).await.is_err() { + return Ok(count); + } + } + } + } + + Ok(count) + } +} + +/// AI NER detection action for images. +pub struct DetectNerImageAction; + +#[async_trait] +impl Action for DetectNerImageAction { + fn id(&self) -> &str { "detect-ner-image" } + fn input_type(&self) -> &str { "image" } + fn output_type(&self) -> &str { "entity" } + fn requires_client(&self) -> bool { true } + fn required_provider_id(&self) -> Option<&str> { Some("ai") } + + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<DataValue>, + output: mpsc::Sender<DataValue>, + params: serde_json::Value, + client: Option<Box<dyn Any + Send>>, + ) -> Result<u64, NvisyError> { + let bridge = extract_bridge(client)?; + let config = parse_ner_config(¶ms); + let mut count = 0u64; + + while let Some(item) = input.recv().await { + if let DataValue::Image(img) = &item { + let entities = ner::detect_ner_image( + &bridge, + &img.image_data, + &img.mime_type, + &config, + ).await?; + for mut entity in entities { + entity.data.parent_id = Some(img.data.id); + count += 1; + if output.send(DataValue::Entity(entity)).await.is_err() { + return Ok(count); + } + } + } + } + + Ok(count) + } +} + +fn extract_bridge(client: Option<Box<dyn Any + Send>>) -> Result<PythonBridge, NvisyError> { + client + .ok_or_else(|| NvisyError::runtime("AI provider client required", "python", false))? 
+ .downcast::<PythonBridge>() + .map(|b| *b) + .map_err(|_| NvisyError::runtime("Invalid client type for AI actions", "python", false)) +} + +fn parse_ner_config(params: &serde_json::Value) -> NerConfig { + NerConfig { + entity_types: params + .get("entityTypes") + .and_then(|v| serde_json::from_value(v.clone()).ok()) + .unwrap_or_default(), + confidence_threshold: params + .get("confidenceThreshold") + .and_then(|v| v.as_f64()) + .unwrap_or(0.5), + temperature: params + .get("temperature") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0), + api_key: params + .get("apiKey") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + model: params + .get("model") + .and_then(|v| v.as_str()) + .unwrap_or("gpt-4") + .to_string(), + provider: params + .get("provider") + .and_then(|v| v.as_str()) + .unwrap_or("openai") + .to_string(), + } +} diff --git a/crates/nvisy-python/src/bridge.rs b/crates/nvisy-python/src/bridge.rs new file mode 100644 index 0000000..f28c991 --- /dev/null +++ b/crates/nvisy-python/src/bridge.rs @@ -0,0 +1,38 @@ +use pyo3::prelude::*; +use nvisy_core::errors::NvisyError; +use crate::error::from_pyerr; + +/// Holds a reference to the loaded Python NER module. +#[derive(Clone)] +pub struct PythonBridge { + module_name: String, +} + +impl PythonBridge { + /// Create a new bridge that will load the given Python module. + pub fn new(module_name: impl Into<String>) -> Self { + Self { + module_name: module_name.into(), + } + } + + /// Initialize Python and verify the module can be imported. + pub fn init(&self) -> Result<(), NvisyError> { + Python::with_gil(|py| { + py.import(&self.module_name) + .map_err(from_pyerr)?; + Ok(()) + }) + } + + /// Get the module name. + pub fn module_name(&self) -> &str { + &self.module_name + } +} + +impl Default for PythonBridge { + fn default() -> Self { + Self::new("nvisy_ai") + } +} diff --git a/crates/nvisy-python/src/error.rs b/crates/nvisy-python/src/error.rs new file mode 100644 index 0000000..a5e8fa7 --- /dev/null +++ b/crates/nvisy-python/src/error.rs @@ -0,0 +1,13 @@ +use nvisy_core::errors::NvisyError; +use pyo3::PyErr; +use pyo3::types::PyTracebackMethods; + +/// Convert a Python error to a NvisyError. +pub fn from_pyerr(err: PyErr) -> NvisyError { + pyo3::Python::with_gil(|py| { + let traceback = err + .traceback(py) + .map(|tb| tb.format().unwrap_or_default()); + NvisyError::python(err.to_string(), traceback) + }) +} diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs new file mode 100644 index 0000000..b4e7fcf --- /dev/null +++ b/crates/nvisy-python/src/lib.rs @@ -0,0 +1,17 @@ +pub mod actions; +pub mod bridge; +pub mod error; +pub mod ner; +pub mod provider; + +use nvisy_core::plugin::PluginDescriptor; +use crate::actions::{DetectNerAction, DetectNerImageAction}; +use crate::provider::AiProviderFactory; + +/// Create the Python AI plugin descriptor. 
+pub fn python_plugin() -> PluginDescriptor { + PluginDescriptor::new("ai") + .with_action(DetectNerAction) + .with_action(DetectNerImageAction) + .with_provider(AiProviderFactory) +} diff --git a/crates/nvisy-python/src/ner.rs b/crates/nvisy-python/src/ner.rs new file mode 100644 index 0000000..b5e4a94 --- /dev/null +++ b/crates/nvisy-python/src/ner.rs @@ -0,0 +1,171 @@ +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; + +use nvisy_core::datatypes::entity::{Entity, EntityLocation}; +use nvisy_core::errors::NvisyError; +use nvisy_core::types::{DetectionMethod, EntityCategory}; +use crate::bridge::PythonBridge; +use crate::error::from_pyerr; + +/// Configuration for NER detection. +#[derive(Debug, Clone)] +pub struct NerConfig { + pub entity_types: Vec<String>, + pub confidence_threshold: f64, + pub temperature: f64, + pub api_key: String, + pub model: String, + pub provider: String, +} + +/// Call Python detect_ner function via GIL + spawn_blocking. +pub async fn detect_ner( + bridge: &PythonBridge, + text: &str, + config: &NerConfig, +) -> Result<Vec<Entity>, NvisyError> { + let module_name = bridge.module_name().to_string(); + let text = text.to_string(); + let config = config.clone(); + + tokio::task::spawn_blocking(move || { + Python::with_gil(|py| { + let module = py.import(&module_name).map_err(from_pyerr)?; + + let kwargs = PyDict::new(py); + kwargs.set_item("text", &text).map_err(from_pyerr)?; + kwargs.set_item("entity_types", &config.entity_types).map_err(from_pyerr)?; + kwargs.set_item("confidence_threshold", config.confidence_threshold).map_err(from_pyerr)?; + kwargs.set_item("temperature", config.temperature).map_err(from_pyerr)?; + kwargs.set_item("api_key", &config.api_key).map_err(from_pyerr)?; + kwargs.set_item("model", &config.model).map_err(from_pyerr)?; + kwargs.set_item("provider", &config.provider).map_err(from_pyerr)?; + + let result = module + .call_method("detect_ner", (), Some(&kwargs)) + .map_err(from_pyerr)?; + + parse_python_entities(py, result) + }) + }) + .await + .map_err(|e| NvisyError::python(format!("Task join error: {}", e), None))? +} + +/// Call Python detect_ner_image function via GIL + spawn_blocking. +pub async fn detect_ner_image( + bridge: &PythonBridge, + image_data: &[u8], + mime_type: &str, + config: &NerConfig, +) -> Result<Vec<Entity>, NvisyError> { + let module_name = bridge.module_name().to_string(); + let image_data = image_data.to_vec(); + let mime_type = mime_type.to_string(); + let config = config.clone(); + + tokio::task::spawn_blocking(move || { + Python::with_gil(|py| { + let module = py.import(&module_name).map_err(from_pyerr)?; + + let kwargs = PyDict::new(py); + kwargs.set_item("image_bytes", &image_data[..]).map_err(from_pyerr)?; + kwargs.set_item("mime_type", &mime_type).map_err(from_pyerr)?; + kwargs.set_item("entity_types", &config.entity_types).map_err(from_pyerr)?; + kwargs.set_item("confidence_threshold", config.confidence_threshold).map_err(from_pyerr)?; + kwargs.set_item("api_key", &config.api_key).map_err(from_pyerr)?; + kwargs.set_item("model", &config.model).map_err(from_pyerr)?; + kwargs.set_item("provider", &config.provider).map_err(from_pyerr)?; + + let result = module + .call_method("detect_ner_image", (), Some(&kwargs)) + .map_err(from_pyerr)?; + + parse_python_entities(py, result) + }) + }) + .await + .map_err(|e| NvisyError::python(format!("Task join error: {}", e), None))? +} + +/// Parse Python list[dict] response into Vec<Entity>. 
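// Example: each element of the Python list is expected to look like the dict
// below; "category", "entity_type", "value" and "confidence" are required,
// the offsets default to 0 when missing, and unrecognized categories fall
// back to `EntityCategory::Custom`:
//
//     {"category": "pii", "entity_type": "email", "value": "a@b.co",
//      "confidence": 0.93, "start_offset": 12, "end_offset": 18}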
+fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Vec<Entity>, NvisyError> { + let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { + NvisyError::python(format!("Expected list from Python: {}", e), None) + })?; + + let mut entities = Vec::new(); + + for item in list.iter() { + let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { + NvisyError::python(format!("Expected dict in list: {}", e), None) + })?; + + let category_str: String = dict + .get_item("category") + .map_err(from_pyerr)? + .ok_or_else(|| NvisyError::python("Missing 'category'", None))? + .extract() + .map_err(from_pyerr)?; + + let category = match category_str.as_str() { + "pii" => EntityCategory::Pii, + "phi" => EntityCategory::Phi, + "financial" => EntityCategory::Financial, + "credentials" => EntityCategory::Credentials, + _ => EntityCategory::Custom, + }; + + let entity_type: String = dict + .get_item("entity_type") + .map_err(from_pyerr)? + .ok_or_else(|| NvisyError::python("Missing 'entity_type'", None))? + .extract() + .map_err(from_pyerr)?; + + let value: String = dict + .get_item("value") + .map_err(from_pyerr)? + .ok_or_else(|| NvisyError::python("Missing 'value'", None))? + .extract() + .map_err(from_pyerr)?; + + let confidence: f64 = dict + .get_item("confidence") + .map_err(from_pyerr)? + .ok_or_else(|| NvisyError::python("Missing 'confidence'", None))? + .extract() + .map_err(from_pyerr)?; + + let start_offset: usize = dict + .get_item("start_offset") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0); + + let end_offset: usize = dict + .get_item("end_offset") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0); + + let entity = Entity::new( + category, + entity_type, + value, + DetectionMethod::AiNer, + confidence, + EntityLocation { + start_offset, + end_offset, + element_id: None, + page_number: None, + bounding_box: None, + }, + ); + + entities.push(entity); + } + + Ok(entities) +} diff --git a/crates/nvisy-python/src/provider.rs b/crates/nvisy-python/src/provider.rs new file mode 100644 index 0000000..734c3c9 --- /dev/null +++ b/crates/nvisy-python/src/provider.rs @@ -0,0 +1,34 @@ +use async_trait::async_trait; +use nvisy_core::errors::NvisyError; +use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; +use crate::bridge::PythonBridge; + +/// AI provider factory that creates PythonBridge instances. 
+pub struct AiProviderFactory; + +#[async_trait] +impl ProviderFactory for AiProviderFactory { + fn id(&self) -> &str { "ai" } + + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + if creds.get("apiKey").and_then(|v| v.as_str()).is_none() { + return Err(NvisyError::validation("Missing 'apiKey' in AI credentials", "ai")); + } + Ok(()) + } + + async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + self.validate_credentials(creds) + } + + async fn connect(&self, _creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError> { + let bridge = PythonBridge::default(); + // Don't init here — Python might not be available at connect time + // Init happens lazily when detect_ner is called + + Ok(ConnectedInstance { + client: Box::new(bridge), + disconnect: None, + }) + } +} diff --git a/crates/nvisy-server/Cargo.toml b/crates/nvisy-server/Cargo.toml new file mode 100644 index 0000000..c651ecd --- /dev/null +++ b/crates/nvisy-server/Cargo.toml @@ -0,0 +1,62 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-server" +description = "Axum HTTP server for the Nvisy data protection platform" +keywords = ["nvisy", "server", "http", "axum"] +categories = ["web-programming::http-server"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[[bin]] +name = "nvisy-server" +path = "src/main.rs" + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } +nvisy-engine = { workspace = true, features = [] } +nvisy-detect = { workspace = true, features = [] } +nvisy-object = { workspace = true, features = [] } +nvisy-python = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } + +# HTTP server +axum = { workspace = true, features = ["http2", "macros"] } +tower = { workspace = true, features = ["full"] } +tower-http = { workspace = true, features = ["cors", "trace", "request-id", "limit"] } + +# OpenAPI / Documentation +utoipa = { workspace = true, features = ["axum_extras"] } +utoipa-swagger-ui = { workspace = true, features = ["axum"] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } +chrono = { workspace = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } +tracing-subscriber = { workspace = true, features = ["fmt", "ansi", "json", "env-filter"] } + +# Error handling +thiserror = { workspace = true, features = [] } +anyhow = { workspace = true, features = ["backtrace"] } diff --git a/crates/nvisy-server/src/app.rs b/crates/nvisy-server/src/app.rs new file mode 100644 index 0000000..9783f13 --- /dev/null +++ b/crates/nvisy-server/src/app.rs @@ -0,0 +1,41 @@ +use axum::Router; +use std::sync::Arc; +use tower_http::cors::{Any, CorsLayer}; +use tower_http::trace::TraceLayer; + +use crate::config::ServerConfig; +use crate::routes; +use crate::service::engine_factory; +use crate::service::audit_store::AuditStore; +use crate::service::policy_store::PolicyStore; +use 
crate::state::AppState; +use nvisy_engine::runs::RunManager; + +/// Build a fully configured Axum application. +pub async fn build_app(_config: &ServerConfig) -> anyhow::Result<Router> { + let registry = engine_factory::create_registry()?; + + let state = AppState { + registry: Arc::new(registry), + run_manager: Arc::new(RunManager::new()), + policy_store: Arc::new(PolicyStore::new()), + audit_store: Arc::new(AuditStore::new()), + }; + + let cors = CorsLayer::new() + .allow_origin(Any) + .allow_methods(Any) + .allow_headers(Any); + + let app = Router::new() + .merge(routes::health::router()) + .merge(routes::graphs::router()) + .merge(routes::redact::router()) + .merge(routes::policies::router()) + .merge(routes::audit::router()) + .layer(TraceLayer::new_for_http()) + .layer(cors) + .with_state(state); + + Ok(app) +} diff --git a/crates/nvisy-server/src/config.rs b/crates/nvisy-server/src/config.rs new file mode 100644 index 0000000..2169fe4 --- /dev/null +++ b/crates/nvisy-server/src/config.rs @@ -0,0 +1,19 @@ +/// Server configuration loaded from environment variables. +pub struct ServerConfig { + pub host: String, + pub port: u16, + pub cors_origin: String, +} + +impl ServerConfig { + pub fn from_env() -> Self { + Self { + host: std::env::var("NVISY_HOST").unwrap_or_else(|_| "0.0.0.0".to_string()), + port: std::env::var("NVISY_PORT") + .ok() + .and_then(|p| p.parse().ok()) + .unwrap_or(8080), + cors_origin: std::env::var("NVISY_CORS_ORIGIN").unwrap_or_else(|_| "*".to_string()), + } + } +} diff --git a/crates/nvisy-server/src/main.rs b/crates/nvisy-server/src/main.rs new file mode 100644 index 0000000..32a9c56 --- /dev/null +++ b/crates/nvisy-server/src/main.rs @@ -0,0 +1,29 @@ +mod app; +mod config; +mod middleware; +mod routes; +mod schemas; +mod service; +mod state; + +use tracing_subscriber::EnvFilter; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Initialize tracing + tracing_subscriber::fmt() + .with_env_filter(EnvFilter::from_default_env().add_directive("nvisy=info".parse()?)) + .json() + .init(); + + let config = config::ServerConfig::from_env(); + tracing::info!(host = %config.host, port = config.port, "Starting nvisy-server"); + + let app = app::build_app(&config).await?; + + let listener = tokio::net::TcpListener::bind(format!("{}:{}", config.host, config.port)).await?; + tracing::info!("Listening on {}:{}", config.host, config.port); + + axum::serve(listener, app).await?; + Ok(()) +} diff --git a/crates/nvisy-server/src/middleware/mod.rs b/crates/nvisy-server/src/middleware/mod.rs new file mode 100644 index 0000000..5713713 --- /dev/null +++ b/crates/nvisy-server/src/middleware/mod.rs @@ -0,0 +1,2 @@ +// Middleware is applied in app.rs via tower layers. +// Custom middleware can be added here as needed. 
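
As a sketch of the kind of custom middleware that could be added here (assuming axum 0.7+ and its `middleware::from_fn` helper; the `server_header` name and the `x-nvisy-server` header below are illustrative, not part of the patch):

use axum::{extract::Request, middleware::Next, response::Response};

// Hypothetical example: tag every response with a server header.
// It would be registered in app.rs with
// `.layer(axum::middleware::from_fn(server_header))`.
pub async fn server_header(request: Request, next: Next) -> Response {
    let mut response = next.run(request).await;
    response.headers_mut().insert(
        "x-nvisy-server",
        axum::http::HeaderValue::from_static("nvisy"),
    );
    response
}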
diff --git a/crates/nvisy-server/src/routes/audit.rs b/crates/nvisy-server/src/routes/audit.rs new file mode 100644 index 0000000..844c34c --- /dev/null +++ b/crates/nvisy-server/src/routes/audit.rs @@ -0,0 +1,47 @@ +use axum::{ + Router, + extract::{Path, Query, State}, + routing::get, + Json, +}; +use uuid::Uuid; +use crate::state::AppState; + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/audit", get(list_audit)) + .route("/api/v1/audit/{run_id}", get(get_audit_by_run)) +} + +#[derive(serde::Deserialize)] +struct AuditQuery { + #[serde(rename = "runId")] + run_id: Option<String>, + action: Option<String>, + #[serde(rename = "sourceId")] + source_id: Option<String>, + limit: Option<usize>, + offset: Option<usize>, +} + +async fn list_audit( + State(state): State<AppState>, + Query(query): Query<AuditQuery>, +) -> Json<serde_json::Value> { + let records = state.audit_store.query( + query.run_id.as_deref(), + query.action.as_deref(), + query.source_id.as_deref(), + query.limit.unwrap_or(100), + query.offset.unwrap_or(0), + ); + Json(serde_json::to_value(&records).unwrap_or_default()) +} + +async fn get_audit_by_run( + State(state): State<AppState>, + Path(run_id): Path<Uuid>, +) -> Json<serde_json::Value> { + let records = state.audit_store.get_by_run_id(run_id); + Json(serde_json::to_value(&records).unwrap_or_default()) +} diff --git a/crates/nvisy-server/src/routes/graphs.rs b/crates/nvisy-server/src/routes/graphs.rs new file mode 100644 index 0000000..e8cf874 --- /dev/null +++ b/crates/nvisy-server/src/routes/graphs.rs @@ -0,0 +1,71 @@ +use axum::{ + Router, + extract::{Path, State}, + routing::{delete, get, post}, + Json, +}; +use uuid::Uuid; +use crate::state::AppState; + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/graphs/execute", post(execute_graph)) + .route("/api/v1/graphs/validate", post(validate_graph)) + .route("/api/v1/graphs", get(list_runs)) + .route("/api/v1/graphs/{run_id}", get(get_run)) + .route("/api/v1/graphs/{run_id}", delete(cancel_run)) +} + +async fn execute_graph( + State(state): State<AppState>, + Json(_body): Json<serde_json::Value>, +) -> (axum::http::StatusCode, Json<serde_json::Value>) { + let (run_id, _cancel_token) = state.run_manager.create_run().await; + state.run_manager.set_running(run_id).await; + + // TODO: spawn actual graph execution + // For now, return the run ID + ( + axum::http::StatusCode::ACCEPTED, + Json(serde_json::json!({ + "runId": run_id.to_string(), + "status": "accepted" + })), + ) +} + +async fn validate_graph( + State(_state): State<AppState>, + Json(_body): Json<serde_json::Value>, +) -> Json<serde_json::Value> { + // TODO: validate graph against registry + Json(serde_json::json!({ "valid": true, "errors": [] })) +} + +async fn list_runs( + State(state): State<AppState>, +) -> Json<serde_json::Value> { + let runs = state.run_manager.list(None).await; + Json(serde_json::to_value(&runs).unwrap_or_default()) +} + +async fn get_run( + State(state): State<AppState>, + Path(run_id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + match state.run_manager.get(run_id).await { + Some(run) => Ok(Json(serde_json::to_value(&run).unwrap_or_default())), + None => Err(axum::http::StatusCode::NOT_FOUND), + } +} + +async fn cancel_run( + State(state): State<AppState>, + Path(run_id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + if state.run_manager.cancel(run_id).await { + Ok(Json(serde_json::json!({ "cancelled": true }))) + } else { + 
Err(axum::http::StatusCode::NOT_FOUND) + } +} diff --git a/crates/nvisy-server/src/routes/health.rs b/crates/nvisy-server/src/routes/health.rs new file mode 100644 index 0000000..709e091 --- /dev/null +++ b/crates/nvisy-server/src/routes/health.rs @@ -0,0 +1,16 @@ +use axum::{Router, routing::get, Json}; +use crate::state::AppState; + +pub fn router() -> Router<AppState> { + Router::new() + .route("/health", get(health)) + .route("/ready", get(ready)) +} + +async fn health() -> Json<serde_json::Value> { + Json(serde_json::json!({ "status": "ok" })) +} + +async fn ready() -> Json<serde_json::Value> { + Json(serde_json::json!({ "status": "ready" })) +} diff --git a/crates/nvisy-server/src/routes/mod.rs b/crates/nvisy-server/src/routes/mod.rs new file mode 100644 index 0000000..e839dc8 --- /dev/null +++ b/crates/nvisy-server/src/routes/mod.rs @@ -0,0 +1,5 @@ +pub mod audit; +pub mod graphs; +pub mod health; +pub mod policies; +pub mod redact; diff --git a/crates/nvisy-server/src/routes/policies.rs b/crates/nvisy-server/src/routes/policies.rs new file mode 100644 index 0000000..a49a65e --- /dev/null +++ b/crates/nvisy-server/src/routes/policies.rs @@ -0,0 +1,100 @@ +use axum::{ + Router, + extract::{Path, State}, + routing::{delete, get, post, put}, + Json, +}; +use uuid::Uuid; +use crate::state::AppState; + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/policies", post(create_policy)) + .route("/api/v1/policies", get(list_policies)) + .route("/api/v1/policies/{id}", get(get_policy)) + .route("/api/v1/policies/{id}", put(update_policy)) + .route("/api/v1/policies/{id}", delete(delete_policy)) +} + +#[derive(serde::Deserialize)] +struct CreatePolicyRequest { + name: String, + #[serde(default)] + rules: Vec<serde_json::Value>, + #[serde(rename = "defaultMethod", default = "default_method")] + default_method: String, + #[serde(rename = "defaultConfidenceThreshold", default = "default_threshold")] + default_confidence_threshold: f64, +} + +fn default_method() -> String { "mask".to_string() } +fn default_threshold() -> f64 { 0.5 } + +async fn create_policy( + State(state): State<AppState>, + Json(body): Json<CreatePolicyRequest>, +) -> (axum::http::StatusCode, Json<serde_json::Value>) { + let policy = state.policy_store.create( + body.name, + body.rules, + body.default_method, + body.default_confidence_threshold, + ); + ( + axum::http::StatusCode::CREATED, + Json(serde_json::to_value(&policy).unwrap_or_default()), + ) +} + +async fn list_policies( + State(state): State<AppState>, +) -> Json<serde_json::Value> { + let policies = state.policy_store.list(); + Json(serde_json::to_value(&policies).unwrap_or_default()) +} + +async fn get_policy( + State(state): State<AppState>, + Path(id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + match state.policy_store.get(id) { + Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), + None => Err(axum::http::StatusCode::NOT_FOUND), + } +} + +#[derive(serde::Deserialize)] +struct UpdatePolicyRequest { + #[serde(default)] + name: Option<String>, + #[serde(default)] + rules: Option<Vec<serde_json::Value>>, + #[serde(rename = "defaultMethod")] + #[serde(default)] + default_method: Option<String>, + #[serde(rename = "defaultConfidenceThreshold")] + #[serde(default)] + default_confidence_threshold: Option<f64>, +} + +async fn update_policy( + State(state): State<AppState>, + Path(id): Path<Uuid>, + Json(body): Json<UpdatePolicyRequest>, +) -> Result<Json<serde_json::Value>, 
axum::http::StatusCode> { + match state.policy_store.update(id, body.name, body.rules, body.default_method, body.default_confidence_threshold) { + Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), + None => Err(axum::http::StatusCode::NOT_FOUND), + } +} + +async fn delete_policy( + State(state): State<AppState>, + Path(id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + if state.policy_store.delete(id) { + Ok(Json(serde_json::json!({ "deleted": true }))) + } else { + Err(axum::http::StatusCode::NOT_FOUND) + } +} diff --git a/crates/nvisy-server/src/routes/redact.rs b/crates/nvisy-server/src/routes/redact.rs new file mode 100644 index 0000000..a94c8bc --- /dev/null +++ b/crates/nvisy-server/src/routes/redact.rs @@ -0,0 +1,42 @@ +use axum::{ + Router, + extract::State, + routing::post, + Json, +}; +use crate::state::AppState; + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/redact", post(redact)) +} + +#[derive(serde::Deserialize)] +struct RedactRequest { + source: serde_json::Value, + #[serde(default)] + detection: Option<serde_json::Value>, + #[serde(default)] + output: Option<serde_json::Value>, + #[serde(rename = "policyId")] + #[serde(default)] + policy_id: Option<String>, +} + +async fn redact( + State(state): State<AppState>, + Json(_body): Json<RedactRequest>, +) -> (axum::http::StatusCode, Json<serde_json::Value>) { + let (run_id, _cancel_token) = state.run_manager.create_run().await; + state.run_manager.set_running(run_id).await; + + // TODO: build redaction graph from body and execute + + ( + axum::http::StatusCode::ACCEPTED, + Json(serde_json::json!({ + "runId": run_id.to_string(), + "status": "accepted" + })), + ) +} diff --git a/crates/nvisy-server/src/schemas/mod.rs b/crates/nvisy-server/src/schemas/mod.rs new file mode 100644 index 0000000..9c59885 --- /dev/null +++ b/crates/nvisy-server/src/schemas/mod.rs @@ -0,0 +1 @@ +// OpenAPI schema types can be defined here when utoipa integration is added. 
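
A sketch of what a utoipa-annotated type in this module could look like once that integration lands (the `RunAccepted` name and fields are illustrative, mirroring the `{ "runId": ..., "status": "accepted" }` JSON the execute_graph and redact handlers currently build by hand):

use serde::Serialize;
use utoipa::ToSchema;

// Hypothetical response schema for the 202 Accepted payload.
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct RunAccepted {
    /// Identifier of the accepted run, serialized as a string.
    pub run_id: String,
    /// Currently always "accepted".
    pub status: String,
}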
diff --git a/crates/nvisy-server/src/service/audit_store.rs b/crates/nvisy-server/src/service/audit_store.rs new file mode 100644 index 0000000..1972758 --- /dev/null +++ b/crates/nvisy-server/src/service/audit_store.rs @@ -0,0 +1,72 @@ +use std::sync::RwLock; +use uuid::Uuid; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct StoredAudit { + pub id: Uuid, + pub action: String, + pub timestamp: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub entity_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub redaction_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub run_id: Option<Uuid>, + #[serde(skip_serializing_if = "Option::is_none")] + pub actor: Option<String>, + #[serde(skip_serializing_if = "Option::is_none")] + pub details: Option<serde_json::Value>, +} + +pub struct AuditStore { + records: RwLock<Vec<StoredAudit>>, +} + +impl AuditStore { + pub fn new() -> Self { + Self { + records: RwLock::new(Vec::new()), + } + } + + pub fn add(&self, record: StoredAudit) { + self.records.write().unwrap().push(record); + } + + pub fn query( + &self, + run_id: Option<&str>, + action: Option<&str>, + source_id: Option<&str>, + limit: usize, + offset: usize, + ) -> Vec<StoredAudit> { + let records = self.records.read().unwrap(); + let mut results: Vec<&StoredAudit> = records.iter().collect(); + + if let Some(rid) = run_id { + if let Ok(uid) = rid.parse::<Uuid>() { + results.retain(|r| r.run_id == Some(uid)); + } + } + if let Some(act) = action { + results.retain(|r| r.action == act); + } + if let Some(sid) = source_id { + if let Ok(uid) = sid.parse::<Uuid>() { + results.retain(|r| r.source_id == Some(uid)); + } + } + + results.into_iter().skip(offset).take(limit).cloned().collect() + } + + pub fn get_by_run_id(&self, run_id: Uuid) -> Vec<StoredAudit> { + let records = self.records.read().unwrap(); + records.iter().filter(|r| r.run_id == Some(run_id)).cloned().collect() + } +} diff --git a/crates/nvisy-server/src/service/engine_factory.rs b/crates/nvisy-server/src/service/engine_factory.rs new file mode 100644 index 0000000..3c8bb54 --- /dev/null +++ b/crates/nvisy-server/src/service/engine_factory.rs @@ -0,0 +1,16 @@ +use nvisy_core::registry::Registry; +use nvisy_core::errors::NvisyError; + +/// Create a registry with all standard plugins loaded. 
+pub fn create_registry() -> Result<Registry, NvisyError> { + let mut registry = Registry::new(); + registry.load(nvisy_detect::detect_plugin())?; + registry.load(nvisy_object::object_plugin())?; + registry.load(nvisy_python::python_plugin())?; + tracing::info!( + actions = ?registry.action_keys(), + providers = ?registry.provider_keys(), + "Registry initialized" + ); + Ok(registry) +} diff --git a/crates/nvisy-server/src/service/mod.rs b/crates/nvisy-server/src/service/mod.rs new file mode 100644 index 0000000..cdd56c7 --- /dev/null +++ b/crates/nvisy-server/src/service/mod.rs @@ -0,0 +1,3 @@ +pub mod audit_store; +pub mod engine_factory; +pub mod policy_store; diff --git a/crates/nvisy-server/src/service/policy_store.rs b/crates/nvisy-server/src/service/policy_store.rs new file mode 100644 index 0000000..13fe6a5 --- /dev/null +++ b/crates/nvisy-server/src/service/policy_store.rs @@ -0,0 +1,78 @@ +use std::collections::HashMap; +use std::sync::RwLock; +use uuid::Uuid; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct StoredPolicy { + pub id: Uuid, + pub name: String, + pub rules: Vec<serde_json::Value>, + pub default_method: String, + pub default_confidence_threshold: f64, + pub created_at: String, + pub updated_at: String, +} + +pub struct PolicyStore { + policies: RwLock<HashMap<Uuid, StoredPolicy>>, +} + +impl PolicyStore { + pub fn new() -> Self { + Self { + policies: RwLock::new(HashMap::new()), + } + } + + pub fn create( + &self, + name: String, + rules: Vec<serde_json::Value>, + default_method: String, + default_confidence_threshold: f64, + ) -> StoredPolicy { + let id = Uuid::new_v4(); + let now = chrono::Utc::now().to_rfc3339(); + let policy = StoredPolicy { + id, + name, + rules, + default_method, + default_confidence_threshold, + created_at: now.clone(), + updated_at: now, + }; + self.policies.write().unwrap().insert(id, policy.clone()); + policy + } + + pub fn get(&self, id: Uuid) -> Option<StoredPolicy> { + self.policies.read().unwrap().get(&id).cloned() + } + + pub fn list(&self) -> Vec<StoredPolicy> { + self.policies.read().unwrap().values().cloned().collect() + } + + pub fn update( + &self, + id: Uuid, + name: Option<String>, + rules: Option<Vec<serde_json::Value>>, + default_method: Option<String>, + default_confidence_threshold: Option<f64>, + ) -> Option<StoredPolicy> { + let mut policies = self.policies.write().unwrap(); + let existing = policies.get_mut(&id)?; + if let Some(n) = name { existing.name = n; } + if let Some(r) = rules { existing.rules = r; } + if let Some(m) = default_method { existing.default_method = m; } + if let Some(t) = default_confidence_threshold { existing.default_confidence_threshold = t; } + existing.updated_at = chrono::Utc::now().to_rfc3339(); + Some(existing.clone()) + } + + pub fn delete(&self, id: Uuid) -> bool { + self.policies.write().unwrap().remove(&id).is_some() + } +} diff --git a/crates/nvisy-server/src/state.rs b/crates/nvisy-server/src/state.rs new file mode 100644 index 0000000..f5a348e --- /dev/null +++ b/crates/nvisy-server/src/state.rs @@ -0,0 +1,14 @@ +use std::sync::Arc; +use nvisy_engine::runs::RunManager; +use crate::service::audit_store::AuditStore; +use crate::service::policy_store::PolicyStore; +use nvisy_core::registry::Registry; + +/// Shared application state. 
+#[derive(Clone)] +pub struct AppState { + pub registry: Arc<Registry>, + pub run_manager: Arc<RunManager>, + pub policy_store: Arc<PolicyStore>, + pub audit_store: Arc<AuditStore>, +} diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index 2076e00..0000000 --- a/package-lock.json +++ /dev/null @@ -1,7640 +0,0 @@ -{ - "name": "@nvisy/monorepo", - "lockfileVersion": 3, - "requires": true, - "packages": { - "": { - "name": "@nvisy/monorepo", - "workspaces": [ - "packages/*" - ], - "devDependencies": { - "@biomejs/biome": "^2.3.14", - "@types/node": "^25.2.0", - "@vitest/coverage-v8": "^4.0.18", - "rimraf": "^6.1.2", - "tsup": "^8.5.1", - "typescript": "^5.9.3", - "vitest": "^4.0.18" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "node_modules/@ai-sdk/anthropic": { - "version": "3.0.37", - "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-3.0.37.tgz", - "integrity": "sha512-tEgcJPw+a6obbF+SHrEiZsx3DNxOHqeY8bK4IpiNsZ8YPZD141R34g3lEAaQnmNN5mGsEJ8SXoEDabuzi8wFJQ==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "3.0.7", - "@ai-sdk/provider-utils": "4.0.13" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/gateway": { - "version": "3.0.36", - "resolved": "https://registry.npmjs.org/@ai-sdk/gateway/-/gateway-3.0.36.tgz", - "integrity": "sha512-2r1Q6azvqMYxQ1hqfWZmWg4+8MajoldD/ty65XdhCaCoBfvDu7trcvxXDfTSU+3/wZ1JIDky46SWYFOHnTbsBw==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "3.0.7", - "@ai-sdk/provider-utils": "4.0.13", - "@vercel/oidc": "3.1.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/google": { - "version": "3.0.21", - "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-3.0.21.tgz", - "integrity": "sha512-qQuvcbDqDPZojtoT45UFCQVH2w3m6KJKKjqJduUsvhN5ZqOXste0h4HgHK8hwGuDfv96Jr9QQEpspbgp6iu5Uw==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "3.0.7", - "@ai-sdk/provider-utils": "4.0.13" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/openai": { - "version": "3.0.25", - "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-3.0.25.tgz", - "integrity": "sha512-DsaN46R98+D1W3lU3fKuPU3ofacboLaHlkAwxJPgJ8eup1AJHmPK1N1y10eJJbJcF6iby8Tf/vanoZxc9JPUfw==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "3.0.7", - "@ai-sdk/provider-utils": "4.0.13" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@ai-sdk/provider": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-3.0.7.tgz", - "integrity": "sha512-VkPLrutM6VdA924/mG8OS+5frbVTcu6e046D2bgDo00tehBANR1QBJ/mPcZ9tXMFOsVcm6SQArOregxePzTFPw==", - "license": "Apache-2.0", - "dependencies": { - "json-schema": "^0.4.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/@ai-sdk/provider-utils": { - "version": "4.0.13", - "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-4.0.13.tgz", - "integrity": "sha512-HHG72BN4d+OWTcq2NwTxOm/2qvk1duYsnhCDtsbYwn/h/4zeqURu1S0+Cn0nY2Ysq9a9HGKvrYuMn9bgFhR2Og==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/provider": "3.0.7", - "@standard-schema/spec": "^1.1.0", - "eventsource-parser": "^3.0.6" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - 
"zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/@asteasolutions/zod-to-openapi": { - "version": "8.4.0", - "resolved": "https://registry.npmjs.org/@asteasolutions/zod-to-openapi/-/zod-to-openapi-8.4.0.tgz", - "integrity": "sha512-Ckp971tmTw4pnv+o7iK85ldBHBKk6gxMaoNyLn3c2Th/fKoTG8G3jdYuOanpdGqwlDB0z01FOjry2d32lfTqrA==", - "license": "MIT", - "dependencies": { - "openapi3-ts": "^4.1.2" - }, - "peerDependencies": { - "zod": "^4.0.0" - } - }, - "node_modules/@aws-crypto/crc32": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/crc32/-/crc32-5.2.0.tgz", - "integrity": "sha512-nLbCWqQNgUiwwtFsen1AdzAtvuLRsQS8rYgMuxCrdKf9kOssamGLuPwyTY9wyYblNr9+1XM8v6zoDTPPSIeANg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/util": "^5.2.0", - "@aws-sdk/types": "^3.222.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@aws-crypto/crc32c": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/crc32c/-/crc32c-5.2.0.tgz", - "integrity": "sha512-+iWb8qaHLYKrNvGRbiYRHSdKRWhto5XlZUEBwDjYNf+ly5SVYG6zEoYIdxvf5R3zyeP16w4PLBn3rH1xc74Rag==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/util": "^5.2.0", - "@aws-sdk/types": "^3.222.0", - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-crypto/sha1-browser": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/sha1-browser/-/sha1-browser-5.2.0.tgz", - "integrity": "sha512-OH6lveCFfcDjX4dbAvCFSYUjJZjDr/3XJ3xHtjn3Oj5b9RjojQo8npoLeA/bNwkOkrSQ0wgrHzXk4tDRxGKJeg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/supports-web-crypto": "^5.2.0", - "@aws-crypto/util": "^5.2.0", - "@aws-sdk/types": "^3.222.0", - "@aws-sdk/util-locate-window": "^3.0.0", - "@smithy/util-utf8": "^2.0.0", - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-crypto/sha1-browser/node_modules/@smithy/is-array-buffer": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/is-array-buffer/-/is-array-buffer-2.2.0.tgz", - "integrity": "sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha1-browser/node_modules/@smithy/util-buffer-from": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-buffer-from/-/util-buffer-from-2.2.0.tgz", - "integrity": "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/is-array-buffer": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha1-browser/node_modules/@smithy/util-utf8": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@smithy/util-utf8/-/util-utf8-2.3.0.tgz", - "integrity": "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-buffer-from": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha256-browser": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/sha256-browser/-/sha256-browser-5.2.0.tgz", - "integrity": "sha512-AXfN/lGotSQwu6HNcEsIASo7kWXZ5HYWvfOmSNKDsEqC4OashTp8alTmaz+F7TC2L083SFv5RdB+qU3Vs1kZqw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/sha256-js": "^5.2.0", - 
"@aws-crypto/supports-web-crypto": "^5.2.0", - "@aws-crypto/util": "^5.2.0", - "@aws-sdk/types": "^3.222.0", - "@aws-sdk/util-locate-window": "^3.0.0", - "@smithy/util-utf8": "^2.0.0", - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-crypto/sha256-browser/node_modules/@smithy/is-array-buffer": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/is-array-buffer/-/is-array-buffer-2.2.0.tgz", - "integrity": "sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha256-browser/node_modules/@smithy/util-buffer-from": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-buffer-from/-/util-buffer-from-2.2.0.tgz", - "integrity": "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/is-array-buffer": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha256-browser/node_modules/@smithy/util-utf8": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@smithy/util-utf8/-/util-utf8-2.3.0.tgz", - "integrity": "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-buffer-from": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/sha256-js": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/sha256-js/-/sha256-js-5.2.0.tgz", - "integrity": "sha512-FFQQyu7edu4ufvIZ+OadFpHHOt+eSTBaYaki44c+akjg7qZg9oOQeLlk77F6tSYqjDAFClrHJk9tMf0HdVyOvA==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/util": "^5.2.0", - "@aws-sdk/types": "^3.222.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=16.0.0" - } - }, - "node_modules/@aws-crypto/supports-web-crypto": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/supports-web-crypto/-/supports-web-crypto-5.2.0.tgz", - "integrity": "sha512-iAvUotm021kM33eCdNfwIN//F77/IADDSs58i+MDaOqFrVjZo9bAal0NK7HurRuWLLpF1iLX7gbWrjHjeo+YFg==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-crypto/util": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@aws-crypto/util/-/util-5.2.0.tgz", - "integrity": "sha512-4RkU9EsI6ZpBve5fseQlGNUWKMa1RLPQ1dnjnQoe07ldfIzcsGb5hC5W0Dm7u423KWzawlrpbjXBrXCEv9zazQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.222.0", - "@smithy/util-utf8": "^2.0.0", - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-crypto/util/node_modules/@smithy/is-array-buffer": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/is-array-buffer/-/is-array-buffer-2.2.0.tgz", - "integrity": "sha512-GGP3O9QFD24uGeAXYUjwSTXARoqpZykHadOmA8G5vfJPK0/DC67qa//0qvqrJzL1xc8WQWX7/yc7fwudjPHPhA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/util/node_modules/@smithy/util-buffer-from": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-buffer-from/-/util-buffer-from-2.2.0.tgz", - "integrity": "sha512-IJdWBbTcMQ6DA0gdNhh/BwrLkDR+ADW5Kr1aZmd4k3DIF6ezMV4R2NIAmT08wQJ3yUK82thHWmC/TnK/wpMMIA==", - "license": "Apache-2.0", - 
"dependencies": { - "@smithy/is-array-buffer": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-crypto/util/node_modules/@smithy/util-utf8": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/@smithy/util-utf8/-/util-utf8-2.3.0.tgz", - "integrity": "sha512-R8Rdn8Hy72KKcebgLiv8jQcQkXoLMOGGv5uI1/k0l+snqkOzQ1R0ChUBCxWMlBsFMekWjq0wRudIweFs7sKT5A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-buffer-from": "^2.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@aws-sdk/client-s3": { - "version": "3.984.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/client-s3/-/client-s3-3.984.0.tgz", - "integrity": "sha512-7ny2Slr93Y+QniuluvcfWwyDi32zWQfznynL56Tk0vVh7bWrvS/odm8WP2nInKicRVNipcJHY2YInur6Q/9V0A==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/sha1-browser": "5.2.0", - "@aws-crypto/sha256-browser": "5.2.0", - "@aws-crypto/sha256-js": "5.2.0", - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/credential-provider-node": "^3.972.5", - "@aws-sdk/middleware-bucket-endpoint": "^3.972.3", - "@aws-sdk/middleware-expect-continue": "^3.972.3", - "@aws-sdk/middleware-flexible-checksums": "^3.972.4", - "@aws-sdk/middleware-host-header": "^3.972.3", - "@aws-sdk/middleware-location-constraint": "^3.972.3", - "@aws-sdk/middleware-logger": "^3.972.3", - "@aws-sdk/middleware-recursion-detection": "^3.972.3", - "@aws-sdk/middleware-sdk-s3": "^3.972.6", - "@aws-sdk/middleware-ssec": "^3.972.3", - "@aws-sdk/middleware-user-agent": "^3.972.6", - "@aws-sdk/region-config-resolver": "^3.972.3", - "@aws-sdk/signature-v4-multi-region": "3.984.0", - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-endpoints": "3.984.0", - "@aws-sdk/util-user-agent-browser": "^3.972.3", - "@aws-sdk/util-user-agent-node": "^3.972.4", - "@smithy/config-resolver": "^4.4.6", - "@smithy/core": "^3.22.0", - "@smithy/eventstream-serde-browser": "^4.2.8", - "@smithy/eventstream-serde-config-resolver": "^4.3.8", - "@smithy/eventstream-serde-node": "^4.2.8", - "@smithy/fetch-http-handler": "^5.3.9", - "@smithy/hash-blob-browser": "^4.2.9", - "@smithy/hash-node": "^4.2.8", - "@smithy/hash-stream-node": "^4.2.8", - "@smithy/invalid-dependency": "^4.2.8", - "@smithy/md5-js": "^4.2.8", - "@smithy/middleware-content-length": "^4.2.8", - "@smithy/middleware-endpoint": "^4.4.12", - "@smithy/middleware-retry": "^4.4.29", - "@smithy/middleware-serde": "^4.2.9", - "@smithy/middleware-stack": "^4.2.8", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/node-http-handler": "^4.4.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-body-length-browser": "^4.2.0", - "@smithy/util-body-length-node": "^4.2.1", - "@smithy/util-defaults-mode-browser": "^4.3.28", - "@smithy/util-defaults-mode-node": "^4.2.31", - "@smithy/util-endpoints": "^3.2.8", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-retry": "^4.2.8", - "@smithy/util-stream": "^4.5.10", - "@smithy/util-utf8": "^4.2.0", - "@smithy/util-waiter": "^4.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/client-sso": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/client-sso/-/client-sso-3.982.0.tgz", - "integrity": "sha512-qJrIiivmvujdGqJ0ldSUvhN3k3N7GtPesoOI1BSt0fNXovVnMz4C/JmnkhZihU7hJhDvxJaBROLYTU+lpild4w==", - "license": "Apache-2.0", - 
"dependencies": { - "@aws-crypto/sha256-browser": "5.2.0", - "@aws-crypto/sha256-js": "5.2.0", - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/middleware-host-header": "^3.972.3", - "@aws-sdk/middleware-logger": "^3.972.3", - "@aws-sdk/middleware-recursion-detection": "^3.972.3", - "@aws-sdk/middleware-user-agent": "^3.972.6", - "@aws-sdk/region-config-resolver": "^3.972.3", - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-endpoints": "3.982.0", - "@aws-sdk/util-user-agent-browser": "^3.972.3", - "@aws-sdk/util-user-agent-node": "^3.972.4", - "@smithy/config-resolver": "^4.4.6", - "@smithy/core": "^3.22.0", - "@smithy/fetch-http-handler": "^5.3.9", - "@smithy/hash-node": "^4.2.8", - "@smithy/invalid-dependency": "^4.2.8", - "@smithy/middleware-content-length": "^4.2.8", - "@smithy/middleware-endpoint": "^4.4.12", - "@smithy/middleware-retry": "^4.4.29", - "@smithy/middleware-serde": "^4.2.9", - "@smithy/middleware-stack": "^4.2.8", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/node-http-handler": "^4.4.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-body-length-browser": "^4.2.0", - "@smithy/util-body-length-node": "^4.2.1", - "@smithy/util-defaults-mode-browser": "^4.3.28", - "@smithy/util-defaults-mode-node": "^4.2.31", - "@smithy/util-endpoints": "^3.2.8", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-retry": "^4.2.8", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/client-sso/node_modules/@aws-sdk/util-endpoints": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-endpoints/-/util-endpoints-3.982.0.tgz", - "integrity": "sha512-M27u8FJP7O0Of9hMWX5dipp//8iglmV9jr7R8SR8RveU+Z50/8TqH68Tu6wUWBGMfXjzbVwn1INIAO5lZrlxXQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-endpoints": "^3.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/core": { - "version": "3.973.6", - "resolved": "https://registry.npmjs.org/@aws-sdk/core/-/core-3.973.6.tgz", - "integrity": "sha512-pz4ZOw3BLG0NdF25HoB9ymSYyPbMiIjwQJ2aROXRhAzt+b+EOxStfFv8s5iZyP6Kiw7aYhyWxj5G3NhmkoOTKw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/xml-builder": "^3.972.4", - "@smithy/core": "^3.22.0", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/signature-v4": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/crc64-nvme": { - "version": "3.972.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/crc64-nvme/-/crc64-nvme-3.972.0.tgz", - "integrity": "sha512-ThlLhTqX68jvoIVv+pryOdb5coP1cX1/MaTbB9xkGDCbWbsqQcLqzPxuSoW1DCnAAIacmXCWpzUNOB9pv+xXQw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-env": { - "version": "3.972.4", - "resolved": 
"https://registry.npmjs.org/@aws-sdk/credential-provider-env/-/credential-provider-env-3.972.4.tgz", - "integrity": "sha512-/8dnc7+XNMmViEom2xsNdArQxQPSgy4Z/lm6qaFPTrMFesT1bV3PsBhb19n09nmxHdrtQskYmViddUIjUQElXg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-http": { - "version": "3.972.6", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-http/-/credential-provider-http-3.972.6.tgz", - "integrity": "sha512-5ERWqRljiZv44AIdvIRQ3k+EAV0Sq2WeJHvXuK7gL7bovSxOf8Al7MLH7Eh3rdovH4KHFnlIty7J71mzvQBl5Q==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/types": "^3.973.1", - "@smithy/fetch-http-handler": "^5.3.9", - "@smithy/node-http-handler": "^4.4.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - "@smithy/util-stream": "^4.5.10", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-ini": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-ini/-/credential-provider-ini-3.972.4.tgz", - "integrity": "sha512-eRUg+3HaUKuXWn/lEMirdiA5HOKmEl8hEHVuszIDt2MMBUKgVX5XNGmb3XmbgU17h6DZ+RtjbxQpjhz3SbTjZg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/credential-provider-env": "^3.972.4", - "@aws-sdk/credential-provider-http": "^3.972.6", - "@aws-sdk/credential-provider-login": "^3.972.4", - "@aws-sdk/credential-provider-process": "^3.972.4", - "@aws-sdk/credential-provider-sso": "^3.972.4", - "@aws-sdk/credential-provider-web-identity": "^3.972.4", - "@aws-sdk/nested-clients": "3.982.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/credential-provider-imds": "^4.2.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-login": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-login/-/credential-provider-login-3.972.4.tgz", - "integrity": "sha512-nLGjXuvWWDlQAp505xIONI7Gam0vw2p7Qu3P6on/W2q7rjJXtYjtpHbcsaOjJ/pAju3eTvEQuSuRedcRHVQIAQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/nested-clients": "3.982.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-node": { - "version": "3.972.5", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-node/-/credential-provider-node-3.972.5.tgz", - "integrity": "sha512-VWXKgSISQCI2GKN3zakTNHSiZ0+mux7v6YHmmbLQp/o3fvYUQJmKGcLZZzg2GFA+tGGBStplra9VFNf/WwxpYg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/credential-provider-env": "^3.972.4", - "@aws-sdk/credential-provider-http": "^3.972.6", - "@aws-sdk/credential-provider-ini": "^3.972.4", - "@aws-sdk/credential-provider-process": "^3.972.4", - "@aws-sdk/credential-provider-sso": "^3.972.4", - 
"@aws-sdk/credential-provider-web-identity": "^3.972.4", - "@aws-sdk/types": "^3.973.1", - "@smithy/credential-provider-imds": "^4.2.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-process": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-process/-/credential-provider-process-3.972.4.tgz", - "integrity": "sha512-TCZpWUnBQN1YPk6grvd5x419OfXjHvhj5Oj44GYb84dOVChpg/+2VoEj+YVA4F4E/6huQPNnX7UYbTtxJqgihw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-sso": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-sso/-/credential-provider-sso-3.972.4.tgz", - "integrity": "sha512-wzsGwv9mKlwJ3vHLyembBvGE/5nPUIwRR2I51B1cBV4Cb4ql9nIIfpmHzm050XYTY5fqTOKJQnhLj7zj89VG8g==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/client-sso": "3.982.0", - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/token-providers": "3.982.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/credential-provider-web-identity": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/credential-provider-web-identity/-/credential-provider-web-identity-3.972.4.tgz", - "integrity": "sha512-hIzw2XzrG8jzsUSEatehmpkd5rWzASg5IHUfA+m01k/RtvfAML7ZJVVohuKdhAYx+wV2AThLiQJVzqn7F0khrw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/nested-clients": "3.982.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-bucket-endpoint": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-bucket-endpoint/-/middleware-bucket-endpoint-3.972.3.tgz", - "integrity": "sha512-fmbgWYirF67YF1GfD7cg5N6HHQ96EyRNx/rDIrTF277/zTWVuPI2qS/ZHgofwR1NZPe/NWvoppflQY01LrbVLg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-arn-parser": "^3.972.2", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-config-provider": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-expect-continue": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-expect-continue/-/middleware-expect-continue-3.972.3.tgz", - "integrity": "sha512-4msC33RZsXQpUKR5QR4HnvBSNCPLGHmB55oDiROqqgyOc+TOfVu2xgi5goA7ms6MdZLeEh2905UfWMnMMF4mRg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-flexible-checksums": { - "version": 
"3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-flexible-checksums/-/middleware-flexible-checksums-3.972.4.tgz", - "integrity": "sha512-xOxsUkF3O3BtIe3tf54OpPo94eZepjFm3z0Dd2TZKbsPxMiRTFXurC04wJ58o/wPW9YHVO9VqZik3MfoPfrKlw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/crc32": "5.2.0", - "@aws-crypto/crc32c": "5.2.0", - "@aws-crypto/util": "5.2.0", - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/crc64-nvme": "3.972.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/is-array-buffer": "^4.2.0", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-stream": "^4.5.10", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-host-header": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-host-header/-/middleware-host-header-3.972.3.tgz", - "integrity": "sha512-aknPTb2M+G3s+0qLCx4Li/qGZH8IIYjugHMv15JTYMe6mgZO8VBpYgeGYsNMGCqCZOcWzuf900jFBG5bopfzmA==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-location-constraint": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-location-constraint/-/middleware-location-constraint-3.972.3.tgz", - "integrity": "sha512-nIg64CVrsXp67vbK0U1/Is8rik3huS3QkRHn2DRDx4NldrEFMgdkZGI/+cZMKD9k4YOS110Dfu21KZLHrFA/1g==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-logger": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-logger/-/middleware-logger-3.972.3.tgz", - "integrity": "sha512-Ftg09xNNRqaz9QNzlfdQWfpqMCJbsQdnZVJP55jfhbKi1+FTWxGuvfPoBhDHIovqWKjqbuiew3HuhxbJ0+OjgA==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-recursion-detection": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-recursion-detection/-/middleware-recursion-detection-3.972.3.tgz", - "integrity": "sha512-PY57QhzNuXHnwbJgbWYTrqIDHYSeOlhfYERTAuc16LKZpTZRJUjzBFokp9hF7u1fuGeE3D70ERXzdbMBOqQz7Q==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@aws/lambda-invoke-store": "^0.2.2", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-sdk-s3": { - "version": "3.972.6", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-sdk-s3/-/middleware-sdk-s3-3.972.6.tgz", - "integrity": "sha512-Xq7wM6kbgJN1UO++8dvH/efPb1nTwWqFCpZCR7RCLOETP7xAUAhVo7JmsCnML5Di/iC4Oo5VrJ4QmkYcMZniLw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-arn-parser": "^3.972.2", - "@smithy/core": "^3.22.0", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/signature-v4": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - 
"@smithy/util-config-provider": "^4.2.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-stream": "^4.5.10", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-ssec": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-ssec/-/middleware-ssec-3.972.3.tgz", - "integrity": "sha512-dU6kDuULN3o3jEHcjm0c4zWJlY1zWVkjG9NPe9qxYLLpcbdj5kRYBS2DdWYD+1B9f910DezRuws7xDEqKkHQIg==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-user-agent": { - "version": "3.972.6", - "resolved": "https://registry.npmjs.org/@aws-sdk/middleware-user-agent/-/middleware-user-agent-3.972.6.tgz", - "integrity": "sha512-TehLN8W/kivl0U9HcS+keryElEWORROpghDXZBLfnb40DXM7hx/i+7OOjkogXQOF3QtUraJVRkHQ07bPhrWKlw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-endpoints": "3.982.0", - "@smithy/core": "^3.22.0", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/middleware-user-agent/node_modules/@aws-sdk/util-endpoints": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-endpoints/-/util-endpoints-3.982.0.tgz", - "integrity": "sha512-M27u8FJP7O0Of9hMWX5dipp//8iglmV9jr7R8SR8RveU+Z50/8TqH68Tu6wUWBGMfXjzbVwn1INIAO5lZrlxXQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-endpoints": "^3.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/nested-clients": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/nested-clients/-/nested-clients-3.982.0.tgz", - "integrity": "sha512-VVkaH27digrJfdVrT64rjkllvOp4oRiZuuJvrylLXAKl18ujToJR7AqpDldL/LS63RVne3QWIpkygIymxFtliQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/sha256-browser": "5.2.0", - "@aws-crypto/sha256-js": "5.2.0", - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/middleware-host-header": "^3.972.3", - "@aws-sdk/middleware-logger": "^3.972.3", - "@aws-sdk/middleware-recursion-detection": "^3.972.3", - "@aws-sdk/middleware-user-agent": "^3.972.6", - "@aws-sdk/region-config-resolver": "^3.972.3", - "@aws-sdk/types": "^3.973.1", - "@aws-sdk/util-endpoints": "3.982.0", - "@aws-sdk/util-user-agent-browser": "^3.972.3", - "@aws-sdk/util-user-agent-node": "^3.972.4", - "@smithy/config-resolver": "^4.4.6", - "@smithy/core": "^3.22.0", - "@smithy/fetch-http-handler": "^5.3.9", - "@smithy/hash-node": "^4.2.8", - "@smithy/invalid-dependency": "^4.2.8", - "@smithy/middleware-content-length": "^4.2.8", - "@smithy/middleware-endpoint": "^4.4.12", - "@smithy/middleware-retry": "^4.4.29", - "@smithy/middleware-serde": "^4.2.9", - "@smithy/middleware-stack": "^4.2.8", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/node-http-handler": "^4.4.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/smithy-client": "^4.11.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-body-length-browser": "^4.2.0", - "@smithy/util-body-length-node": "^4.2.1", - "@smithy/util-defaults-mode-browser": "^4.3.28", - 
"@smithy/util-defaults-mode-node": "^4.2.31", - "@smithy/util-endpoints": "^3.2.8", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-retry": "^4.2.8", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/nested-clients/node_modules/@aws-sdk/util-endpoints": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-endpoints/-/util-endpoints-3.982.0.tgz", - "integrity": "sha512-M27u8FJP7O0Of9hMWX5dipp//8iglmV9jr7R8SR8RveU+Z50/8TqH68Tu6wUWBGMfXjzbVwn1INIAO5lZrlxXQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-endpoints": "^3.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/region-config-resolver": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/region-config-resolver/-/region-config-resolver-3.972.3.tgz", - "integrity": "sha512-v4J8qYAWfOMcZ4MJUyatntOicTzEMaU7j3OpkRCGGFSL2NgXQ5VbxauIyORA+pxdKZ0qQG2tCQjQjZDlXEC3Ow==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/config-resolver": "^4.4.6", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/signature-v4-multi-region": { - "version": "3.984.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/signature-v4-multi-region/-/signature-v4-multi-region-3.984.0.tgz", - "integrity": "sha512-TaWbfYCwnuOSvDSrgs7QgoaoXse49E7LzUkVOUhoezwB7bkmhp+iojADm7UepCEu4021SquD7NG1xA+WCvmldA==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/middleware-sdk-s3": "^3.972.6", - "@aws-sdk/types": "^3.973.1", - "@smithy/protocol-http": "^5.3.8", - "@smithy/signature-v4": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/token-providers": { - "version": "3.982.0", - "resolved": "https://registry.npmjs.org/@aws-sdk/token-providers/-/token-providers-3.982.0.tgz", - "integrity": "sha512-v3M0KYp2TVHYHNBT7jHD9lLTWAdS9CaWJ2jboRKt0WAB65bA7iUEpR+k4VqKYtpQN4+8kKSc4w+K6kUNZkHKQw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/core": "^3.973.6", - "@aws-sdk/nested-clients": "3.982.0", - "@aws-sdk/types": "^3.973.1", - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/types": { - "version": "3.973.1", - "resolved": "https://registry.npmjs.org/@aws-sdk/types/-/types-3.973.1.tgz", - "integrity": "sha512-DwHBiMNOB468JiX6+i34c+THsKHErYUdNQ3HexeXZvVn4zouLjgaS4FejiGSi2HyBuzuyHg7SuOPmjSvoU9NRg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/util-arn-parser": { - "version": "3.972.2", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-arn-parser/-/util-arn-parser-3.972.2.tgz", - "integrity": "sha512-VkykWbqMjlSgBFDyrY3nOSqupMc6ivXuGmvci6Q3NnLq5kC+mKQe2QBZ4nrWRE/jqOxeFP2uYzLtwncYYcvQDg==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/util-endpoints": { - "version": "3.984.0", - "resolved": 
"https://registry.npmjs.org/@aws-sdk/util-endpoints/-/util-endpoints-3.984.0.tgz", - "integrity": "sha512-9ebjLA0hMKHeVvXEtTDCCOBtwjb0bOXiuUV06HNeVdgAjH6gj4x4Zwt4IBti83TiyTGOCl5YfZqGx4ehVsasbQ==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-endpoints": "^3.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/util-locate-window": { - "version": "3.965.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-locate-window/-/util-locate-window-3.965.4.tgz", - "integrity": "sha512-H1onv5SkgPBK2P6JR2MjGgbOnttoNzSPIRoeZTNPZYyaplwGg50zS3amXvXqF0/qfXpWEC9rLWU564QTB9bSog==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws-sdk/util-user-agent-browser": { - "version": "3.972.3", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-user-agent-browser/-/util-user-agent-browser-3.972.3.tgz", - "integrity": "sha512-JurOwkRUcXD/5MTDBcqdyQ9eVedtAsZgw5rBwktsPTN7QtPiS2Ld1jkJepNgYoCufz1Wcut9iup7GJDoIHp8Fw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/types": "^3.973.1", - "@smithy/types": "^4.12.0", - "bowser": "^2.11.0", - "tslib": "^2.6.2" - } - }, - "node_modules/@aws-sdk/util-user-agent-node": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/util-user-agent-node/-/util-user-agent-node-3.972.4.tgz", - "integrity": "sha512-3WFCBLiM8QiHDfosQq3Py+lIMgWlFWwFQliUHUqwEiRqLnKyhgbU3AKa7AWJF7lW2Oc/2kFNY4MlAYVnVc0i8A==", - "license": "Apache-2.0", - "dependencies": { - "@aws-sdk/middleware-user-agent": "^3.972.6", - "@aws-sdk/types": "^3.973.1", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - }, - "peerDependencies": { - "aws-crt": ">=1.0.0" - }, - "peerDependenciesMeta": { - "aws-crt": { - "optional": true - } - } - }, - "node_modules/@aws-sdk/xml-builder": { - "version": "3.972.4", - "resolved": "https://registry.npmjs.org/@aws-sdk/xml-builder/-/xml-builder-3.972.4.tgz", - "integrity": "sha512-0zJ05ANfYqI6+rGqj8samZBFod0dPPousBjLEqg8WdxSgbMAkRgLyn81lP215Do0rFJ/17LIXwr7q0yK24mP6Q==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "fast-xml-parser": "5.3.4", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@aws/lambda-invoke-store": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/@aws/lambda-invoke-store/-/lambda-invoke-store-0.2.3.tgz", - "integrity": "sha512-oLvsaPMTBejkkmHhjf09xTgk71mOqyr/409NKhRIL08If7AhVfUsJhVsx386uJaqNd42v9kWamQ9lFbkoC2dYw==", - "license": "Apache-2.0", - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure-rest/core-client": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@azure-rest/core-client/-/core-client-2.5.1.tgz", - "integrity": "sha512-EHaOXW0RYDKS5CFffnixdyRPak5ytiCtU7uXDcP/uiY+A6jFRwNGzzJBiznkCzvi5EYpY+YWinieqHb0oY916A==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.10.0", - "@azure/core-rest-pipeline": "^1.22.0", - "@azure/core-tracing": "^1.3.0", - "@typespec/ts-http-runtime": "^0.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/abort-controller": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.1.2.tgz", - 
"integrity": "sha512-nBrLsEWm4J2u5LpAPjxADTlq3trDgVZZXHNKabeXZtpq3d3AbN/KGO82R87rdDz5/lYB024rtEf10/q0urNgsA==", - "license": "MIT", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure/core-auth": { - "version": "1.10.1", - "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.10.1.tgz", - "integrity": "sha512-ykRMW8PjVAn+RS6ww5cmK9U2CyH9p4Q88YJwvUslfuMmN98w/2rdGRLPqJYObapBCdzBVeDgYWdJnFPFb7qzpg==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-util": "^1.13.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/core-client": { - "version": "1.10.1", - "resolved": "https://registry.npmjs.org/@azure/core-client/-/core-client-1.10.1.tgz", - "integrity": "sha512-Nh5PhEOeY6PrnxNPsEHRr9eimxLwgLlpmguQaHKBinFYA/RU9+kOYVOQqOrTsCL+KSxrLLl1gD8Dk5BFW/7l/w==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.10.0", - "@azure/core-rest-pipeline": "^1.22.0", - "@azure/core-tracing": "^1.3.0", - "@azure/core-util": "^1.13.0", - "@azure/logger": "^1.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/core-http-compat": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/@azure/core-http-compat/-/core-http-compat-2.3.2.tgz", - "integrity": "sha512-Tf6ltdKzOJEgxZeWLCjMxrxbodB/ZeCbzzA1A2qHbhzAjzjHoBVSUeSl/baT/oHAxhc4qdqVaDKnc2+iE932gw==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2" - }, - "engines": { - "node": ">=20.0.0" - }, - "peerDependencies": { - "@azure/core-client": "^1.10.0", - "@azure/core-rest-pipeline": "^1.22.0" - } - }, - "node_modules/@azure/core-lro": { - "version": "2.7.2", - "resolved": "https://registry.npmjs.org/@azure/core-lro/-/core-lro-2.7.2.tgz", - "integrity": "sha512-0YIpccoX8m/k00O7mDDMdJpbr6mf1yWo2dfmxt5A8XVZVVMz2SSKaEbMCeJRvgQ0IaSlqhjT47p4hVIRRy90xw==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.0.0", - "@azure/core-util": "^1.2.0", - "@azure/logger": "^1.0.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure/core-paging": { - "version": "1.6.2", - "resolved": "https://registry.npmjs.org/@azure/core-paging/-/core-paging-1.6.2.tgz", - "integrity": "sha512-YKWi9YuCU04B55h25cnOYZHxXYtEvQEbKST5vqRga7hWY9ydd3FZHdeQF8pyh+acWZvppw13M/LMGx0LABUVMA==", - "license": "MIT", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure/core-rest-pipeline": { - "version": "1.22.2", - "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.22.2.tgz", - "integrity": "sha512-MzHym+wOi8CLUlKCQu12de0nwcq9k9Kuv43j4Wa++CsCpJwps2eeBQwD2Bu8snkxTtDKDx4GwjuR9E8yC8LNrg==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.10.0", - "@azure/core-tracing": "^1.3.0", - "@azure/core-util": "^1.13.0", - "@azure/logger": "^1.3.0", - "@typespec/ts-http-runtime": "^0.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/core-tracing": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.3.1.tgz", - "integrity": "sha512-9MWKevR7Hz8kNzzPLfX4EAtGM2b8mr50HPDBvio96bURP/9C+HjdH3sBlLSNNrvRAr5/k/svoH457gB5IKpmwQ==", - "license": "MIT", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": 
">=20.0.0" - } - }, - "node_modules/@azure/core-util": { - "version": "1.13.1", - "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.13.1.tgz", - "integrity": "sha512-XPArKLzsvl0Hf0CaGyKHUyVgF7oDnhKoP85Xv6M4StF/1AhfORhZudHtOyf2s+FcbuQ9dPRAjB8J2KvRRMUK2A==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@typespec/ts-http-runtime": "^0.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/core-xml": { - "version": "1.5.0", - "resolved": "https://registry.npmjs.org/@azure/core-xml/-/core-xml-1.5.0.tgz", - "integrity": "sha512-D/sdlJBMJfx7gqoj66PKVmhDDaU6TKA49ptcolxdas29X7AfvLTmfAGLjAcIMBK7UZ2o4lygHIqVckOlQU3xWw==", - "license": "MIT", - "dependencies": { - "fast-xml-parser": "^5.0.7", - "tslib": "^2.8.1" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/identity": { - "version": "4.13.0", - "resolved": "https://registry.npmjs.org/@azure/identity/-/identity-4.13.0.tgz", - "integrity": "sha512-uWC0fssc+hs1TGGVkkghiaFkkS7NkTxfnCH+Hdg+yTehTpMcehpok4PgUKKdyCH+9ldu6FhiHRv84Ntqj1vVcw==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.0.0", - "@azure/core-auth": "^1.9.0", - "@azure/core-client": "^1.9.2", - "@azure/core-rest-pipeline": "^1.17.0", - "@azure/core-tracing": "^1.0.0", - "@azure/core-util": "^1.11.0", - "@azure/logger": "^1.0.0", - "@azure/msal-browser": "^4.2.0", - "@azure/msal-node": "^3.5.0", - "open": "^10.1.0", - "tslib": "^2.2.0" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/keyvault-common": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@azure/keyvault-common/-/keyvault-common-2.0.0.tgz", - "integrity": "sha512-wRLVaroQtOqfg60cxkzUkGKrKMsCP6uYXAOomOIysSMyt1/YM0eUn9LqieAWM8DLcU4+07Fio2YGpPeqUbpP9w==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.0.0", - "@azure/core-auth": "^1.3.0", - "@azure/core-client": "^1.5.0", - "@azure/core-rest-pipeline": "^1.8.0", - "@azure/core-tracing": "^1.0.0", - "@azure/core-util": "^1.10.0", - "@azure/logger": "^1.1.4", - "tslib": "^2.2.0" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure/keyvault-keys": { - "version": "4.10.0", - "resolved": "https://registry.npmjs.org/@azure/keyvault-keys/-/keyvault-keys-4.10.0.tgz", - "integrity": "sha512-eDT7iXoBTRZ2n3fLiftuGJFD+yjkiB1GNqzU2KbY1TLYeXeSPVTVgn2eJ5vmRTZ11978jy2Kg2wI7xa9Tyr8ag==", - "license": "MIT", - "dependencies": { - "@azure-rest/core-client": "^2.3.3", - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.9.0", - "@azure/core-http-compat": "^2.2.0", - "@azure/core-lro": "^2.7.2", - "@azure/core-paging": "^1.6.2", - "@azure/core-rest-pipeline": "^1.19.0", - "@azure/core-tracing": "^1.2.0", - "@azure/core-util": "^1.11.0", - "@azure/keyvault-common": "^2.0.0", - "@azure/logger": "^1.1.4", - "tslib": "^2.8.1" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@azure/logger": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.3.0.tgz", - "integrity": "sha512-fCqPIfOcLE+CGqGPd66c8bZpwAji98tZ4JI9i/mlTNTlsIWslCfpg48s/ypyLxZTump5sypjrKn2/kY7q8oAbA==", - "license": "MIT", - "dependencies": { - "@typespec/ts-http-runtime": "^0.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/msal-browser": { - "version": "4.28.1", - "resolved": "https://registry.npmjs.org/@azure/msal-browser/-/msal-browser-4.28.1.tgz", - "integrity": 
"sha512-al2u2fTchbClq3L4C1NlqLm+vwKfhYCPtZN2LR/9xJVaQ4Mnrwf5vANvuyPSJHcGvw50UBmhuVmYUAhTEetTpA==", - "license": "MIT", - "dependencies": { - "@azure/msal-common": "15.14.1" - }, - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/@azure/msal-common": { - "version": "15.14.1", - "resolved": "https://registry.npmjs.org/@azure/msal-common/-/msal-common-15.14.1.tgz", - "integrity": "sha512-IkzF7Pywt6QKTS0kwdCv/XV8x8JXknZDvSjj/IccooxnP373T5jaadO3FnOrbWo3S0UqkfIDyZNTaQ/oAgRdXw==", - "license": "MIT", - "engines": { - "node": ">=0.8.0" - } - }, - "node_modules/@azure/msal-node": { - "version": "3.8.6", - "resolved": "https://registry.npmjs.org/@azure/msal-node/-/msal-node-3.8.6.tgz", - "integrity": "sha512-XTmhdItcBckcVVTy65Xp+42xG4LX5GK+9AqAsXPXk4IqUNv+LyQo5TMwNjuFYBfAB2GTG9iSQGk+QLc03vhf3w==", - "license": "MIT", - "dependencies": { - "@azure/msal-common": "15.14.1", - "jsonwebtoken": "^9.0.0", - "uuid": "^8.3.0" - }, - "engines": { - "node": ">=16" - } - }, - "node_modules/@azure/storage-blob": { - "version": "12.30.0", - "resolved": "https://registry.npmjs.org/@azure/storage-blob/-/storage-blob-12.30.0.tgz", - "integrity": "sha512-peDCR8blSqhsAKDbpSP/o55S4sheNwSrblvCaHUZ5xUI73XA7ieUGGwrONgD/Fng0EoDe1VOa3fAQ7+WGB3Ocg==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.9.0", - "@azure/core-client": "^1.9.3", - "@azure/core-http-compat": "^2.2.0", - "@azure/core-lro": "^2.2.0", - "@azure/core-paging": "^1.6.2", - "@azure/core-rest-pipeline": "^1.19.1", - "@azure/core-tracing": "^1.2.0", - "@azure/core-util": "^1.11.0", - "@azure/core-xml": "^1.4.5", - "@azure/logger": "^1.1.4", - "@azure/storage-common": "^12.2.0", - "events": "^3.0.0", - "tslib": "^2.8.1" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@azure/storage-common": { - "version": "12.3.0", - "resolved": "https://registry.npmjs.org/@azure/storage-common/-/storage-common-12.3.0.tgz", - "integrity": "sha512-/OFHhy86aG5Pe8dP5tsp+BuJ25JOAl9yaMU3WZbkeoiFMHFtJ7tu5ili7qEdBXNW9G5lDB19trwyI6V49F/8iQ==", - "license": "MIT", - "dependencies": { - "@azure/abort-controller": "^2.1.2", - "@azure/core-auth": "^1.9.0", - "@azure/core-http-compat": "^2.2.0", - "@azure/core-rest-pipeline": "^1.19.1", - "@azure/core-tracing": "^1.2.0", - "@azure/core-util": "^1.11.0", - "@azure/logger": "^1.1.4", - "events": "^3.3.0", - "tslib": "^2.8.1" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@babel/helper-string-parser": { - "version": "7.27.1", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", - "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/helper-validator-identifier": { - "version": "7.28.5", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", - "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@babel/parser": { - "version": "7.29.0", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.0.tgz", - "integrity": "sha512-IyDgFV5GeDUVX4YdF/3CPULtVGSXXMLh1xVIgdCgxApktqnQV0r7/8Nqthg+8YLGaAtdyIlo2qIdZrbCv4+7ww==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/types": "^7.29.0" - }, - "bin": 
{ - "parser": "bin/babel-parser.js" - }, - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@babel/types": { - "version": "7.29.0", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.0.tgz", - "integrity": "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/helper-string-parser": "^7.27.1", - "@babel/helper-validator-identifier": "^7.28.5" - }, - "engines": { - "node": ">=6.9.0" - } - }, - "node_modules/@bcoe/v8-coverage": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-1.0.2.tgz", - "integrity": "sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/@biomejs/biome": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/biome/-/biome-2.3.14.tgz", - "integrity": "sha512-QMT6QviX0WqXJCaiqVMiBUCr5WRQ1iFSjvOLoTk6auKukJMvnMzWucXpwZB0e8F00/1/BsS9DzcKgWH+CLqVuA==", - "dev": true, - "license": "MIT OR Apache-2.0", - "bin": { - "biome": "bin/biome" - }, - "engines": { - "node": ">=14.21.3" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/biome" - }, - "optionalDependencies": { - "@biomejs/cli-darwin-arm64": "2.3.14", - "@biomejs/cli-darwin-x64": "2.3.14", - "@biomejs/cli-linux-arm64": "2.3.14", - "@biomejs/cli-linux-arm64-musl": "2.3.14", - "@biomejs/cli-linux-x64": "2.3.14", - "@biomejs/cli-linux-x64-musl": "2.3.14", - "@biomejs/cli-win32-arm64": "2.3.14", - "@biomejs/cli-win32-x64": "2.3.14" - } - }, - "node_modules/@biomejs/cli-darwin-arm64": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-arm64/-/cli-darwin-arm64-2.3.14.tgz", - "integrity": "sha512-UJGPpvWJMkLxSRtpCAKfKh41Q4JJXisvxZL8ChN1eNW3m/WlPFJ6EFDCE7YfUb4XS8ZFi3C1dFpxUJ0Ety5n+A==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-darwin-x64": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-darwin-x64/-/cli-darwin-x64-2.3.14.tgz", - "integrity": "sha512-PNkLNQG6RLo8lG7QoWe/hhnMxJIt1tEimoXpGQjwS/dkdNiKBLPv4RpeQl8o3s1OKI3ZOR5XPiYtmbGGHAOnLA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-linux-arm64": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64/-/cli-linux-arm64-2.3.14.tgz", - "integrity": "sha512-KT67FKfzIw6DNnUNdYlBg+eU24Go3n75GWK6NwU4+yJmDYFe9i/MjiI+U/iEzKvo0g7G7MZqoyrhIYuND2w8QQ==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-linux-arm64-musl": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-arm64-musl/-/cli-linux-arm64-musl-2.3.14.tgz", - "integrity": "sha512-LInRbXhYujtL3sH2TMCH/UBwJZsoGwfQjBrMfl84CD4hL/41C/EU5mldqf1yoFpsI0iPWuU83U+nB2TUUypWeg==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-linux-x64": { - "version": 
"2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64/-/cli-linux-x64-2.3.14.tgz", - "integrity": "sha512-ZsZzQsl9U+wxFrGGS4f6UxREUlgHwmEfu1IrXlgNFrNnd5Th6lIJr8KmSzu/+meSa9f4rzFrbEW9LBBA6ScoMA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-linux-x64-musl": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-linux-x64-musl/-/cli-linux-x64-musl-2.3.14.tgz", - "integrity": "sha512-KQU7EkbBBuHPW3/rAcoiVmhlPtDSGOGRPv9js7qJVpYTzjQmVR+C9Rfcz+ti8YCH+zT1J52tuBybtP4IodjxZQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-win32-arm64": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-arm64/-/cli-win32-arm64-2.3.14.tgz", - "integrity": "sha512-+IKYkj/pUBbnRf1G1+RlyA3LWiDgra1xpS7H2g4BuOzzRbRB+hmlw0yFsLprHhbbt7jUzbzAbAjK/Pn0FDnh1A==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@biomejs/cli-win32-x64": { - "version": "2.3.14", - "resolved": "https://registry.npmjs.org/@biomejs/cli-win32-x64/-/cli-win32-x64-2.3.14.tgz", - "integrity": "sha512-oizCjdyQ3WJEswpb3Chdngeat56rIdSYK12JI3iI11Mt5T5EXcZ7WLuowzEaFPNJ3zmOQFliMN8QY1Pi+qsfdQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT OR Apache-2.0", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=14.21.3" - } - }, - "node_modules/@colors/colors": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.6.0.tgz", - "integrity": "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==", - "license": "MIT", - "engines": { - "node": ">=0.1.90" - } - }, - "node_modules/@dabh/diagnostics": { - "version": "2.0.8", - "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.8.tgz", - "integrity": "sha512-R4MSXTVnuMzGD7bzHdW2ZhhdPC/igELENcq5IjEverBvq5hn1SXCWcsi6eSsdWP0/Ur+SItRRjAktmdoX/8R/Q==", - "license": "MIT", - "dependencies": { - "@so-ric/colorspace": "^1.1.6", - "enabled": "2.0.x", - "kuler": "^2.0.0" - } - }, - "node_modules/@datastructures-js/deque": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@datastructures-js/deque/-/deque-1.0.8.tgz", - "integrity": "sha512-PSBhJ2/SmeRPRHuBv7i/fHWIdSC3JTyq56qb+Rq0wjOagi0/fdV5/B/3Md5zFZus/W6OkSPMaxMKKMNMrSmubg==", - "license": "MIT" - }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.3.tgz", - "integrity": "sha512-9fJMTNFTWZMh5qwrBItuziu834eOCUcEqymSH7pY+zoMVEZg3gcPuBNxH1EvfVYe9h0x/Ptw8KBzv7qxb7l8dg==", - "cpu": [ - "ppc64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "aix" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-arm": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.3.tgz", - "integrity": "sha512-i5D1hPY7GIQmXlXhs2w8AWHhenb00+GxjxRncS2ZM7YNVGNfaMxgzSGuO8o8SJzRc/oZwU2bcScvVERk03QhzA==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - 
"node_modules/@esbuild/android-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.3.tgz", - "integrity": "sha512-YdghPYUmj/FX2SYKJ0OZxf+iaKgMsKHVPF1MAq/P8WirnSpCStzKJFjOjzsW0QQ7oIAiccHdcqjbHmJxRb/dmg==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/android-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.3.tgz", - "integrity": "sha512-IN/0BNTkHtk8lkOM8JWAYFg4ORxBkZQf9zXiEOfERX/CzxW3Vg1ewAhU7QSWQpVIzTW+b8Xy+lGzdYXV6UZObQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.3.tgz", - "integrity": "sha512-Re491k7ByTVRy0t3EKWajdLIr0gz2kKKfzafkth4Q8A5n1xTHrkqZgLLjFEHVD+AXdUGgQMq+Godfq45mGpCKg==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.3.tgz", - "integrity": "sha512-vHk/hA7/1AckjGzRqi6wbo+jaShzRowYip6rt6q7VYEDX4LEy1pZfDpdxCBnGtl+A5zq8iXDcyuxwtv3hNtHFg==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.3.tgz", - "integrity": "sha512-ipTYM2fjt3kQAYOvo6vcxJx3nBYAzPjgTCk7QEgZG8AUO3ydUhvelmhrbOheMnGOlaSFUoHXB6un+A7q4ygY9w==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.3.tgz", - "integrity": "sha512-dDk0X87T7mI6U3K9VjWtHOXqwAMJBNN2r7bejDsc+j03SEjtD9HrOl8gVFByeM0aJksoUuUVU9TBaZa2rgj0oA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.3.tgz", - "integrity": "sha512-s6nPv2QkSupJwLYyfS+gwdirm0ukyTFNl3KTgZEAiJDd+iHZcbTPPcWCcRYH+WlNbwChgH2QkE9NSlNrMT8Gfw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.3.tgz", - "integrity": "sha512-sZOuFz/xWnZ4KH3YfFrKCf1WyPZHakVzTiqji3WDc0BCl2kBwiJLCXpzLzUBLgmp4veFZdvN5ChW4Eq/8Fc2Fg==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.3.tgz", - "integrity": 
"sha512-yGlQYjdxtLdh0a3jHjuwOrxQjOZYD/C9PfdbgJJF3TIZWnm/tMd/RcNiLngiu4iwcBAOezdnSLAwQDPqTmtTYg==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.3.tgz", - "integrity": "sha512-WO60Sn8ly3gtzhyjATDgieJNet/KqsDlX5nRC5Y3oTFcS1l0KWba+SEa9Ja1GfDqSF1z6hif/SkpQJbL63cgOA==", - "cpu": [ - "loong64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.3.tgz", - "integrity": "sha512-APsymYA6sGcZ4pD6k+UxbDjOFSvPWyZhjaiPyl/f79xKxwTnrn5QUnXR5prvetuaSMsb4jgeHewIDCIWljrSxw==", - "cpu": [ - "mips64el" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.3.tgz", - "integrity": "sha512-eizBnTeBefojtDb9nSh4vvVQ3V9Qf9Df01PfawPcRzJH4gFSgrObw+LveUyDoKU3kxi5+9RJTCWlj4FjYXVPEA==", - "cpu": [ - "ppc64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.3.tgz", - "integrity": "sha512-3Emwh0r5wmfm3ssTWRQSyVhbOHvqegUDRd0WhmXKX2mkHJe1SFCMJhagUleMq+Uci34wLSipf8Lagt4LlpRFWQ==", - "cpu": [ - "riscv64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.3.tgz", - "integrity": "sha512-pBHUx9LzXWBc7MFIEEL0yD/ZVtNgLytvx60gES28GcWMqil8ElCYR4kvbV2BDqsHOvVDRrOxGySBM9Fcv744hw==", - "cpu": [ - "s390x" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/linux-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.3.tgz", - "integrity": "sha512-Czi8yzXUWIQYAtL/2y6vogER8pvcsOsk5cpwL4Gk5nJqH5UZiVByIY8Eorm5R13gq+DQKYg0+JyQoytLQas4dA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.3.tgz", - "integrity": "sha512-sDpk0RgmTCR/5HguIZa9n9u+HVKf40fbEUt+iTzSnCaGvY9kFP0YKBWZtJaraonFnqef5SlJ8/TiPAxzyS+UoA==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.3.tgz", - "integrity": "sha512-P14lFKJl/DdaE00LItAukUdZO5iqNH7+PjoBm+fLQjtxfcfFE20Xf5CrLsmZdq5LFFZzb5JMZ9grUwvtVYzjiA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], - 
"engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.3.tgz", - "integrity": "sha512-AIcMP77AvirGbRl/UZFTq5hjXK+2wC7qFRGoHSDrZ5v5b8DK/GYpXW3CPRL53NkvDqb9D+alBiC/dV0Fb7eJcw==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.3.tgz", - "integrity": "sha512-DnW2sRrBzA+YnE70LKqnM3P+z8vehfJWHXECbwBmH/CU51z6FiqTQTHFenPlHmo3a8UgpLyH3PT+87OViOh1AQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/openharmony-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.3.tgz", - "integrity": "sha512-NinAEgr/etERPTsZJ7aEZQvvg/A6IsZG/LgZy+81wON2huV7SrK3e63dU0XhyZP4RKGyTm7aOgmQk0bGp0fy2g==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "openharmony" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.3.tgz", - "integrity": "sha512-PanZ+nEz+eWoBJ8/f8HKxTTD172SKwdXebZ0ndd953gt1HRBbhMsaNqjTyYLGLPdoWHy4zLU7bDVJztF5f3BHA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "sunos" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.3.tgz", - "integrity": "sha512-B2t59lWWYrbRDw/tjiWOuzSsFh1Y/E95ofKz7rIVYSQkUYBjfSgf6oeYPNWHToFRr2zx52JKApIcAS/D5TUBnA==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.3.tgz", - "integrity": "sha512-QLKSFeXNS8+tHW7tZpMtjlNb7HKau0QDpwm49u0vUp9y1WOF+PEzkU84y9GqYaAVW8aH8f3GcBck26jh54cX4Q==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@esbuild/win32-x64": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.3.tgz", - "integrity": "sha512-4uJGhsxuptu3OcpVAzli+/gWusVGwZZHTlS63hh++ehExkVT8SgiEf7/uC/PclrPPkLhZqGgCTjd0VWLo6xMqA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">=18" - } - }, - "node_modules/@google-cloud/paginator": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/@google-cloud/paginator/-/paginator-5.0.2.tgz", - "integrity": "sha512-DJS3s0OVH4zFDB1PzjxAsHqJT6sKVbRwwML0ZBP9PbU7Yebtu/7SWMRzvO2J3nUi9pRNITCfu4LJeooM2w4pjg==", - "license": "Apache-2.0", - "dependencies": { - "arrify": "^2.0.0", - "extend": "^3.0.2" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@google-cloud/projectify": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/@google-cloud/projectify/-/projectify-4.0.0.tgz", - "integrity": 
"sha512-MmaX6HeSvyPbWGwFq7mXdo0uQZLGBYCwziiLIGq5JVX+/bdI3SAq6bP98trV5eTWfLuvsMcIC1YJOF2vfteLFA==", - "license": "Apache-2.0", - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@google-cloud/promisify": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/@google-cloud/promisify/-/promisify-4.0.0.tgz", - "integrity": "sha512-Orxzlfb9c67A15cq2JQEyVc7wEsmFBmHjZWZYQMUyJ1qivXyMwdyNOs9odi79hze+2zqdTtu1E19IM/FtqZ10g==", - "license": "Apache-2.0", - "engines": { - "node": ">=14" - } - }, - "node_modules/@google-cloud/storage": { - "version": "7.19.0", - "resolved": "https://registry.npmjs.org/@google-cloud/storage/-/storage-7.19.0.tgz", - "integrity": "sha512-n2FjE7NAOYyshogdc7KQOl/VZb4sneqPjWouSyia9CMDdMhRX5+RIbqalNmC7LOLzuLAN89VlF2HvG8na9G+zQ==", - "license": "Apache-2.0", - "dependencies": { - "@google-cloud/paginator": "^5.0.0", - "@google-cloud/projectify": "^4.0.0", - "@google-cloud/promisify": "<4.1.0", - "abort-controller": "^3.0.0", - "async-retry": "^1.3.3", - "duplexify": "^4.1.3", - "fast-xml-parser": "^5.3.4", - "gaxios": "^6.0.2", - "google-auth-library": "^9.6.3", - "html-entities": "^2.5.2", - "mime": "^3.0.0", - "p-limit": "^3.0.1", - "retry-request": "^7.0.0", - "teeny-request": "^9.0.0", - "uuid": "^8.0.0" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/@graphql-typed-document-node/core": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/@graphql-typed-document-node/core/-/core-3.2.0.tgz", - "integrity": "sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ==", - "license": "MIT", - "peerDependencies": { - "graphql": "^0.8.0 || ^0.9.0 || ^0.10.0 || ^0.11.0 || ^0.12.0 || ^0.13.0 || ^14.0.0 || ^15.0.0 || ^16.0.0 || ^17.0.0" - } - }, - "node_modules/@grpc/grpc-js": { - "version": "1.14.3", - "resolved": "https://registry.npmjs.org/@grpc/grpc-js/-/grpc-js-1.14.3.tgz", - "integrity": "sha512-Iq8QQQ/7X3Sac15oB6p0FmUg/klxQvXLeileoqrTRGJYLV+/9tubbr9ipz0GKHjmXVsgFPo/+W+2cA8eNcR+XA==", - "license": "Apache-2.0", - "dependencies": { - "@grpc/proto-loader": "^0.8.0", - "@js-sdsl/ordered-map": "^4.4.2" - }, - "engines": { - "node": ">=12.10.0" - } - }, - "node_modules/@grpc/grpc-js/node_modules/@grpc/proto-loader": { - "version": "0.8.0", - "resolved": "https://registry.npmjs.org/@grpc/proto-loader/-/proto-loader-0.8.0.tgz", - "integrity": "sha512-rc1hOQtjIWGxcxpb9aHAfLpIctjEnsDehj0DAiVfBlmT84uvR0uUtN2hEi/ecvWVjXUGf5qPF4qEgiLOx1YIMQ==", - "license": "Apache-2.0", - "dependencies": { - "lodash.camelcase": "^4.3.0", - "long": "^5.0.0", - "protobufjs": "^7.5.3", - "yargs": "^17.7.2" - }, - "bin": { - "proto-loader-gen-types": "build/bin/proto-loader-gen-types.js" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/@grpc/proto-loader": { - "version": "0.7.15", - "resolved": "https://registry.npmjs.org/@grpc/proto-loader/-/proto-loader-0.7.15.tgz", - "integrity": "sha512-tMXdRCfYVixjuFK+Hk0Q1s38gV9zDiDJfWL3h1rv4Qc39oILCu1TRTDt7+fGUI8K4G1Fj125Hx/ru3azECWTyQ==", - "license": "Apache-2.0", - "dependencies": { - "lodash.camelcase": "^4.3.0", - "long": "^5.0.0", - "protobufjs": "^7.2.5", - "yargs": "^17.7.2" - }, - "bin": { - "proto-loader-gen-types": "build/bin/proto-loader-gen-types.js" - }, - "engines": { - "node": ">=6" - } - }, - "node_modules/@hono/event-emitter": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@hono/event-emitter/-/event-emitter-2.0.0.tgz", - "integrity": "sha512-7/zg7hfPh9lncYCxU3avk40vGhiqP4D5NvmaNX+8QxXivIkrLckSia5P4Nz6PH+A1T8Aj3yFlONf4AiX5rqaEA==", - 
"license": "MIT", - "engines": { - "node": ">=16.0.0" - }, - "peerDependencies": { - "hono": "*" - } - }, - "node_modules/@hono/node-server": { - "version": "1.19.9", - "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.9.tgz", - "integrity": "sha512-vHL6w3ecZsky+8P5MD+eFfaGTyCeOHUIFYMGpQGbrBTSmNNoxv0if69rEZ5giu36weC5saFuznL411gRX7bJDw==", - "license": "MIT", - "engines": { - "node": ">=18.14.1" - }, - "peerDependencies": { - "hono": "^4" - } - }, - "node_modules/@hono/node-ws": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/@hono/node-ws/-/node-ws-1.3.0.tgz", - "integrity": "sha512-ju25YbbvLuXdqBCmLZLqnNYu1nbHIQjoyUqA8ApZOeL1k4skuiTcw5SW77/5SUYo2Xi2NVBJoVlfQurnKEp03Q==", - "license": "MIT", - "dependencies": { - "ws": "^8.17.0" - }, - "engines": { - "node": ">=18.14.1" - }, - "peerDependencies": { - "@hono/node-server": "^1.19.2", - "hono": "^4.6.0" - } - }, - "node_modules/@hono/otel": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@hono/otel/-/otel-1.1.0.tgz", - "integrity": "sha512-3lXExGP+odVTF3W1kTHgRGw4d4xdiYpeRs8dnTwfnHfw5uGEXgUzmkB4/ZQd3tDxYRt7eUhnWuBk5ChV97eqkA==", - "license": "MIT", - "dependencies": { - "@opentelemetry/api": "^1.9.0", - "@opentelemetry/semantic-conventions": "^1.28.0" - }, - "peerDependencies": { - "hono": ">=4.0.0" - } - }, - "node_modules/@hono/zod-openapi": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/@hono/zod-openapi/-/zod-openapi-1.2.1.tgz", - "integrity": "sha512-aZza4V8wkqpdHBWFNPiCeWd0cGOXbYuQW9AyezHs/jwQm5p67GkUyXwfthAooAwnG7thTpvOJkThZpCoY6us8w==", - "license": "MIT", - "dependencies": { - "@asteasolutions/zod-to-openapi": "^8.1.0", - "@hono/zod-validator": "^0.7.6", - "openapi3-ts": "^4.5.0" - }, - "engines": { - "node": ">=16.0.0" - }, - "peerDependencies": { - "hono": ">=4.3.6", - "zod": "^4.0.0" - } - }, - "node_modules/@hono/zod-validator": { - "version": "0.7.6", - "resolved": "https://registry.npmjs.org/@hono/zod-validator/-/zod-validator-0.7.6.tgz", - "integrity": "sha512-Io1B6d011Gj1KknV4rXYz4le5+5EubcWEU/speUjuw9XMMIaP3n78yXLhjd2A3PXaXaUwEAluOiAyLqhBEJgsw==", - "license": "MIT", - "peerDependencies": { - "hono": ">=3.9.0", - "zod": "^3.25.0 || ^4.0.0" - } - }, - "node_modules/@isaacs/balanced-match": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/balanced-match/-/balanced-match-4.0.1.tgz", - "integrity": "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": "20 || >=22" - } - }, - "node_modules/@isaacs/brace-expansion": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/brace-expansion/-/brace-expansion-5.0.1.tgz", - "integrity": "sha512-WMz71T1JS624nWj2n2fnYAuPovhv7EUhk69R6i9dsVyzxt5eM3bjwvgk9L+APE1TRscGysAVMANkB0jh0LQZrQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@isaacs/balanced-match": "^4.0.1" - }, - "engines": { - "node": "20 || >=22" - } - }, - "node_modules/@jridgewell/gen-mapping": { - "version": "0.3.13", - "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", - "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/sourcemap-codec": "^1.5.0", - "@jridgewell/trace-mapping": "^0.3.24" - } - }, - "node_modules/@jridgewell/resolve-uri": { - "version": "3.1.2", - "resolved": 
"https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", - "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=6.0.0" - } - }, - "node_modules/@jridgewell/sourcemap-codec": { - "version": "1.5.5", - "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", - "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", - "dev": true, - "license": "MIT" - }, - "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.31", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", - "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/resolve-uri": "^3.1.0", - "@jridgewell/sourcemap-codec": "^1.4.14" - } - }, - "node_modules/@js-joda/core": { - "version": "5.7.0", - "resolved": "https://registry.npmjs.org/@js-joda/core/-/core-5.7.0.tgz", - "integrity": "sha512-WBu4ULVVxySLLzK1Ppq+OdfP+adRS4ntmDQT915rzDJ++i95gc2jZkM5B6LWEAwN3lGXpfie3yPABozdD3K3Vg==", - "license": "BSD-3-Clause" - }, - "node_modules/@js-sdsl/ordered-map": { - "version": "4.4.2", - "resolved": "https://registry.npmjs.org/@js-sdsl/ordered-map/-/ordered-map-4.4.2.tgz", - "integrity": "sha512-iUKgm52T8HOE/makSxjqoWhe95ZJA1/G1sYsGev2JDKUSS14KAgg1LHb+Ba+IPow0xflbnSkOsZcO08C7w1gYw==", - "license": "MIT", - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/js-sdsl" - } - }, - "node_modules/@logtape/hono": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/@logtape/hono/-/hono-2.0.2.tgz", - "integrity": "sha512-dCIfdFnpEguVvd0cLeo7BOMXXBZ/e0dTcOiB9rn46tMsILZlmiIVWz+O0q1wOqSkooOau6zCkw+Rt58UT5nvPQ==", - "funding": [ - "https://github.com/sponsors/dahlia" - ], - "license": "MIT", - "peerDependencies": { - "@logtape/logtape": "^2.0.2", - "hono": "^4.0.0" - } - }, - "node_modules/@logtape/logtape": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/@logtape/logtape/-/logtape-2.0.2.tgz", - "integrity": "sha512-cveUBLbCMFkvkLycP/2vNWvywl47JcJbazHIju94/QNGboZ/jyYAgZIm0ZXezAOx3eIz8OG1EOZ5CuQP3+2FQg==", - "funding": [ - "https://github.com/sponsors/dahlia" - ], - "license": "MIT" - }, - "node_modules/@logtape/pretty": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/@logtape/pretty/-/pretty-2.0.2.tgz", - "integrity": "sha512-WMKoHuaEZvJgDeciIM/OL+joDDlheuoSpkfJOuKncdim7eV6GfIh0BUxLt0Td4JJsljzt5dAttxaX0kXqE0N9Q==", - "funding": [ - "https://github.com/sponsors/dahlia" - ], - "license": "MIT", - "peerDependencies": { - "@logtape/logtape": "^2.0.2" - } - }, - "node_modules/@logtape/redaction": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/@logtape/redaction/-/redaction-2.0.2.tgz", - "integrity": "sha512-NSURYmPLk2E3H1VgxSqG1P65qScVWXntChS3pvXthku6v2E2bHWp+BvDYksoA15SNT7KtTXYgfsV2O3LCMJQrw==", - "funding": [ - "https://github.com/sponsors/dahlia" - ], - "license": "MIT", - "peerDependencies": { - "@logtape/logtape": "^2.0.2" - } - }, - "node_modules/@nvisy/core": { - "resolved": "packages/nvisy-core", - "link": true - }, - "node_modules/@nvisy/plugin-ai": { - "resolved": "packages/nvisy-plugin-ai", - "link": true - }, - "node_modules/@nvisy/plugin-core": { - "resolved": "packages/nvisy-plugin-core", - "link": true - }, - "node_modules/@nvisy/plugin-nosql": { 
- "resolved": "packages/nvisy-plugin-nosql", - "link": true - }, - "node_modules/@nvisy/plugin-object": { - "resolved": "packages/nvisy-plugin-object", - "link": true - }, - "node_modules/@nvisy/plugin-pandoc": { - "resolved": "packages/nvisy-plugin-pandoc", - "link": true - }, - "node_modules/@nvisy/plugin-queue": { - "resolved": "packages/nvisy-plugin-queue", - "link": true - }, - "node_modules/@nvisy/plugin-sql": { - "resolved": "packages/nvisy-plugin-sql", - "link": true - }, - "node_modules/@nvisy/plugin-tesseract": { - "resolved": "packages/nvisy-plugin-tesseract", - "link": true - }, - "node_modules/@nvisy/plugin-vector": { - "resolved": "packages/nvisy-plugin-vector", - "link": true - }, - "node_modules/@nvisy/runtime": { - "resolved": "packages/nvisy-runtime", - "link": true - }, - "node_modules/@nvisy/server": { - "resolved": "packages/nvisy-server", - "link": true - }, - "node_modules/@opentelemetry/api": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", - "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", - "license": "Apache-2.0", - "engines": { - "node": ">=8.0.0" - } - }, - "node_modules/@opentelemetry/semantic-conventions": { - "version": "1.39.0", - "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.39.0.tgz", - "integrity": "sha512-R5R9tb2AXs2IRLNKLBJDynhkfmx7mX0vi8NkhZb3gUkPWHn6HXk5J8iQ/dql0U3ApfWym4kXXmBDRGO+oeOfjg==", - "license": "Apache-2.0", - "engines": { - "node": ">=14" - } - }, - "node_modules/@petamoriken/float16": { - "version": "3.9.3", - "resolved": "https://registry.npmjs.org/@petamoriken/float16/-/float16-3.9.3.tgz", - "integrity": "sha512-8awtpHXCx/bNpFt4mt2xdkgtgVvKqty8VbjHI/WWWQuEw+KLzFot3f4+LkQY9YmOtq7A5GdOnqoIC8Pdygjk2g==", - "license": "MIT" - }, - "node_modules/@protobufjs/aspromise": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", - "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/base64": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", - "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/codegen": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", - "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/eventemitter": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", - "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/fetch": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", - "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==", - "license": "BSD-3-Clause", - "dependencies": { - "@protobufjs/aspromise": "^1.1.1", - "@protobufjs/inquire": "^1.1.0" - } - }, - "node_modules/@protobufjs/float": { - "version": "1.0.2", - "resolved": 
"https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", - "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/inquire": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", - "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/path": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", - "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/pool": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", - "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", - "license": "BSD-3-Clause" - }, - "node_modules/@protobufjs/utf8": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", - "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==", - "license": "BSD-3-Clause" - }, - "node_modules/@qdrant/js-client-rest": { - "version": "1.16.2", - "resolved": "https://registry.npmjs.org/@qdrant/js-client-rest/-/js-client-rest-1.16.2.tgz", - "integrity": "sha512-Zm4wEZURrZ24a+Hmm4l1QQYjiz975Ep3vF0yzWR7ICGcxittNz47YK2iBOk8kb8qseCu8pg7WmO1HOIsO8alvw==", - "license": "Apache-2.0", - "dependencies": { - "@qdrant/openapi-typescript-fetch": "1.2.6", - "undici": "^6.0.0" - }, - "engines": { - "node": ">=18.17.0", - "pnpm": ">=8" - }, - "peerDependencies": { - "typescript": ">=4.7" - } - }, - "node_modules/@qdrant/openapi-typescript-fetch": { - "version": "1.2.6", - "resolved": "https://registry.npmjs.org/@qdrant/openapi-typescript-fetch/-/openapi-typescript-fetch-1.2.6.tgz", - "integrity": "sha512-oQG/FejNpItrxRHoyctYvT3rwGZOnK4jr3JdppO/c78ktDvkWiPXPHNsrDf33K9sZdRb6PR7gi4noIapu5q4HA==", - "license": "MIT", - "engines": { - "node": ">=18.0.0", - "pnpm": ">=8" - } - }, - "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.57.1.tgz", - "integrity": "sha512-A6ehUVSiSaaliTxai040ZpZ2zTevHYbvu/lDoeAteHI8QnaosIzm4qwtezfRg1jOYaUmnzLX1AOD6Z+UJjtifg==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ] - }, - "node_modules/@rollup/rollup-android-arm64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.57.1.tgz", - "integrity": "sha512-dQaAddCY9YgkFHZcFNS/606Exo8vcLHwArFZ7vxXq4rigo2bb494/xKMMwRRQW6ug7Js6yXmBZhSBRuBvCCQ3w==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ] - }, - "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.57.1.tgz", - "integrity": "sha512-crNPrwJOrRxagUYeMn/DZwqN88SDmwaJ8Cvi/TN1HnWBU7GwknckyosC2gd0IqYRsHDEnXf328o9/HC6OkPgOg==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] - }, - "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.57.1", - "resolved": 
"https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.57.1.tgz", - "integrity": "sha512-Ji8g8ChVbKrhFtig5QBV7iMaJrGtpHelkB3lsaKzadFBe58gmjfGXAOfI5FV0lYMH8wiqsxKQ1C9B0YTRXVy4w==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ] - }, - "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.57.1.tgz", - "integrity": "sha512-R+/WwhsjmwodAcz65guCGFRkMb4gKWTcIeLy60JJQbXrJ97BOXHxnkPFrP+YwFlaS0m+uWJTstrUA9o+UchFug==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ] - }, - "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.57.1.tgz", - "integrity": "sha512-IEQTCHeiTOnAUC3IDQdzRAGj3jOAYNr9kBguI7MQAAZK3caezRrg0GxAb6Hchg4lxdZEI5Oq3iov/w/hnFWY9Q==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ] - }, - "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.57.1.tgz", - "integrity": "sha512-F8sWbhZ7tyuEfsmOxwc2giKDQzN3+kuBLPwwZGyVkLlKGdV1nvnNwYD0fKQ8+XS6hp9nY7B+ZeK01EBUE7aHaw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.57.1.tgz", - "integrity": "sha512-rGfNUfn0GIeXtBP1wL5MnzSj98+PZe/AXaGBCRmT0ts80lU5CATYGxXukeTX39XBKsxzFpEeK+Mrp9faXOlmrw==", - "cpu": [ - "arm" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.57.1.tgz", - "integrity": "sha512-MMtej3YHWeg/0klK2Qodf3yrNzz6CGjo2UntLvk2RSPlhzgLvYEB3frRvbEF2wRKh1Z2fDIg9KRPe1fawv7C+g==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.57.1.tgz", - "integrity": "sha512-1a/qhaaOXhqXGpMFMET9VqwZakkljWHLmZOX48R0I/YLbhdxr1m4gtG1Hq7++VhVUmf+L3sTAf9op4JlhQ5u1Q==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-loong64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.57.1.tgz", - "integrity": "sha512-QWO6RQTZ/cqYtJMtxhkRkidoNGXc7ERPbZN7dVW5SdURuLeVU7lwKMpo18XdcmpWYd0qsP1bwKPf7DNSUinhvA==", - "cpu": [ - "loong64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-loong64-musl": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.57.1.tgz", - "integrity": "sha512-xpObYIf+8gprgWaPP32xiN5RVTi/s5FCR+XMXSKmhfoJjrpRAjCuuqQXyxUa/eJTdAE6eJ+KDKaoEqjZQxh3Gw==", - "cpu": [ - "loong64" - ], - "dev": true, - "license": 
"MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-ppc64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.57.1.tgz", - "integrity": "sha512-4BrCgrpZo4hvzMDKRqEaW1zeecScDCR+2nZ86ATLhAoJ5FQ+lbHVD3ttKe74/c7tNT9c6F2viwB3ufwp01Oh2w==", - "cpu": [ - "ppc64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-ppc64-musl": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.57.1.tgz", - "integrity": "sha512-NOlUuzesGauESAyEYFSe3QTUguL+lvrN1HtwEEsU2rOwdUDeTMJdO5dUYl/2hKf9jWydJrO9OL/XSSf65R5+Xw==", - "cpu": [ - "ppc64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.57.1.tgz", - "integrity": "sha512-ptA88htVp0AwUUqhVghwDIKlvJMD/fmL/wrQj99PRHFRAG6Z5nbWoWG4o81Nt9FT+IuqUQi+L31ZKAFeJ5Is+A==", - "cpu": [ - "riscv64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.57.1.tgz", - "integrity": "sha512-S51t7aMMTNdmAMPpBg7OOsTdn4tySRQvklmL3RpDRyknk87+Sp3xaumlatU+ppQ+5raY7sSTcC2beGgvhENfuw==", - "cpu": [ - "riscv64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.57.1.tgz", - "integrity": "sha512-Bl00OFnVFkL82FHbEqy3k5CUCKH6OEJL54KCyx2oqsmZnFTR8IoNqBF+mjQVcRCT5sB6yOvK8A37LNm/kPJiZg==", - "cpu": [ - "s390x" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.57.1.tgz", - "integrity": "sha512-ABca4ceT4N+Tv/GtotnWAeXZUZuM/9AQyCyKYyKnpk4yoA7QIAuBt6Hkgpw8kActYlew2mvckXkvx0FfoInnLg==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.57.1.tgz", - "integrity": "sha512-HFps0JeGtuOR2convgRRkHCekD7j+gdAuXM+/i6kGzQtFhlCtQkpwtNzkNj6QhCDp7DRJ7+qC/1Vg2jt5iSOFw==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ] - }, - "node_modules/@rollup/rollup-openbsd-x64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.57.1.tgz", - "integrity": "sha512-H+hXEv9gdVQuDTgnqD+SQffoWoc0Of59AStSzTEj/feWTBAnSfSD3+Dql1ZruJQxmykT/JVY0dE8Ka7z0DH1hw==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ] - }, - "node_modules/@rollup/rollup-openharmony-arm64": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.57.1.tgz", - "integrity": 
"sha512-4wYoDpNg6o/oPximyc/NG+mYUejZrCU2q+2w6YZqrAs2UcNUChIZXjtafAiiZSUc7On8v5NyNj34Kzj/Ltk6dQ==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "openharmony" - ] - }, - "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.57.1.tgz", - "integrity": "sha512-O54mtsV/6LW3P8qdTcamQmuC990HDfR71lo44oZMZlXU4tzLrbvTii87Ni9opq60ds0YzuAlEr/GNwuNluZyMQ==", - "cpu": [ - "arm64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.57.1.tgz", - "integrity": "sha512-P3dLS+IerxCT/7D2q2FYcRdWRl22dNbrbBEtxdWhXrfIMPP9lQhb5h4Du04mdl5Woq05jVCDPCMF7Ub0NAjIew==", - "cpu": [ - "ia32" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-x64-gnu": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.57.1.tgz", - "integrity": "sha512-VMBH2eOOaKGtIJYleXsi2B8CPVADrh+TyNxJ4mWPnKfLB/DBUmzW+5m1xUrcwWoMfSLagIRpjUFeW5CO5hyciQ==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.57.1.tgz", - "integrity": "sha512-mxRFDdHIWRxg3UfIIAwCm6NzvxG0jDX/wBN6KsQFTvKFqqg9vTrWUE68qEjHt19A5wwx5X5aUi2zuZT7YR0jrA==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "win32" - ] - }, - "node_modules/@scalar/core": { - "version": "0.3.37", - "resolved": "https://registry.npmjs.org/@scalar/core/-/core-0.3.37.tgz", - "integrity": "sha512-cQWMHsGD9jCiYHi91acR3tOsj+qGk+dRQ2W+N5+au1NZ/GkUNT5TUEufekn/sj1S8af+lOnn3y0xXoTI34jCog==", - "license": "MIT", - "dependencies": { - "@scalar/types": "0.6.2" - }, - "engines": { - "node": ">=20" - } - }, - "node_modules/@scalar/helpers": { - "version": "0.2.11", - "resolved": "https://registry.npmjs.org/@scalar/helpers/-/helpers-0.2.11.tgz", - "integrity": "sha512-Y7DLt1bIZF9dvHzJwSJTcC1lpSr1Tbf4VBhHOCRIHu23Rr7/lhQnddRxFmPV1tZXwEQKz7F7yRrubwCfKPCucw==", - "license": "MIT", - "engines": { - "node": ">=20" - } - }, - "node_modules/@scalar/hono-api-reference": { - "version": "0.9.40", - "resolved": "https://registry.npmjs.org/@scalar/hono-api-reference/-/hono-api-reference-0.9.40.tgz", - "integrity": "sha512-0tQOxyEwuu1QGcoA5aCJg2eSmNfF35mxeGx13TND9ud5ZBeuOqli8jyfykgkqV3gFTnDDlQYgQcOvB6Rgk2beA==", - "license": "MIT", - "dependencies": { - "@scalar/core": "0.3.37" - }, - "engines": { - "node": ">=20" - }, - "peerDependencies": { - "hono": "^4.11.5" - } - }, - "node_modules/@scalar/types": { - "version": "0.6.2", - "resolved": "https://registry.npmjs.org/@scalar/types/-/types-0.6.2.tgz", - "integrity": "sha512-VWfY/z9R5NT8PpKVmvmIj6QSh56MMcl8x3JsGiNxR+w7txGQEq+QzEl35aU56uSBFmLfPk1oyInoaHhkosKooA==", - "license": "MIT", - "dependencies": { - "@scalar/helpers": "0.2.11", - "nanoid": "^5.1.6", - "type-fest": "^5.3.1", - "zod": "^4.3.5" - }, - "engines": { - "node": ">=20" - } - }, - "node_modules/@scalar/types/node_modules/nanoid": { - "version": "5.1.6", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-5.1.6.tgz", - "integrity": 
"sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "bin": { - "nanoid": "bin/nanoid.js" - }, - "engines": { - "node": "^18 || >=20" - } - }, - "node_modules/@smithy/abort-controller": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/abort-controller/-/abort-controller-4.2.8.tgz", - "integrity": "sha512-peuVfkYHAmS5ybKxWcfraK7WBBP0J+rkfUcbHJJKQ4ir3UAUNQI+Y4Vt/PqSzGqgloJ5O1dk7+WzNL8wcCSXbw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/chunked-blob-reader": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/@smithy/chunked-blob-reader/-/chunked-blob-reader-5.2.0.tgz", - "integrity": "sha512-WmU0TnhEAJLWvfSeMxBNe5xtbselEO8+4wG0NtZeL8oR21WgH1xiO37El+/Y+H/Ie4SCwBy3MxYWmOYaGgZueA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/chunked-blob-reader-native": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/@smithy/chunked-blob-reader-native/-/chunked-blob-reader-native-4.2.1.tgz", - "integrity": "sha512-lX9Ay+6LisTfpLid2zZtIhSEjHMZoAR5hHCR4H7tBz/Zkfr5ea8RcQ7Tk4mi0P76p4cN+Btz16Ffno7YHpKXnQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-base64": "^4.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/config-resolver": { - "version": "4.4.6", - "resolved": "https://registry.npmjs.org/@smithy/config-resolver/-/config-resolver-4.4.6.tgz", - "integrity": "sha512-qJpzYC64kaj3S0fueiu3kXm8xPrR3PcXDPEgnaNMRn0EjNSZFoFjvbUp0YUDsRhN1CB90EnHJtbxWKevnH99UQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/node-config-provider": "^4.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-config-provider": "^4.2.0", - "@smithy/util-endpoints": "^3.2.8", - "@smithy/util-middleware": "^4.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/core": { - "version": "3.22.1", - "resolved": "https://registry.npmjs.org/@smithy/core/-/core-3.22.1.tgz", - "integrity": "sha512-x3ie6Crr58MWrm4viHqqy2Du2rHYZjwu8BekasrQx4ca+Y24dzVAwq3yErdqIbc2G3I0kLQA13PQ+/rde+u65g==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/middleware-serde": "^4.2.9", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-body-length-browser": "^4.2.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-stream": "^4.5.11", - "@smithy/util-utf8": "^4.2.0", - "@smithy/uuid": "^1.1.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/credential-provider-imds": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/credential-provider-imds/-/credential-provider-imds-4.2.8.tgz", - "integrity": "sha512-FNT0xHS1c/CPN8upqbMFP83+ul5YgdisfCfkZ86Jh2NSmnqw/AJ6x5pEogVCTVvSm7j9MopRU89bmDelxuDMYw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/node-config-provider": "^4.3.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/eventstream-codec": { - "version": "4.2.8", - "resolved": 
"https://registry.npmjs.org/@smithy/eventstream-codec/-/eventstream-codec-4.2.8.tgz", - "integrity": "sha512-jS/O5Q14UsufqoGhov7dHLOPCzkYJl9QDzusI2Psh4wyYx/izhzvX9P4D69aTxcdfVhEPhjK+wYyn/PzLjKbbw==", - "license": "Apache-2.0", - "dependencies": { - "@aws-crypto/crc32": "5.2.0", - "@smithy/types": "^4.12.0", - "@smithy/util-hex-encoding": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/eventstream-serde-browser": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/eventstream-serde-browser/-/eventstream-serde-browser-4.2.8.tgz", - "integrity": "sha512-MTfQT/CRQz5g24ayXdjg53V0mhucZth4PESoA5IhvaWVDTOQLfo8qI9vzqHcPsdd2v6sqfTYqF5L/l+pea5Uyw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/eventstream-serde-universal": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/eventstream-serde-config-resolver": { - "version": "4.3.8", - "resolved": "https://registry.npmjs.org/@smithy/eventstream-serde-config-resolver/-/eventstream-serde-config-resolver-4.3.8.tgz", - "integrity": "sha512-ah12+luBiDGzBruhu3efNy1IlbwSEdNiw8fOZksoKoWW1ZHvO/04MQsdnws/9Aj+5b0YXSSN2JXKy/ClIsW8MQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/eventstream-serde-node": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/eventstream-serde-node/-/eventstream-serde-node-4.2.8.tgz", - "integrity": "sha512-cYpCpp29z6EJHa5T9WL0KAlq3SOKUQkcgSoeRfRVwjGgSFl7Uh32eYGt7IDYCX20skiEdRffyDpvF2efEZPC0A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/eventstream-serde-universal": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/eventstream-serde-universal": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/eventstream-serde-universal/-/eventstream-serde-universal-4.2.8.tgz", - "integrity": "sha512-iJ6YNJd0bntJYnX6s52NC4WFYcZeKrPUr1Kmmr5AwZcwCSzVpS7oavAmxMR7pMq7V+D1G4s9F5NJK0xwOsKAlQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/eventstream-codec": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/fetch-http-handler": { - "version": "5.3.9", - "resolved": "https://registry.npmjs.org/@smithy/fetch-http-handler/-/fetch-http-handler-5.3.9.tgz", - "integrity": "sha512-I4UhmcTYXBrct03rwzQX1Y/iqQlzVQaPxWjCjula++5EmWq9YGBrx6bbGqluGc1f0XEfhSkiY4jhLgbsJUMKRA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/protocol-http": "^5.3.8", - "@smithy/querystring-builder": "^4.2.8", - "@smithy/types": "^4.12.0", - "@smithy/util-base64": "^4.3.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/hash-blob-browser": { - "version": "4.2.9", - "resolved": "https://registry.npmjs.org/@smithy/hash-blob-browser/-/hash-blob-browser-4.2.9.tgz", - "integrity": "sha512-m80d/iicI7DlBDxyQP6Th7BW/ejDGiF0bgI754+tiwK0lgMkcaIBgvwwVc7OFbY4eUzpGtnig52MhPAEJ7iNYg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/chunked-blob-reader": "^5.2.0", - "@smithy/chunked-blob-reader-native": "^4.2.1", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/hash-node": { - "version": "4.2.8", - "resolved": 
"https://registry.npmjs.org/@smithy/hash-node/-/hash-node-4.2.8.tgz", - "integrity": "sha512-7ZIlPbmaDGxVoxErDZnuFG18WekhbA/g2/i97wGj+wUBeS6pcUeAym8u4BXh/75RXWhgIJhyC11hBzig6MljwA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "@smithy/util-buffer-from": "^4.2.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/hash-stream-node": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/hash-stream-node/-/hash-stream-node-4.2.8.tgz", - "integrity": "sha512-v0FLTXgHrTeheYZFGhR+ehX5qUm4IQsjAiL9qehad2cyjMWcN2QG6/4mSwbSgEQzI7jwfoXj7z4fxZUx/Mhj2w==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/invalid-dependency": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/invalid-dependency/-/invalid-dependency-4.2.8.tgz", - "integrity": "sha512-N9iozRybwAQ2dn9Fot9kI6/w9vos2oTXLhtK7ovGqwZjlOcxu6XhPlpLpC+INsxktqHinn5gS2DXDjDF2kG5sQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/is-array-buffer": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/is-array-buffer/-/is-array-buffer-4.2.0.tgz", - "integrity": "sha512-DZZZBvC7sjcYh4MazJSGiWMI2L7E0oCiRHREDzIxi/M2LY79/21iXt6aPLHge82wi5LsuRF5A06Ds3+0mlh6CQ==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/md5-js": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/md5-js/-/md5-js-4.2.8.tgz", - "integrity": "sha512-oGMaLj4tVZzLi3itBa9TCswgMBr7k9b+qKYowQ6x1rTyTuO1IU2YHdHUa+891OsOH+wCsH7aTPRsTJO3RMQmjQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/middleware-content-length": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/middleware-content-length/-/middleware-content-length-4.2.8.tgz", - "integrity": "sha512-RO0jeoaYAB1qBRhfVyq0pMgBoUK34YEJxVxyjOWYZiOKOq2yMZ4MnVXMZCUDenpozHue207+9P5ilTV1zeda0A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/middleware-endpoint": { - "version": "4.4.13", - "resolved": "https://registry.npmjs.org/@smithy/middleware-endpoint/-/middleware-endpoint-4.4.13.tgz", - "integrity": "sha512-x6vn0PjYmGdNuKh/juUJJewZh7MoQ46jYaJ2mvekF4EesMuFfrl4LaW/k97Zjf8PTCPQmPgMvwewg7eNoH9n5w==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/core": "^3.22.1", - "@smithy/middleware-serde": "^4.2.9", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "@smithy/url-parser": "^4.2.8", - "@smithy/util-middleware": "^4.2.8", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/middleware-retry": { - "version": "4.4.30", - "resolved": "https://registry.npmjs.org/@smithy/middleware-retry/-/middleware-retry-4.4.30.tgz", - "integrity": 
"sha512-CBGyFvN0f8hlnqKH/jckRDz78Snrp345+PVk8Ux7pnkUCW97Iinse59lY78hBt04h1GZ6hjBN94BRwZy1xC8Bg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/node-config-provider": "^4.3.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/service-error-classification": "^4.2.8", - "@smithy/smithy-client": "^4.11.2", - "@smithy/types": "^4.12.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-retry": "^4.2.8", - "@smithy/uuid": "^1.1.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/middleware-serde": { - "version": "4.2.9", - "resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.2.9.tgz", - "integrity": "sha512-eMNiej0u/snzDvlqRGSN3Vl0ESn3838+nKyVfF2FKNXFbi4SERYT6PR392D39iczngbqqGG0Jl1DlCnp7tBbXQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/middleware-stack": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/middleware-stack/-/middleware-stack-4.2.8.tgz", - "integrity": "sha512-w6LCfOviTYQjBctOKSwy6A8FIkQy7ICvglrZFl6Bw4FmcQ1Z420fUtIhxaUZZshRe0VCq4kvDiPiXrPZAe8oRA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/node-config-provider": { - "version": "4.3.8", - "resolved": "https://registry.npmjs.org/@smithy/node-config-provider/-/node-config-provider-4.3.8.tgz", - "integrity": "sha512-aFP1ai4lrbVlWjfpAfRSL8KFcnJQYfTl5QxLJXY32vghJrDuFyPZ6LtUL+JEGYiFRG1PfPLHLoxj107ulncLIg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/property-provider": "^4.2.8", - "@smithy/shared-ini-file-loader": "^4.4.3", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/node-http-handler": { - "version": "4.4.9", - "resolved": "https://registry.npmjs.org/@smithy/node-http-handler/-/node-http-handler-4.4.9.tgz", - "integrity": "sha512-KX5Wml5mF+luxm1szW4QDz32e3NObgJ4Fyw+irhph4I/2geXwUy4jkIMUs5ZPGflRBeR6BUkC2wqIab4Llgm3w==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/abort-controller": "^4.2.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/querystring-builder": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/property-provider": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/property-provider/-/property-provider-4.2.8.tgz", - "integrity": "sha512-EtCTbyIveCKeOXDSWSdze3k612yCPq1YbXsbqX3UHhkOSW8zKsM9NOJG5gTIya0vbY2DIaieG8pKo1rITHYL0w==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/protocol-http": { - "version": "5.3.8", - "resolved": "https://registry.npmjs.org/@smithy/protocol-http/-/protocol-http-5.3.8.tgz", - "integrity": "sha512-QNINVDhxpZ5QnP3aviNHQFlRogQZDfYlCkQT+7tJnErPQbDhysondEjhikuANxgMsZrkGeiAxXy4jguEGsDrWQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/querystring-builder": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/querystring-builder/-/querystring-builder-4.2.8.tgz", - "integrity": 
"sha512-Xr83r31+DrE8CP3MqPgMJl+pQlLLmOfiEUnoyAlGzzJIrEsbKsPy1hqH0qySaQm4oWrCBlUqRt+idEgunKB+iw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "@smithy/util-uri-escape": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/querystring-parser": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/querystring-parser/-/querystring-parser-4.2.8.tgz", - "integrity": "sha512-vUurovluVy50CUlazOiXkPq40KGvGWSdmusa3130MwrR1UNnNgKAlj58wlOe61XSHRpUfIIh6cE0zZ8mzKaDPA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/service-error-classification": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/service-error-classification/-/service-error-classification-4.2.8.tgz", - "integrity": "sha512-mZ5xddodpJhEt3RkCjbmUQuXUOaPNTkbMGR0bcS8FE0bJDLMZlhmpgrvPNCYglVw5rsYTpSnv19womw9WWXKQQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/shared-ini-file-loader": { - "version": "4.4.3", - "resolved": "https://registry.npmjs.org/@smithy/shared-ini-file-loader/-/shared-ini-file-loader-4.4.3.tgz", - "integrity": "sha512-DfQjxXQnzC5UbCUPeC3Ie8u+rIWZTvuDPAGU/BxzrOGhRvgUanaP68kDZA+jaT3ZI+djOf+4dERGlm9mWfFDrg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/signature-v4": { - "version": "5.3.8", - "resolved": "https://registry.npmjs.org/@smithy/signature-v4/-/signature-v4-5.3.8.tgz", - "integrity": "sha512-6A4vdGj7qKNRF16UIcO8HhHjKW27thsxYci+5r/uVRkdcBEkOEiY8OMPuydLX4QHSrJqGHPJzPRwwVTqbLZJhg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/is-array-buffer": "^4.2.0", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-hex-encoding": "^4.2.0", - "@smithy/util-middleware": "^4.2.8", - "@smithy/util-uri-escape": "^4.2.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/smithy-client": { - "version": "4.11.2", - "resolved": "https://registry.npmjs.org/@smithy/smithy-client/-/smithy-client-4.11.2.tgz", - "integrity": "sha512-SCkGmFak/xC1n7hKRsUr6wOnBTJ3L22Qd4e8H1fQIuKTAjntwgU8lrdMe7uHdiT2mJAOWA/60qaW9tiMu69n1A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/core": "^3.22.1", - "@smithy/middleware-endpoint": "^4.4.13", - "@smithy/middleware-stack": "^4.2.8", - "@smithy/protocol-http": "^5.3.8", - "@smithy/types": "^4.12.0", - "@smithy/util-stream": "^4.5.11", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/types": { - "version": "4.12.0", - "resolved": "https://registry.npmjs.org/@smithy/types/-/types-4.12.0.tgz", - "integrity": "sha512-9YcuJVTOBDjg9LWo23Qp0lTQ3D7fQsQtwle0jVfpbUHy9qBwCEgKuVH4FqFB3VYu0nwdHKiEMA+oXz7oV8X1kw==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/url-parser": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/url-parser/-/url-parser-4.2.8.tgz", - "integrity": "sha512-NQho9U68TGMEU639YkXnVMV3GEFFULmmaWdlu1E9qzyIePOHsoSnagTGSDv1Zi8DCNN6btxOSdgmy5E/hsZwhA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/querystring-parser": 
"^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-base64": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/@smithy/util-base64/-/util-base64-4.3.0.tgz", - "integrity": "sha512-GkXZ59JfyxsIwNTWFnjmFEI8kZpRNIBfxKjv09+nkAWPt/4aGaEWMM04m4sxgNVWkbt2MdSvE3KF/PfX4nFedQ==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-buffer-from": "^4.2.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-body-length-browser": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-body-length-browser/-/util-body-length-browser-4.2.0.tgz", - "integrity": "sha512-Fkoh/I76szMKJnBXWPdFkQJl2r9SjPt3cMzLdOB6eJ4Pnpas8hVoWPYemX/peO0yrrvldgCUVJqOAjUrOLjbxg==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-body-length-node": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/@smithy/util-body-length-node/-/util-body-length-node-4.2.1.tgz", - "integrity": "sha512-h53dz/pISVrVrfxV1iqXlx5pRg3V2YWFcSQyPyXZRrZoZj4R4DeWRDo1a7dd3CPTcFi3kE+98tuNyD2axyZReA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-buffer-from": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-buffer-from/-/util-buffer-from-4.2.0.tgz", - "integrity": "sha512-kAY9hTKulTNevM2nlRtxAG2FQ3B2OR6QIrPY3zE5LqJy1oxzmgBGsHLWTcNhWXKchgA0WHW+mZkQrng/pgcCew==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/is-array-buffer": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-config-provider": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-config-provider/-/util-config-provider-4.2.0.tgz", - "integrity": "sha512-YEjpl6XJ36FTKmD+kRJJWYvrHeUvm5ykaUS5xK+6oXffQPHeEM4/nXlZPe+Wu0lsgRUcNZiliYNh/y7q9c2y6Q==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-defaults-mode-browser": { - "version": "4.3.29", - "resolved": "https://registry.npmjs.org/@smithy/util-defaults-mode-browser/-/util-defaults-mode-browser-4.3.29.tgz", - "integrity": "sha512-nIGy3DNRmOjaYaaKcQDzmWsro9uxlaqUOhZDHQed9MW/GmkBZPtnU70Pu1+GT9IBmUXwRdDuiyaeiy9Xtpn3+Q==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/property-provider": "^4.2.8", - "@smithy/smithy-client": "^4.11.2", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-defaults-mode-node": { - "version": "4.2.32", - "resolved": "https://registry.npmjs.org/@smithy/util-defaults-mode-node/-/util-defaults-mode-node-4.2.32.tgz", - "integrity": "sha512-7dtFff6pu5fsjqrVve0YMhrnzJtccCWDacNKOkiZjJ++fmjGExmmSu341x+WU6Oc1IccL7lDuaUj7SfrHpWc5Q==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/config-resolver": "^4.4.6", - "@smithy/credential-provider-imds": "^4.2.8", - "@smithy/node-config-provider": "^4.3.8", - "@smithy/property-provider": "^4.2.8", - "@smithy/smithy-client": "^4.11.2", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-endpoints": { - "version": "3.2.8", - "resolved": 
"https://registry.npmjs.org/@smithy/util-endpoints/-/util-endpoints-3.2.8.tgz", - "integrity": "sha512-8JaVTn3pBDkhZgHQ8R0epwWt+BqPSLCjdjXXusK1onwJlRuN69fbvSK66aIKKO7SwVFM6x2J2ox5X8pOaWcUEw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/node-config-provider": "^4.3.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-hex-encoding": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-hex-encoding/-/util-hex-encoding-4.2.0.tgz", - "integrity": "sha512-CCQBwJIvXMLKxVbO88IukazJD9a4kQ9ZN7/UMGBjBcJYvatpWk+9g870El4cB8/EJxfe+k+y0GmR9CAzkF+Nbw==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-middleware": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/util-middleware/-/util-middleware-4.2.8.tgz", - "integrity": "sha512-PMqfeJxLcNPMDgvPbbLl/2Vpin+luxqTGPpW3NAQVLbRrFRzTa4rNAASYeIGjRV9Ytuhzny39SpyU04EQreF+A==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-retry": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/util-retry/-/util-retry-4.2.8.tgz", - "integrity": "sha512-CfJqwvoRY0kTGe5AkQokpURNCT1u/MkRzMTASWMPPo2hNSnKtF1D45dQl3DE2LKLr4m+PW9mCeBMJr5mCAVThg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/service-error-classification": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-stream": { - "version": "4.5.11", - "resolved": "https://registry.npmjs.org/@smithy/util-stream/-/util-stream-4.5.11.tgz", - "integrity": "sha512-lKmZ0S/3Qj2OF5H1+VzvDLb6kRxGzZHq6f3rAsoSu5cTLGsn3v3VQBA8czkNNXlLjoFEtVu3OQT2jEeOtOE2CA==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/fetch-http-handler": "^5.3.9", - "@smithy/node-http-handler": "^4.4.9", - "@smithy/types": "^4.12.0", - "@smithy/util-base64": "^4.3.0", - "@smithy/util-buffer-from": "^4.2.0", - "@smithy/util-hex-encoding": "^4.2.0", - "@smithy/util-utf8": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-uri-escape": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-uri-escape/-/util-uri-escape-4.2.0.tgz", - "integrity": "sha512-igZpCKV9+E/Mzrpq6YacdTQ0qTiLm85gD6N/IrmyDvQFA4UnU3d5g3m8tMT/6zG/vVkWSU+VxeUyGonL62DuxA==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-utf8": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/@smithy/util-utf8/-/util-utf8-4.2.0.tgz", - "integrity": "sha512-zBPfuzoI8xyBtR2P6WQj63Rz8i3AmfAaJLuNG8dWsfvPe8lO4aCPYLn879mEgHndZH1zQ2oXmG8O1GGzzaoZiw==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/util-buffer-from": "^4.2.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@smithy/util-waiter": { - "version": "4.2.8", - "resolved": "https://registry.npmjs.org/@smithy/util-waiter/-/util-waiter-4.2.8.tgz", - "integrity": "sha512-n+lahlMWk+aejGuax7DPWtqav8HYnWxQwR+LCG2BgCUmaGcTe9qZCFsmw8TMg9iG75HOwhrJCX9TCJRLH+Yzqg==", - "license": "Apache-2.0", - "dependencies": { - "@smithy/abort-controller": "^4.2.8", - "@smithy/types": "^4.12.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": 
">=18.0.0" - } - }, - "node_modules/@smithy/uuid": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@smithy/uuid/-/uuid-1.1.0.tgz", - "integrity": "sha512-4aUIteuyxtBUhVdiQqcDhKFitwfd9hqoSDYY2KRXiWtgoWJ9Bmise+KfEPDiVHWeJepvF8xJO9/9+WDIciMFFw==", - "license": "Apache-2.0", - "dependencies": { - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/@so-ric/colorspace": { - "version": "1.1.6", - "resolved": "https://registry.npmjs.org/@so-ric/colorspace/-/colorspace-1.1.6.tgz", - "integrity": "sha512-/KiKkpHNOBgkFJwu9sh48LkHSMYGyuTcSFK/qMBdnOAlrRJzRSXAOFB5qwzaVQuDl8wAvHVMkaASQDReTahxuw==", - "license": "MIT", - "dependencies": { - "color": "^5.0.2", - "text-hex": "1.0.x" - } - }, - "node_modules/@standard-schema/spec": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", - "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", - "license": "MIT" - }, - "node_modules/@tootallnate/once": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", - "integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==", - "license": "MIT", - "engines": { - "node": ">= 10" - } - }, - "node_modules/@types/caseless": { - "version": "0.12.5", - "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz", - "integrity": "sha512-hWtVTC2q7hc7xZ/RLbxapMvDMgUnDvKvMOpKal4DrMyfGBUfB1oKaZlIRr6mJL+If3bAP6sV/QneGzF6tJjZDg==", - "license": "MIT" - }, - "node_modules/@types/chai": { - "version": "5.2.3", - "resolved": "https://registry.npmjs.org/@types/chai/-/chai-5.2.3.tgz", - "integrity": "sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/deep-eql": "*", - "assertion-error": "^2.0.1" - } - }, - "node_modules/@types/deep-eql": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", - "integrity": "sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/estree": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/node": { - "version": "25.2.1", - "resolved": "https://registry.npmjs.org/@types/node/-/node-25.2.1.tgz", - "integrity": "sha512-CPrnr8voK8vC6eEtyRzvMpgp3VyVRhgclonE7qYi6P9sXwYb59ucfrnmFBTaP0yUi8Gk4yZg/LlTJULGxvTNsg==", - "license": "MIT", - "dependencies": { - "undici-types": "~7.16.0" - } - }, - "node_modules/@types/pg": { - "version": "8.16.0", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.16.0.tgz", - "integrity": "sha512-RmhMd/wD+CF8Dfo+cVIy3RR5cl8CyfXQ0tGgW6XBL8L4LM/UTEbNXYRbLwU6w+CgrKBNbrQWt4FUtTfaU5jSYQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/node": "*", - "pg-protocol": "*", - "pg-types": "^2.2.0" - } - }, - "node_modules/@types/readable-stream": { - "version": "4.0.23", - "resolved": "https://registry.npmjs.org/@types/readable-stream/-/readable-stream-4.0.23.tgz", - "integrity": "sha512-wwXrtQvbMHxCbBgjHaMGEmImFTQxxpfMOR/ZoQnXxB1woqkUbdLGFDgauo00Py9IudiaqSeiBiulSV9i6XIPig==", - "license": "MIT", - "dependencies": { 
- "@types/node": "*" - } - }, - "node_modules/@types/request": { - "version": "2.48.13", - "resolved": "https://registry.npmjs.org/@types/request/-/request-2.48.13.tgz", - "integrity": "sha512-FGJ6udDNUCjd19pp0Q3iTiDkwhYup7J8hpMW9c4k53NrccQFFWKRho6hvtPPEhnXWKvukfwAlB6DbDz4yhH5Gg==", - "license": "MIT", - "dependencies": { - "@types/caseless": "*", - "@types/node": "*", - "@types/tough-cookie": "*", - "form-data": "^2.5.5" - } - }, - "node_modules/@types/tough-cookie": { - "version": "4.0.5", - "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", - "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", - "license": "MIT" - }, - "node_modules/@types/triple-beam": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", - "integrity": "sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==", - "license": "MIT" - }, - "node_modules/@typespec/ts-http-runtime": { - "version": "0.3.3", - "resolved": "https://registry.npmjs.org/@typespec/ts-http-runtime/-/ts-http-runtime-0.3.3.tgz", - "integrity": "sha512-91fp6CAAJSRtH5ja95T1FHSKa8aPW9/Zw6cta81jlZTUw/+Vq8jM/AfF/14h2b71wwR84JUTW/3Y8QPhDAawFA==", - "license": "MIT", - "dependencies": { - "http-proxy-agent": "^7.0.0", - "https-proxy-agent": "^7.0.0", - "tslib": "^2.6.2" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/@vercel/oidc": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@vercel/oidc/-/oidc-3.1.0.tgz", - "integrity": "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w==", - "license": "Apache-2.0", - "engines": { - "node": ">= 20" - } - }, - "node_modules/@vitest/coverage-v8": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/coverage-v8/-/coverage-v8-4.0.18.tgz", - "integrity": "sha512-7i+N2i0+ME+2JFZhfuz7Tg/FqKtilHjGyGvoHYQ6iLV0zahbsJ9sljC9OcFcPDbhYKCet+sG8SsVqlyGvPflZg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@bcoe/v8-coverage": "^1.0.2", - "@vitest/utils": "4.0.18", - "ast-v8-to-istanbul": "^0.3.10", - "istanbul-lib-coverage": "^3.2.2", - "istanbul-lib-report": "^3.0.1", - "istanbul-reports": "^3.2.0", - "magicast": "^0.5.1", - "obug": "^2.1.1", - "std-env": "^3.10.0", - "tinyrainbow": "^3.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - }, - "peerDependencies": { - "@vitest/browser": "4.0.18", - "vitest": "4.0.18" - }, - "peerDependenciesMeta": { - "@vitest/browser": { - "optional": true - } - } - }, - "node_modules/@vitest/expect": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-4.0.18.tgz", - "integrity": "sha512-8sCWUyckXXYvx4opfzVY03EOiYVxyNrHS5QxX3DAIi5dpJAAkyJezHCP77VMX4HKA2LDT/Jpfo8i2r5BE3GnQQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@standard-schema/spec": "^1.0.0", - "@types/chai": "^5.2.2", - "@vitest/spy": "4.0.18", - "@vitest/utils": "4.0.18", - "chai": "^6.2.1", - "tinyrainbow": "^3.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/mocker": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-4.0.18.tgz", - "integrity": "sha512-HhVd0MDnzzsgevnOWCBj5Otnzobjy5wLBe4EdeeFGv8luMsGcYqDuFRMcttKWZA5vVO8RFjexVovXvAM4JoJDQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/spy": "4.0.18", - "estree-walker": "^3.0.3", - "magic-string": 
"^0.30.21" - }, - "funding": { - "url": "https://opencollective.com/vitest" - }, - "peerDependencies": { - "msw": "^2.4.9", - "vite": "^6.0.0 || ^7.0.0-0" - }, - "peerDependenciesMeta": { - "msw": { - "optional": true - }, - "vite": { - "optional": true - } - } - }, - "node_modules/@vitest/pretty-format": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.0.18.tgz", - "integrity": "sha512-P24GK3GulZWC5tz87ux0m8OADrQIUVDPIjjj65vBXYG17ZeU3qD7r+MNZ1RNv4l8CGU2vtTRqixrOi9fYk/yKw==", - "dev": true, - "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/runner": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.0.18.tgz", - "integrity": "sha512-rpk9y12PGa22Jg6g5M3UVVnTS7+zycIGk9ZNGN+m6tZHKQb7jrP7/77WfZy13Y/EUDd52NDsLRQhYKtv7XfPQw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/utils": "4.0.18", - "pathe": "^2.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/snapshot": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-4.0.18.tgz", - "integrity": "sha512-PCiV0rcl7jKQjbgYqjtakly6T1uwv/5BQ9SwBLekVg/EaYeQFPiXcgrC2Y7vDMA8dM1SUEAEV82kgSQIlXNMvA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.0.18", - "magic-string": "^0.30.21", - "pathe": "^2.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/spy": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.0.18.tgz", - "integrity": "sha512-cbQt3PTSD7P2OARdVW3qWER5EGq7PHlvE+QfzSC0lbwO+xnt7+XH06ZzFjFRgzUX//JmpxrCu92VdwvEPlWSNw==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@vitest/utils": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.0.18.tgz", - "integrity": "sha512-msMRKLMVLWygpK3u2Hybgi4MNjcYJvwTb0Ru09+fOyCXIgT5raYP041DRRdiJiI3k/2U6SEbAETB3YtBrUkCFA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.0.18", - "tinyrainbow": "^3.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/@zilliz/milvus2-sdk-node": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/@zilliz/milvus2-sdk-node/-/milvus2-sdk-node-2.6.9.tgz", - "integrity": "sha512-qOaVIpQ3E4w6Dp4lp9QIIuGedpE5dWRhK9SRX+y9WXcq4EXYvcdfR2aG/Vb5tWBPQwcMrGb5z8gRfFy7/gRbIw==", - "license": "Apache-2.0", - "dependencies": { - "@grpc/grpc-js": "1.7.3", - "@grpc/proto-loader": "^0.7.10", - "@opentelemetry/api": "^1.9.0", - "@petamoriken/float16": "^3.8.6", - "dayjs": "^1.11.7", - "generic-pool": "^3.9.0", - "lru-cache": "^9.1.2", - "protobufjs": "^7.2.6", - "winston": "^3.9.0" - } - }, - "node_modules/@zilliz/milvus2-sdk-node/node_modules/lru-cache": { - "version": "9.1.2", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-9.1.2.tgz", - "integrity": "sha512-ERJq3FOzJTxBbFjZ7iDs+NiK4VI9Wz+RdrrAB8dio1oV+YvdPzUEE4QNiT2VD51DkIbCYRUUzCRkssXCHqSnKQ==", - "license": "ISC", - "engines": { - "node": "14 || >=16.14" - } - }, - "node_modules/abort-controller": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", - "integrity": 
"sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", - "license": "MIT", - "dependencies": { - "event-target-shim": "^5.0.0" - }, - "engines": { - "node": ">=6.5" - } - }, - "node_modules/abort-controller-x": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/abort-controller-x/-/abort-controller-x-0.5.0.tgz", - "integrity": "sha512-yTt9CI0x+nRfX6BFMenEGP8ooPvErGH6AbFz20C2IeOLIlDsrw/VHpgne3GsCEuTA410IiFiaLVFKmgM4bKEPQ==", - "license": "MIT" - }, - "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", - "dev": true, - "license": "MIT", - "bin": { - "acorn": "bin/acorn" - }, - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/agent-base": { - "version": "7.1.4", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", - "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", - "license": "MIT", - "engines": { - "node": ">= 14" - } - }, - "node_modules/ai": { - "version": "6.0.73", - "resolved": "https://registry.npmjs.org/ai/-/ai-6.0.73.tgz", - "integrity": "sha512-p2/ICXIjAM4+bIFHEkAB+l58zq+aTmxAkotsb6doNt/CEms72zt6gxv2ky1fQDwU4ecMOcmMh78VJUSEKECzlg==", - "license": "Apache-2.0", - "dependencies": { - "@ai-sdk/gateway": "3.0.36", - "@ai-sdk/provider": "3.0.7", - "@ai-sdk/provider-utils": "4.0.13", - "@opentelemetry/api": "1.9.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "zod": "^3.25.76 || ^4.1.8" - } - }, - "node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "license": "MIT", - "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/ansi-styles/node_modules/color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "license": "MIT", - "dependencies": { - "color-name": "~1.1.4" - }, - "engines": { - "node": ">=7.0.0" - } - }, - "node_modules/ansi-styles/node_modules/color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", - "license": "MIT" - }, - "node_modules/any-promise": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/any-promise/-/any-promise-1.3.0.tgz", - "integrity": "sha512-7UvmKalWRt1wgjL1RrGxoSJW/0QZFIegpeGvZG9kjp8vrRu55XTHbwnqq2GpXm9uLbcuhxm3IqX9OB4MZR1b2A==", - "dev": true, - "license": "MIT" - }, - "node_modules/arrify": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/arrify/-/arrify-2.0.1.tgz", - "integrity": 
"sha512-3duEwti880xqi4eAMN8AyR4a0ByT90zoYdLlevfrvU43vb0YZwZVfxOgxWrLXXXpyugL0hNZc9G6BiB5B3nUug==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/assertion-error": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", - "integrity": "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - } - }, - "node_modules/ast-v8-to-istanbul": { - "version": "0.3.11", - "resolved": "https://registry.npmjs.org/ast-v8-to-istanbul/-/ast-v8-to-istanbul-0.3.11.tgz", - "integrity": "sha512-Qya9fkoofMjCBNVdWINMjB5KZvkYfaO9/anwkWnjxibpWUxo5iHl2sOdP7/uAqaRuUYuoo8rDwnbaaKVFxoUvw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/trace-mapping": "^0.3.31", - "estree-walker": "^3.0.3", - "js-tokens": "^10.0.0" - } - }, - "node_modules/async": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", - "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", - "license": "MIT" - }, - "node_modules/async-retry": { - "version": "1.3.3", - "resolved": "https://registry.npmjs.org/async-retry/-/async-retry-1.3.3.tgz", - "integrity": "sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==", - "license": "MIT", - "dependencies": { - "retry": "0.13.1" - } - }, - "node_modules/asynckit": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", - "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", - "license": "MIT" - }, - "node_modules/aws-ssl-profiles": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/aws-ssl-profiles/-/aws-ssl-profiles-1.1.2.tgz", - "integrity": "sha512-NZKeq9AfyQvEeNlN0zSYAaWrmBffJh3IELMZfRpJVWgrpEbtEpnjvzqBPf+mxoI287JohRDoa+/nsfqqiZmF6g==", - "license": "MIT", - "engines": { - "node": ">= 6.0.0" - } - }, - "node_modules/base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, - "node_modules/bignumber.js": { - "version": "9.3.1", - "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.3.1.tgz", - "integrity": "sha512-Ko0uX15oIUS7wJ3Rb30Fs6SkVbLmPBAKdlm7q9+ak9bbIeFf0MwuBsQV6z7+X768/cHsfg+WlysDWJcmthjsjQ==", - "license": "MIT", - "engines": { - "node": "*" - } - }, - "node_modules/bl": { - "version": "6.1.6", - "resolved": "https://registry.npmjs.org/bl/-/bl-6.1.6.tgz", - "integrity": "sha512-jLsPgN/YSvPUg9UX0Kd73CXpm2Psg9FxMeCSXnk3WBO3CMT10JMwijubhGfHCnFu6TPn1ei3b975dxv7K2pWVg==", - "license": "MIT", - "dependencies": { - "@types/readable-stream": "^4.0.0", - "buffer": "^6.0.3", - "inherits": "^2.0.4", - "readable-stream": "^4.2.0" - } - }, - "node_modules/bl/node_modules/readable-stream": { - "version": "4.7.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-4.7.0.tgz", - "integrity": 
"sha512-oIGGmcpTLwPga8Bn6/Z75SVaH1z5dUut2ibSyAMVhmUggWpmDn2dapB0n7f8nwaSiRtepAsfJyfXIO5DCVAODg==", - "license": "MIT", - "dependencies": { - "abort-controller": "^3.0.0", - "buffer": "^6.0.3", - "events": "^3.3.0", - "process": "^0.11.10", - "string_decoder": "^1.3.0" - }, - "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - } - }, - "node_modules/bowser": { - "version": "2.13.1", - "resolved": "https://registry.npmjs.org/bowser/-/bowser-2.13.1.tgz", - "integrity": "sha512-OHawaAbjwx6rqICCKgSG0SAnT05bzd7ppyKLVUITZpANBaaMFBAsaNkto3LoQ31tyFP5kNujE8Cdx85G9VzOkw==", - "license": "MIT" - }, - "node_modules/buffer": { - "version": "6.0.3", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", - "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT", - "dependencies": { - "base64-js": "^1.3.1", - "ieee754": "^1.2.1" - } - }, - "node_modules/buffer-equal-constant-time": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/buffer-equal-constant-time/-/buffer-equal-constant-time-1.0.1.tgz", - "integrity": "sha512-zRpUiDwd/xk6ADqPMATG8vc9VPrkck7T07OIx0gnjmJAnHnTVXNQG3vfvWNuiZIkwu9KrKdA1iJKfsfTVxE6NA==", - "license": "BSD-3-Clause" - }, - "node_modules/bundle-name": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", - "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", - "license": "MIT", - "dependencies": { - "run-applescript": "^7.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/bundle-require": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/bundle-require/-/bundle-require-5.1.0.tgz", - "integrity": "sha512-3WrrOuZiyaaZPWiEt4G3+IffISVC9HYlWueJEBWED4ZH4aIAC2PnkdnuRrR94M+w6yGWn4AglWtJtBI8YqvgoA==", - "dev": true, - "license": "MIT", - "dependencies": { - "load-tsconfig": "^0.2.3" - }, - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - }, - "peerDependencies": { - "esbuild": ">=0.18" - } - }, - "node_modules/cac": { - "version": "6.7.14", - "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", - "integrity": "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/call-bind-apply-helpers": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", - "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/chai": { - "version": "6.2.2", - "resolved": "https://registry.npmjs.org/chai/-/chai-6.2.2.tgz", - "integrity": "sha512-NUPRluOfOiTKBKvWPtSD4PhFvWCqOi0BGStNWs57X9js7XGTprSmFoz5F0tWhR4WPjNeR9jXqdC7/UpSJTnlRg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/chokidar": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", - 
"integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", - "dev": true, - "license": "MIT", - "dependencies": { - "readdirp": "^4.0.1" - }, - "engines": { - "node": ">= 14.16.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" - } - }, - "node_modules/cliui": { - "version": "8.0.1", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", - "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "license": "ISC", - "dependencies": { - "string-width": "^4.2.0", - "strip-ansi": "^6.0.1", - "wrap-ansi": "^7.0.0" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/color": { - "version": "5.0.3", - "resolved": "https://registry.npmjs.org/color/-/color-5.0.3.tgz", - "integrity": "sha512-ezmVcLR3xAVp8kYOm4GS45ZLLgIE6SPAFoduLr6hTDajwb3KZ2F46gulK3XpcwRFb5KKGCSezCBAY4Dw4HsyXA==", - "license": "MIT", - "dependencies": { - "color-convert": "^3.1.3", - "color-string": "^2.1.3" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/color-convert": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-3.1.3.tgz", - "integrity": "sha512-fasDH2ont2GqF5HpyO4w0+BcewlhHEZOFn9c1ckZdHpJ56Qb7MHhH/IcJZbBGgvdtwdwNbLvxiBEdg336iA9Sg==", - "license": "MIT", - "dependencies": { - "color-name": "^2.0.0" - }, - "engines": { - "node": ">=14.6" - } - }, - "node_modules/color-name": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-2.1.0.tgz", - "integrity": "sha512-1bPaDNFm0axzE4MEAzKPuqKWeRaT43U/hyxKPBdqTfmPF+d6n7FSoTFxLVULUJOmiLp01KjhIPPH+HrXZJN4Rg==", - "license": "MIT", - "engines": { - "node": ">=12.20" - } - }, - "node_modules/color-string": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/color-string/-/color-string-2.1.4.tgz", - "integrity": "sha512-Bb6Cq8oq0IjDOe8wJmi4JeNn763Xs9cfrBcaylK1tPypWzyoy2G3l90v9k64kjphl/ZJjPIShFztenRomi8WTg==", - "license": "MIT", - "dependencies": { - "color-name": "^2.0.0" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/combined-stream": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", - "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", - "license": "MIT", - "dependencies": { - "delayed-stream": "~1.0.0" - }, - "engines": { - "node": ">= 0.8" - } - }, - "node_modules/commander": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/commander/-/commander-4.1.1.tgz", - "integrity": "sha512-NOKm8xhkzAjzFx8B2v5OAHT+u5pRQc2UCa2Vq9jYL/31o2wi9mxBA7LIFs3sV5VSC49z6pEhfbMULvShKj26WA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/confbox": { - "version": "0.1.8", - "resolved": "https://registry.npmjs.org/confbox/-/confbox-0.1.8.tgz", - "integrity": "sha512-RMtmw0iFkeR4YV+fUOSucriAQNb9g8zFR52MWCtl+cCZOFRNL6zeB395vPzFhEjjn4fMxXudmELnl/KF/WrK6w==", - "dev": true, - "license": "MIT" - }, - "node_modules/consola": { - "version": "3.4.2", - "resolved": "https://registry.npmjs.org/consola/-/consola-3.4.2.tgz", - "integrity": "sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^14.18.0 || >=16.10.0" - } - }, - "node_modules/cross-fetch": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/cross-fetch/-/cross-fetch-3.2.0.tgz", - 
"integrity": "sha512-Q+xVJLoGOeIMXZmbUK4HYk+69cQH6LudR0Vu/pRm2YlU/hDV9CiS0gKUMaWY5f2NeUH9C1nV3bsTlCo0FsTV1Q==", - "license": "MIT", - "dependencies": { - "node-fetch": "^2.7.0" - } - }, - "node_modules/csv-parse": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-6.1.0.tgz", - "integrity": "sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==", - "license": "MIT" - }, - "node_modules/dayjs": { - "version": "1.11.19", - "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.19.tgz", - "integrity": "sha512-t5EcLVS6QPBNqM2z8fakk/NKel+Xzshgt8FFKAn+qwlD1pzZWxh0nVCrvFK7ZDb6XucZeF9z8C7CBWTRIVApAw==", - "license": "MIT" - }, - "node_modules/debug": { - "version": "4.4.3", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", - "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", - "license": "MIT", - "dependencies": { - "ms": "^2.1.3" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, - "node_modules/default-browser": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", - "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", - "license": "MIT", - "dependencies": { - "bundle-name": "^4.1.0", - "default-browser-id": "^5.0.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/default-browser-id": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", - "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/define-lazy-prop": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", - "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/delayed-stream": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", - "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", - "license": "MIT", - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/denque": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/denque/-/denque-2.1.0.tgz", - "integrity": "sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==", - "license": "Apache-2.0", - "engines": { - "node": ">=0.10" - } - }, - "node_modules/dunder-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", - "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.1", - "es-errors": "^1.3.0", - "gopd": "^1.2.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/duplexify": { - "version": "4.1.3", - "resolved": 
"https://registry.npmjs.org/duplexify/-/duplexify-4.1.3.tgz", - "integrity": "sha512-M3BmBhwJRZsSx38lZyhE53Csddgzl5R7xGJNk7CVddZD6CcmwMCH8J+7AprIrQKH7TonKxaCjcv27Qmf+sQ+oA==", - "license": "MIT", - "dependencies": { - "end-of-stream": "^1.4.1", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1", - "stream-shift": "^1.0.2" - } - }, - "node_modules/ecdsa-sig-formatter": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", - "integrity": "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ==", - "license": "Apache-2.0", - "dependencies": { - "safe-buffer": "^5.0.1" - } - }, - "node_modules/effection": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/effection/-/effection-4.0.2.tgz", - "integrity": "sha512-O8WMGP10nPuJDwbNGILcaCNWS+CvDYjcdsUSD79nWZ+WtUQ8h1MEV7JJwCSZCSeKx8+TdEaZ/8r6qPTR2o/o8w==", - "license": "MIT", - "engines": { - "node": ">= 16" - } - }, - "node_modules/emoji-regex": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "license": "MIT" - }, - "node_modules/enabled": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz", - "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==", - "license": "MIT" - }, - "node_modules/end-of-stream": { - "version": "1.4.5", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", - "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", - "license": "MIT", - "dependencies": { - "once": "^1.4.0" - } - }, - "node_modules/es-define-property": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", - "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-errors": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", - "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-module-lexer": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-1.7.0.tgz", - "integrity": "sha512-jEQoCwk8hyb2AZziIOLhDqpm5+2ww5uIE6lkO/6jcOCusfk6LhMHpXXfBLXTZ7Ydyt0j4VoUQv6uGNYbdW+kBA==", - "dev": true, - "license": "MIT" - }, - "node_modules/es-object-atoms": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", - "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/es-set-tostringtag": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", - "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", - "license": "MIT", - "dependencies": { - "es-errors": "^1.3.0", - "get-intrinsic": "^1.2.6", - "has-tostringtag": "^1.0.2", - 
"hasown": "^2.0.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/esbuild": { - "version": "0.27.3", - "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.3.tgz", - "integrity": "sha512-8VwMnyGCONIs6cWue2IdpHxHnAjzxnw2Zr7MkVxB2vjmQ2ivqGFb4LEG3SMnv0Gb2F/G/2yA8zUaiL1gywDCCg==", - "dev": true, - "hasInstallScript": true, - "license": "MIT", - "bin": { - "esbuild": "bin/esbuild" - }, - "engines": { - "node": ">=18" - }, - "optionalDependencies": { - "@esbuild/aix-ppc64": "0.27.3", - "@esbuild/android-arm": "0.27.3", - "@esbuild/android-arm64": "0.27.3", - "@esbuild/android-x64": "0.27.3", - "@esbuild/darwin-arm64": "0.27.3", - "@esbuild/darwin-x64": "0.27.3", - "@esbuild/freebsd-arm64": "0.27.3", - "@esbuild/freebsd-x64": "0.27.3", - "@esbuild/linux-arm": "0.27.3", - "@esbuild/linux-arm64": "0.27.3", - "@esbuild/linux-ia32": "0.27.3", - "@esbuild/linux-loong64": "0.27.3", - "@esbuild/linux-mips64el": "0.27.3", - "@esbuild/linux-ppc64": "0.27.3", - "@esbuild/linux-riscv64": "0.27.3", - "@esbuild/linux-s390x": "0.27.3", - "@esbuild/linux-x64": "0.27.3", - "@esbuild/netbsd-arm64": "0.27.3", - "@esbuild/netbsd-x64": "0.27.3", - "@esbuild/openbsd-arm64": "0.27.3", - "@esbuild/openbsd-x64": "0.27.3", - "@esbuild/openharmony-arm64": "0.27.3", - "@esbuild/sunos-x64": "0.27.3", - "@esbuild/win32-arm64": "0.27.3", - "@esbuild/win32-ia32": "0.27.3", - "@esbuild/win32-x64": "0.27.3" - } - }, - "node_modules/escalade": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", - "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/estree-walker": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", - "integrity": "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/estree": "^1.0.0" - } - }, - "node_modules/event-target-shim": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", - "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", - "license": "MIT", - "engines": { - "node": ">=6" - } - }, - "node_modules/events": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", - "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", - "license": "MIT", - "engines": { - "node": ">=0.8.x" - } - }, - "node_modules/eventsource-parser": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz", - "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==", - "license": "MIT", - "engines": { - "node": ">=18.0.0" - } - }, - "node_modules/expect-type": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz", - "integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=12.0.0" - } - }, - "node_modules/extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": 
"sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", - "license": "MIT" - }, - "node_modules/fast-xml-parser": { - "version": "5.3.4", - "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.3.4.tgz", - "integrity": "sha512-EFd6afGmXlCx8H8WTZHhAoDaWaGyuIBoZJ2mknrNxug+aZKjkp0a0dlars9Izl+jF+7Gu1/5f/2h68cQpe0IiA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/NaturalIntelligence" - } - ], - "license": "MIT", - "dependencies": { - "strnum": "^2.1.0" - }, - "bin": { - "fxparser": "src/cli/cli.js" - } - }, - "node_modules/fdir": { - "version": "6.5.0", - "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", - "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12.0.0" - }, - "peerDependencies": { - "picomatch": "^3 || ^4" - }, - "peerDependenciesMeta": { - "picomatch": { - "optional": true - } - } - }, - "node_modules/fecha": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", - "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==", - "license": "MIT" - }, - "node_modules/fix-dts-default-cjs-exports": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/fix-dts-default-cjs-exports/-/fix-dts-default-cjs-exports-1.0.1.tgz", - "integrity": "sha512-pVIECanWFC61Hzl2+oOCtoJ3F17kglZC/6N94eRWycFgBH35hHx0Li604ZIzhseh97mf2p0cv7vVrOZGoqhlEg==", - "dev": true, - "license": "MIT", - "dependencies": { - "magic-string": "^0.30.17", - "mlly": "^1.7.4", - "rollup": "^4.34.8" - } - }, - "node_modules/fn.name": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz", - "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==", - "license": "MIT" - }, - "node_modules/form-data": { - "version": "2.5.5", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.5.5.tgz", - "integrity": "sha512-jqdObeR2rxZZbPSGL+3VckHMYtu+f9//KXBsVny6JSX/pa38Fy+bGjuG8eW/H6USNQWhLi8Num++cU2yOCNz4A==", - "license": "MIT", - "dependencies": { - "asynckit": "^0.4.0", - "combined-stream": "^1.0.8", - "es-set-tostringtag": "^2.1.0", - "hasown": "^2.0.2", - "mime-types": "^2.1.35", - "safe-buffer": "^5.2.1" - }, - "engines": { - "node": ">= 0.12" - } - }, - "node_modules/fsevents": { - "version": "2.3.3", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", - "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", - "dev": true, - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], - "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, - "node_modules/function-bind": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", - "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/gaxios": { - "version": "6.7.1", - "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz", - "integrity": "sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==", - "license": "Apache-2.0", - "dependencies": { - "extend": "^3.0.2", - 
"https-proxy-agent": "^7.0.1", - "is-stream": "^2.0.0", - "node-fetch": "^2.6.9", - "uuid": "^9.0.1" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/gaxios/node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/gcp-metadata": { - "version": "6.1.1", - "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.1.tgz", - "integrity": "sha512-a4tiq7E0/5fTjxPAaH4jpjkSv/uCaU2p5KC6HVGrvl0cDjA8iBZv4vv1gyzlmK0ZUKqwpOyQMKzZQe3lTit77A==", - "license": "Apache-2.0", - "dependencies": { - "gaxios": "^6.1.1", - "google-logging-utils": "^0.0.2", - "json-bigint": "^1.0.0" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/generate-function": { - "version": "2.3.1", - "resolved": "https://registry.npmjs.org/generate-function/-/generate-function-2.3.1.tgz", - "integrity": "sha512-eeB5GfMNeevm/GRYq20ShmsaGcmI81kIX2K9XQx5miC8KdHaC6Jm0qQ8ZNeGOi7wYB8OsdxKs+Y2oVuTFuVwKQ==", - "license": "MIT", - "dependencies": { - "is-property": "^1.0.2" - } - }, - "node_modules/generic-pool": { - "version": "3.9.0", - "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz", - "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==", - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, - "node_modules/get-caller-file": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", - "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", - "license": "ISC", - "engines": { - "node": "6.* || 8.* || >= 10.*" - } - }, - "node_modules/get-intrinsic": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", - "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", - "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.2", - "es-define-property": "^1.0.1", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.1.1", - "function-bind": "^1.1.2", - "get-proto": "^1.0.1", - "gopd": "^1.2.0", - "has-symbols": "^1.1.0", - "hasown": "^2.0.2", - "math-intrinsics": "^1.1.0" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/get-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", - "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", - "license": "MIT", - "dependencies": { - "dunder-proto": "^1.0.1", - "es-object-atoms": "^1.0.0" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/glob": { - "version": "13.0.1", - "resolved": "https://registry.npmjs.org/glob/-/glob-13.0.1.tgz", - "integrity": "sha512-B7U/vJpE3DkJ5WXTgTpTRN63uV42DseiXXKMwG14LQBXmsdeIoHAPbU/MEo6II0k5ED74uc2ZGTC6MwHFQhF6w==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "minimatch": "^10.1.2", - "minipass": "^7.1.2", - "path-scurry": "^2.0.0" - }, - "engines": { - "node": "20 || >=22" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - 
"node_modules/google-auth-library": { - "version": "9.15.1", - "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.15.1.tgz", - "integrity": "sha512-Jb6Z0+nvECVz+2lzSMt9u98UsoakXxA2HGHMCxh+so3n90XgYWkq5dur19JAJV7ONiJY22yBTyJB1TSkvPq9Ng==", - "license": "Apache-2.0", - "dependencies": { - "base64-js": "^1.3.0", - "ecdsa-sig-formatter": "^1.0.11", - "gaxios": "^6.1.1", - "gcp-metadata": "^6.1.0", - "gtoken": "^7.0.0", - "jws": "^4.0.0" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/google-logging-utils": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/google-logging-utils/-/google-logging-utils-0.0.2.tgz", - "integrity": "sha512-NEgUnEcBiP5HrPzufUkBzJOD/Sxsco3rLNo1F1TNf7ieU8ryUzBhqba8r756CjLX7rn3fHl6iLEwPYuqpoKgQQ==", - "license": "Apache-2.0", - "engines": { - "node": ">=14" - } - }, - "node_modules/gopd": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", - "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/graphology": { - "version": "0.26.0", - "resolved": "https://registry.npmjs.org/graphology/-/graphology-0.26.0.tgz", - "integrity": "sha512-8SSImzgUUYC89Z042s+0r/vMibY7GX/Emz4LDO5e7jYXhuoWfHISPFJYjpRLUSJGq6UQ6xlenvX1p/hJdfXuXg==", - "license": "MIT", - "dependencies": { - "events": "^3.3.0" - }, - "peerDependencies": { - "graphology-types": ">=0.24.0" - } - }, - "node_modules/graphology-dag": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/graphology-dag/-/graphology-dag-0.4.1.tgz", - "integrity": "sha512-3ch9oOAnHZDoT043vyg7ukmSkKJ505nFzaHaYOn0IF2PgGo5VtIavyVK4UpbIa4tli3hhGm1ZTdBsubTmaxu/w==", - "license": "MIT", - "dependencies": { - "graphology-utils": "^2.4.1", - "mnemonist": "^0.39.0" - }, - "peerDependencies": { - "graphology-types": ">=0.19.0" - } - }, - "node_modules/graphology-types": { - "version": "0.24.8", - "resolved": "https://registry.npmjs.org/graphology-types/-/graphology-types-0.24.8.tgz", - "integrity": "sha512-hDRKYXa8TsoZHjgEaysSRyPdT6uB78Ci8WnjgbStlQysz7xR52PInxNsmnB7IBOM1BhikxkNyCVEFgmPKnpx3Q==", - "license": "MIT" - }, - "node_modules/graphology-utils": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/graphology-utils/-/graphology-utils-2.5.2.tgz", - "integrity": "sha512-ckHg8MXrXJkOARk56ZaSCM1g1Wihe2d6iTmz1enGOz4W/l831MBCKSayeFQfowgF8wd+PQ4rlch/56Vs/VZLDQ==", - "license": "MIT", - "peerDependencies": { - "graphology-types": ">=0.23.0" - } - }, - "node_modules/graphql": { - "version": "16.12.0", - "resolved": "https://registry.npmjs.org/graphql/-/graphql-16.12.0.tgz", - "integrity": "sha512-DKKrynuQRne0PNpEbzuEdHlYOMksHSUI8Zc9Unei5gTsMNA2/vMpoMz/yKba50pejK56qj98qM0SjYxAKi13gQ==", - "license": "MIT", - "engines": { - "node": "^12.22.0 || ^14.16.0 || ^16.0.0 || >=17.0.0" - } - }, - "node_modules/graphql-request": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/graphql-request/-/graphql-request-6.1.0.tgz", - "integrity": "sha512-p+XPfS4q7aIpKVcgmnZKhMNqhltk20hfXtkaIkTfjjmiKMJ5xrt5c743cL03y/K7y1rg3WrIC49xGiEQ4mxdNw==", - "license": "MIT", - "dependencies": { - "@graphql-typed-document-node/core": "^3.2.0", - "cross-fetch": "^3.1.5" - }, - "peerDependencies": { - "graphql": "14 - 16" - } - }, - "node_modules/gtoken": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.1.0.tgz", - 
"integrity": "sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==", - "license": "MIT", - "dependencies": { - "gaxios": "^6.0.0", - "jws": "^4.0.0" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/has-flag": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", - "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/has-symbols": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", - "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/has-tostringtag": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", - "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", - "license": "MIT", - "dependencies": { - "has-symbols": "^1.0.3" - }, - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.2" - }, - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/hono": { - "version": "4.11.8", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.11.8.tgz", - "integrity": "sha512-eVkB/CYCCei7K2WElZW9yYQFWssG0DhaDhVvr7wy5jJ22K+ck8fWW0EsLpB0sITUTvPnc97+rrbQqIr5iqiy9Q==", - "license": "MIT", - "engines": { - "node": ">=16.9.0" - } - }, - "node_modules/html-entities": { - "version": "2.6.0", - "resolved": "https://registry.npmjs.org/html-entities/-/html-entities-2.6.0.tgz", - "integrity": "sha512-kig+rMn/QOVRvr7c86gQ8lWXq+Hkv6CbAH1hLu+RG338StTpE8Z0b44SDVaqVu7HGKf27frdmUYEs9hTUX/cLQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/mdevils" - }, - { - "type": "patreon", - "url": "https://patreon.com/mdevils" - } - ], - "license": "MIT" - }, - "node_modules/html-escaper": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", - "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", - "dev": true, - "license": "MIT" - }, - "node_modules/http-proxy-agent": { - "version": "7.0.2", - "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", - "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", - "license": "MIT", - "dependencies": { - "agent-base": "^7.1.0", - "debug": "^4.3.4" - }, - "engines": { - "node": ">= 14" - } - }, - "node_modules/https-proxy-agent": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", - "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", - "license": "MIT", - "dependencies": { - "agent-base": "^7.1.2", - "debug": "4" - }, - "engines": { - "node": ">= 14" - } - }, - 
"node_modules/iconv-lite": { - "version": "0.7.2", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", - "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, - "node_modules/ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "BSD-3-Clause" - }, - "node_modules/inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", - "license": "ISC" - }, - "node_modules/is-docker": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", - "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", - "license": "MIT", - "bin": { - "is-docker": "cli.js" - }, - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/is-inside-container": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", - "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", - "license": "MIT", - "dependencies": { - "is-docker": "^3.0.0" - }, - "bin": { - "is-inside-container": "cli.js" - }, - "engines": { - "node": ">=14.16" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-property": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/is-property/-/is-property-1.0.2.tgz", - "integrity": "sha512-Ks/IoX00TtClbGQr4TWXemAnktAQvYB7HzcCxDGqEZU6oCmb2INHuOoKxbtR+HFkmYWBKv/dOZtGRiAjDhj92g==", - "license": "MIT" - }, - "node_modules/is-stream": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", - "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", - "license": "MIT", - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/is-wsl": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz", - "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==", - "license": "MIT", - "dependencies": { - "is-inside-container": "^1.0.0" - }, - "engines": { - "node": ">=16" - }, - "funding": { - "url": 
"https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/istanbul-lib-coverage": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", - "integrity": "sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg==", - "dev": true, - "license": "BSD-3-Clause", - "engines": { - "node": ">=8" - } - }, - "node_modules/istanbul-lib-report": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/istanbul-lib-report/-/istanbul-lib-report-3.0.1.tgz", - "integrity": "sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw==", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "istanbul-lib-coverage": "^3.0.0", - "make-dir": "^4.0.0", - "supports-color": "^7.1.0" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/istanbul-reports": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/istanbul-reports/-/istanbul-reports-3.2.0.tgz", - "integrity": "sha512-HGYWWS/ehqTV3xN10i23tkPkpH46MLCIMFNCaaKNavAXTF1RkqxawEPtnjnGZ6XKSInBKkiOA5BKS+aZiY3AvA==", - "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "html-escaper": "^2.0.0", - "istanbul-lib-report": "^3.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/joycon": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/joycon/-/joycon-3.1.1.tgz", - "integrity": "sha512-34wB/Y7MW7bzjKRjUKTa46I2Z7eV62Rkhva+KkopW7Qvv/OSWBqvkSY7vusOPrNuZcUG3tApvdVgNB8POj3SPw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - } - }, - "node_modules/js-md4": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/js-md4/-/js-md4-0.3.2.tgz", - "integrity": "sha512-/GDnfQYsltsjRswQhN9fhv3EMw2sCpUdrdxyWDOUK7eyD++r3gRhzgiQgc/x4MAv2i1iuQ4lxO5mvqM3vj4bwA==", - "license": "MIT" - }, - "node_modules/js-tokens": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-10.0.0.tgz", - "integrity": "sha512-lM/UBzQmfJRo9ABXbPWemivdCW8V2G8FHaHdypQaIy523snUjog0W71ayWXTjiR+ixeMyVHN2XcpnTd/liPg/Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/json-bigint": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", - "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", - "license": "MIT", - "dependencies": { - "bignumber.js": "^9.0.0" - } - }, - "node_modules/json-schema": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", - "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", - "license": "(AFL-2.1 OR BSD-3-Clause)" - }, - "node_modules/jsonwebtoken": { - "version": "9.0.3", - "resolved": "https://registry.npmjs.org/jsonwebtoken/-/jsonwebtoken-9.0.3.tgz", - "integrity": "sha512-MT/xP0CrubFRNLNKvxJ2BYfy53Zkm++5bX9dtuPbqAeQpTVe0MQTFhao8+Cp//EmJp244xt6Drw/GVEGCUj40g==", - "license": "MIT", - "dependencies": { - "jws": "^4.0.1", - "lodash.includes": "^4.3.0", - "lodash.isboolean": "^3.0.3", - "lodash.isinteger": "^4.0.4", - "lodash.isnumber": "^3.0.3", - "lodash.isplainobject": "^4.0.6", - "lodash.isstring": "^4.0.1", - "lodash.once": "^4.0.0", - "ms": "^2.1.1", - "semver": "^7.5.4" - }, - "engines": { - "node": ">=12", - "npm": ">=6" - } - }, - "node_modules/jwa": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", - "integrity": 
"sha512-hRF04fqJIP8Abbkq5NKGN0Bbr3JxlQ+qhZufXVr0DvujKy93ZCbXZMHDL4EOtodSbCWxOqR8MS1tXA5hwqCXDg==", - "license": "MIT", - "dependencies": { - "buffer-equal-constant-time": "^1.0.1", - "ecdsa-sig-formatter": "1.0.11", - "safe-buffer": "^5.0.1" - } - }, - "node_modules/jws": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.1.tgz", - "integrity": "sha512-EKI/M/yqPncGUUh44xz0PxSidXFr/+r0pA70+gIYhjv+et7yxM+s29Y+VGDkovRofQem0fs7Uvf4+YmAdyRduA==", - "license": "MIT", - "dependencies": { - "jwa": "^2.0.1", - "safe-buffer": "^5.0.1" - } - }, - "node_modules/kuler": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz", - "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==", - "license": "MIT" - }, - "node_modules/kysely": { - "version": "0.28.11", - "resolved": "https://registry.npmjs.org/kysely/-/kysely-0.28.11.tgz", - "integrity": "sha512-zpGIFg0HuoC893rIjYX1BETkVWdDnzTzF5e0kWXJFg5lE0k1/LfNWBejrcnOFu8Q2Rfq/hTDTU7XLUM8QOrpzg==", - "license": "MIT", - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/lilconfig": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-3.1.3.tgz", - "integrity": "sha512-/vlFKAoH5Cgt3Ie+JLhRbwOsCQePABiU3tJ1egGvyQ+33R/vcwM2Zl2QR/LzjsBeItPt3oSVXapn+m4nQDvpzw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14" - }, - "funding": { - "url": "https://github.com/sponsors/antonk52" - } - }, - "node_modules/lines-and-columns": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", - "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", - "dev": true, - "license": "MIT" - }, - "node_modules/load-tsconfig": { - "version": "0.2.5", - "resolved": "https://registry.npmjs.org/load-tsconfig/-/load-tsconfig-0.2.5.tgz", - "integrity": "sha512-IXO6OCs9yg8tMKzfPZ1YmheJbZCiEsnBdcB03l0OcfK9prKnJb96siuHCr5Fl37/yo9DnKU+TLpxzTUspw9shg==", - "dev": true, - "license": "MIT", - "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" - } - }, - "node_modules/lodash.camelcase": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", - "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==", - "license": "MIT" - }, - "node_modules/lodash.includes": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/lodash.includes/-/lodash.includes-4.3.0.tgz", - "integrity": "sha512-W3Bx6mdkRTGtlJISOvVD/lbqjTlPPUDTMnlXZFnVwi9NKJ6tiAk6LVdlhZMm17VZisqhKcgzpO5Wz91PCt5b0w==", - "license": "MIT" - }, - "node_modules/lodash.isboolean": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/lodash.isboolean/-/lodash.isboolean-3.0.3.tgz", - "integrity": "sha512-Bz5mupy2SVbPHURB98VAcw+aHh4vRV5IPNhILUCsOzRmsTmSQ17jIuqopAentWoehktxGd9e/hbIXq980/1QJg==", - "license": "MIT" - }, - "node_modules/lodash.isinteger": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/lodash.isinteger/-/lodash.isinteger-4.0.4.tgz", - "integrity": "sha512-DBwtEWN2caHQ9/imiNeEA5ys1JoRtRfY3d7V9wkqtbycnAmTvRRmbHKDV4a0EYc678/dia0jrte4tjYwVBaZUA==", - "license": "MIT" - }, - "node_modules/lodash.isnumber": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/lodash.isnumber/-/lodash.isnumber-3.0.3.tgz", - "integrity": 
"sha512-QYqzpfwO3/CWf3XP+Z+tkQsfaLL/EnUlXWVkIk5FUPc4sBdTehEqZONuyRt2P67PXAk+NXmTBcc97zw9t1FQrw==", - "license": "MIT" - }, - "node_modules/lodash.isplainobject": { - "version": "4.0.6", - "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", - "integrity": "sha512-oSXzaWypCMHkPC3NvBEaPHf0KsA5mvPrOPgQWDsbg8n7orZ290M0BmC/jgRZ4vcJ6DTAhjrsSYgdsW/F+MFOBA==", - "license": "MIT" - }, - "node_modules/lodash.isstring": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/lodash.isstring/-/lodash.isstring-4.0.1.tgz", - "integrity": "sha512-0wJxfxH1wgO3GrbuP+dTTk7op+6L41QCXbGINEmD+ny/G/eCqGzxyCsh7159S+mgDDcoarnBw6PC1PS5+wUGgw==", - "license": "MIT" - }, - "node_modules/lodash.once": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/lodash.once/-/lodash.once-4.1.1.tgz", - "integrity": "sha512-Sb487aTOCr9drQVL8pIxOzVhafOjZN9UU54hiN8PU3uAiSV7lx1yYNpbNmex2PK6dSJoNTSJUUswT651yww3Mg==", - "license": "MIT" - }, - "node_modules/logform": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz", - "integrity": "sha512-TFYA4jnP7PVbmlBIfhlSe+WKxs9dklXMTEGcBCIvLhE/Tn3H6Gk1norupVW7m5Cnd4bLcr08AytbyV/xj7f/kQ==", - "license": "MIT", - "dependencies": { - "@colors/colors": "1.6.0", - "@types/triple-beam": "^1.3.2", - "fecha": "^4.2.0", - "ms": "^2.1.1", - "safe-stable-stringify": "^2.3.1", - "triple-beam": "^1.3.0" - }, - "engines": { - "node": ">= 12.0.0" - } - }, - "node_modules/long": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", - "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", - "license": "Apache-2.0" - }, - "node_modules/lru-cache": { - "version": "11.2.5", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.2.5.tgz", - "integrity": "sha512-vFrFJkWtJvJnD5hg+hJvVE8Lh/TcMzKnTgCWmtBipwI5yLX/iX+5UB2tfuyODF5E7k9xEzMdYgGqaSb1c0c5Yw==", - "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": "20 || >=22" - } - }, - "node_modules/lru.min": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/lru.min/-/lru.min-1.1.4.tgz", - "integrity": "sha512-DqC6n3QQ77zdFpCMASA1a3Jlb64Hv2N2DciFGkO/4L9+q/IpIAuRlKOvCXabtRW6cQf8usbmM6BE/TOPysCdIA==", - "license": "MIT", - "engines": { - "bun": ">=1.0.0", - "deno": ">=1.30.0", - "node": ">=8.0.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wellwelwel" - } - }, - "node_modules/magic-bytes.js": { - "version": "1.13.0", - "resolved": "https://registry.npmjs.org/magic-bytes.js/-/magic-bytes.js-1.13.0.tgz", - "integrity": "sha512-afO2mnxW7GDTXMm5/AoN1WuOcdoKhtgXjIvHmobqTD1grNplhGdv3PFOyjCVmrnOZBIT/gD/koDKpYG+0mvHcg==", - "license": "MIT" - }, - "node_modules/magic-string": { - "version": "0.30.21", - "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", - "integrity": "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/sourcemap-codec": "^1.5.5" - } - }, - "node_modules/magicast": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/magicast/-/magicast-0.5.2.tgz", - "integrity": "sha512-E3ZJh4J3S9KfwdjZhe2afj6R9lGIN5Pher1pF39UGrXRqq/VDaGVIGN13BjHd2u8B61hArAGOnso7nBOouW3TQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@babel/parser": "^7.29.0", - "@babel/types": "^7.29.0", - "source-map-js": "^1.2.1" - } - }, - 
"node_modules/make-dir": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-4.0.0.tgz", - "integrity": "sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==", - "dev": true, - "license": "MIT", - "dependencies": { - "semver": "^7.5.3" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/math-intrinsics": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", - "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - } - }, - "node_modules/mime": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/mime/-/mime-3.0.0.tgz", - "integrity": "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==", - "license": "MIT", - "bin": { - "mime": "cli.js" - }, - "engines": { - "node": ">=10.0.0" - } - }, - "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", - "license": "MIT", - "dependencies": { - "mime-db": "1.52.0" - }, - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/minimatch": { - "version": "10.1.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.1.2.tgz", - "integrity": "sha512-fu656aJ0n2kcXwsnwnv9g24tkU5uSmOlTjd6WyyaKm2Z+h1qmY6bAjrcaIxF/BslFqbZ8UBtbJi7KgQOZD2PTw==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "@isaacs/brace-expansion": "^5.0.1" - }, - "engines": { - "node": "20 || >=22" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/minipass": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", - "dev": true, - "license": "ISC", - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/mlly": { - "version": "1.8.0", - "resolved": "https://registry.npmjs.org/mlly/-/mlly-1.8.0.tgz", - "integrity": "sha512-l8D9ODSRWLe2KHJSifWGwBqpTZXIXTeo8mlKjY+E2HAakaTeNpqAyBZ8GSqLzHgw4XmHmC8whvpjJNMbFZN7/g==", - "dev": true, - "license": "MIT", - "dependencies": { - "acorn": "^8.15.0", - "pathe": "^2.0.3", - "pkg-types": "^1.3.1", - "ufo": "^1.6.1" - } - }, - "node_modules/mnemonist": { - "version": "0.39.8", - "resolved": "https://registry.npmjs.org/mnemonist/-/mnemonist-0.39.8.tgz", - "integrity": "sha512-vyWo2K3fjrUw8YeeZ1zF0fy6Mu59RHokURlld8ymdUPjMlD9EC9ov1/YPqTgqRvUN9nTr3Gqfz29LYAmu0PHPQ==", - "license": "MIT", - "dependencies": { - "obliterator": "^2.0.1" - } - }, - "node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "license": "MIT" - }, - "node_modules/mysql2": { - "version": "3.16.3", - "resolved": 
"https://registry.npmjs.org/mysql2/-/mysql2-3.16.3.tgz", - "integrity": "sha512-+3XhQEt4FEFuvGV0JjIDj4eP2OT/oIj/54dYvqhblnSzlfcxVOuj+cd15Xz6hsG4HU1a+A5+BA9gm0618C4z7A==", - "license": "MIT", - "dependencies": { - "aws-ssl-profiles": "^1.1.2", - "denque": "^2.1.0", - "generate-function": "^2.3.1", - "iconv-lite": "^0.7.2", - "long": "^5.3.2", - "lru.min": "^1.1.3", - "named-placeholders": "^1.1.6", - "seq-queue": "^0.0.5", - "sqlstring": "^2.3.3" - }, - "engines": { - "node": ">= 8.0" - } - }, - "node_modules/mz": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/mz/-/mz-2.7.0.tgz", - "integrity": "sha512-z81GNO7nnYMEhrGh9LeymoE4+Yr0Wn5McHIZMK5cfQCl+NDX08sCZgUc9/6MHni9IWuFLm1Z3HTCXu2z9fN62Q==", - "dev": true, - "license": "MIT", - "dependencies": { - "any-promise": "^1.0.0", - "object-assign": "^4.0.1", - "thenify-all": "^1.0.0" - } - }, - "node_modules/named-placeholders": { - "version": "1.1.6", - "resolved": "https://registry.npmjs.org/named-placeholders/-/named-placeholders-1.1.6.tgz", - "integrity": "sha512-Tz09sEL2EEuv5fFowm419c1+a/jSMiBjI9gHxVLrVdbUkkNUUfjsVYs9pVZu5oCon/kmRh9TfLEObFtkVxmY0w==", - "license": "MIT", - "dependencies": { - "lru.min": "^1.1.0" - }, - "engines": { - "node": ">=8.0.0" - } - }, - "node_modules/nanoid": { - "version": "3.3.11", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", - "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", - "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "bin": { - "nanoid": "bin/nanoid.cjs" - }, - "engines": { - "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" - } - }, - "node_modules/native-duplexpair": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/native-duplexpair/-/native-duplexpair-1.0.0.tgz", - "integrity": "sha512-E7QQoM+3jvNtlmyfqRZ0/U75VFgCls+fSkbml2MpgWkWyz3ox8Y58gNhfuziuQYGNNQAbFZJQck55LHCnCK6CA==", - "license": "MIT" - }, - "node_modules/nice-grpc": { - "version": "2.1.14", - "resolved": "https://registry.npmjs.org/nice-grpc/-/nice-grpc-2.1.14.tgz", - "integrity": "sha512-GK9pKNxlvnU5FAdaw7i2FFuR9CqBspcE+if2tqnKXBcE0R8525wj4BZvfcwj7FjvqbssqKxRHt2nwedalbJlww==", - "license": "MIT", - "dependencies": { - "@grpc/grpc-js": "^1.14.0", - "abort-controller-x": "^0.4.0", - "nice-grpc-common": "^2.0.2" - } - }, - "node_modules/nice-grpc-client-middleware-retry": { - "version": "3.1.13", - "resolved": "https://registry.npmjs.org/nice-grpc-client-middleware-retry/-/nice-grpc-client-middleware-retry-3.1.13.tgz", - "integrity": "sha512-Q9I/wm5lYkDTveKFirrTHBkBY137yavXZ4xQDXTPIycUp7aLXD8xPTHFhqtAFWUw05aS91uffZZRgdv3HS0y/g==", - "license": "MIT", - "dependencies": { - "abort-controller-x": "^0.4.0", - "nice-grpc-common": "^2.0.2" - } - }, - "node_modules/nice-grpc-client-middleware-retry/node_modules/abort-controller-x": { - "version": "0.4.3", - "resolved": "https://registry.npmjs.org/abort-controller-x/-/abort-controller-x-0.4.3.tgz", - "integrity": "sha512-VtUwTNU8fpMwvWGn4xE93ywbogTYsuT+AUxAXOeelbXuQVIwNmC5YLeho9sH4vZ4ITW8414TTAOG1nW6uIVHCA==", - "license": "MIT" - }, - "node_modules/nice-grpc-common": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/nice-grpc-common/-/nice-grpc-common-2.0.2.tgz", - "integrity": "sha512-7RNWbls5kAL1QVUOXvBsv1uO0wPQK3lHv+cY1gwkTzirnG1Nop4cBJZubpgziNbaVc/bl9QJcyvsf/NQxa3rjQ==", - "license": "MIT", - "dependencies": { - "ts-error": "^1.0.6" - } - }, - 
"node_modules/nice-grpc/node_modules/abort-controller-x": { - "version": "0.4.3", - "resolved": "https://registry.npmjs.org/abort-controller-x/-/abort-controller-x-0.4.3.tgz", - "integrity": "sha512-VtUwTNU8fpMwvWGn4xE93ywbogTYsuT+AUxAXOeelbXuQVIwNmC5YLeho9sH4vZ4ITW8414TTAOG1nW6uIVHCA==", - "license": "MIT" - }, - "node_modules/node-fetch": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", - "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", - "license": "MIT", - "dependencies": { - "whatwg-url": "^5.0.0" - }, - "engines": { - "node": "4.x || >=6.0.0" - }, - "peerDependencies": { - "encoding": "^0.1.0" - }, - "peerDependenciesMeta": { - "encoding": { - "optional": true - } - } - }, - "node_modules/object-assign": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/obliterator": { - "version": "2.0.5", - "resolved": "https://registry.npmjs.org/obliterator/-/obliterator-2.0.5.tgz", - "integrity": "sha512-42CPE9AhahZRsMNslczq0ctAEtqk8Eka26QofnqC346BZdHDySk3LWka23LI7ULIw11NmltpiLagIq8gBozxTw==", - "license": "MIT" - }, - "node_modules/obug": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz", - "integrity": "sha512-uTqF9MuPraAQ+IsnPf366RG4cP9RtUi7MLO1N3KEc+wb0a6yKpeL0lmk2IB1jY5KHPAlTc6T/JRdC/YqxHNwkQ==", - "dev": true, - "funding": [ - "https://github.com/sponsors/sxzz", - "https://opencollective.com/debug" - ], - "license": "MIT" - }, - "node_modules/once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "license": "ISC", - "dependencies": { - "wrappy": "1" - } - }, - "node_modules/one-time": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz", - "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==", - "license": "MIT", - "dependencies": { - "fn.name": "1.x.x" - } - }, - "node_modules/open": { - "version": "10.2.0", - "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz", - "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==", - "license": "MIT", - "dependencies": { - "default-browser": "^5.2.1", - "define-lazy-prop": "^3.0.0", - "is-inside-container": "^1.0.0", - "wsl-utils": "^0.1.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/openapi3-ts": { - "version": "4.5.0", - "resolved": "https://registry.npmjs.org/openapi3-ts/-/openapi3-ts-4.5.0.tgz", - "integrity": "sha512-jaL+HgTq2Gj5jRcfdutgRGLosCy/hT8sQf6VOy+P+g36cZOjI1iukdPnijC+4CmeRzg/jEllJUboEic2FhxhtQ==", - "license": "MIT", - "dependencies": { - "yaml": "^2.8.0" - } - }, - "node_modules/p-limit": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", - "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", - "license": "MIT", - "dependencies": { - "yocto-queue": "^0.1.0" - }, - "engines": { - "node": ">=10" - }, - "funding": 
{ - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/package-json-from-dist": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", - "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", - "dev": true, - "license": "BlueOak-1.0.0" - }, - "node_modules/path-scurry": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-2.0.1.tgz", - "integrity": "sha512-oWyT4gICAu+kaA7QWk/jvCHWarMKNs6pXOGWKDTr7cw4IGcUbW+PeTfbaQiLGheFRpjo6O9J0PmyMfQPjH71oA==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "lru-cache": "^11.0.0", - "minipass": "^7.1.2" - }, - "engines": { - "node": "20 || >=22" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/pathe": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz", - "integrity": "sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==", - "dev": true, - "license": "MIT" - }, - "node_modules/pg": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/pg/-/pg-8.18.0.tgz", - "integrity": "sha512-xqrUDL1b9MbkydY/s+VZ6v+xiMUmOUk7SS9d/1kpyQxoJ6U9AO1oIJyUWVZojbfe5Cc/oluutcgFG4L9RDP1iQ==", - "license": "MIT", - "dependencies": { - "pg-connection-string": "^2.11.0", - "pg-pool": "^3.11.0", - "pg-protocol": "^1.11.0", - "pg-types": "2.2.0", - "pgpass": "1.0.5" - }, - "engines": { - "node": ">= 16.0.0" - }, - "optionalDependencies": { - "pg-cloudflare": "^1.3.0" - }, - "peerDependencies": { - "pg-native": ">=3.0.1" - }, - "peerDependenciesMeta": { - "pg-native": { - "optional": true - } - } - }, - "node_modules/pg-cloudflare": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.3.0.tgz", - "integrity": "sha512-6lswVVSztmHiRtD6I8hw4qP/nDm1EJbKMRhf3HCYaqud7frGysPv7FYJ5noZQdhQtN2xJnimfMtvQq21pdbzyQ==", - "license": "MIT", - "optional": true - }, - "node_modules/pg-connection-string": { - "version": "2.11.0", - "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.11.0.tgz", - "integrity": "sha512-kecgoJwhOpxYU21rZjULrmrBJ698U2RxXofKVzOn5UDj61BPj/qMb7diYUR1nLScCDbrztQFl1TaQZT0t1EtzQ==", - "license": "MIT" - }, - "node_modules/pg-int8": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", - "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", - "license": "ISC", - "engines": { - "node": ">=4.0.0" - } - }, - "node_modules/pg-pool": { - "version": "3.11.0", - "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.11.0.tgz", - "integrity": "sha512-MJYfvHwtGp870aeusDh+hg9apvOe2zmpZJpyt+BMtzUWlVqbhFmMK6bOBXLBUPd7iRtIF9fZplDc7KrPN3PN7w==", - "license": "MIT", - "peerDependencies": { - "pg": ">=8.0" - } - }, - "node_modules/pg-protocol": { - "version": "1.11.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.11.0.tgz", - "integrity": "sha512-pfsxk2M9M3BuGgDOfuy37VNRRX3jmKgMjcvAcWqNDpZSf4cUmv8HSOl5ViRQFsfARFn0KuUQTgLxVMbNq5NW3g==", - "license": "MIT" - }, - "node_modules/pg-types": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", - "license": "MIT", - "dependencies": { - "pg-int8": 
"1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" - }, - "engines": { - "node": ">=4" - } - }, - "node_modules/pgpass": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/pgpass/-/pgpass-1.0.5.tgz", - "integrity": "sha512-FdW9r/jQZhSeohs1Z3sI1yxFQNFvMcnmfuj4WBMUTxOrAyLMaTcE1aAMBiTlbMNaXvBCQuVi0R7hd8udDSP7ug==", - "license": "MIT", - "dependencies": { - "split2": "^4.1.0" - } - }, - "node_modules/pgvector": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/pgvector/-/pgvector-0.2.1.tgz", - "integrity": "sha512-nKaQY9wtuiidwLMdVIce1O3kL0d+FxrigCVzsShnoqzOSaWWWOvuctb/sYwlai5cTwwzRSNa+a/NtN2kVZGNJw==", - "license": "MIT", - "engines": { - "node": ">= 18" - } - }, - "node_modules/picocolors": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", - "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", - "dev": true, - "license": "ISC" - }, - "node_modules/picomatch": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", - "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, - "node_modules/pirates": { - "version": "4.0.7", - "resolved": "https://registry.npmjs.org/pirates/-/pirates-4.0.7.tgz", - "integrity": "sha512-TfySrs/5nm8fQJDcBDuUng3VOUKsd7S+zqvbOTiGXHfxX4wK31ard+hoNuvkicM/2YFzlpDgABOevKSsB4G/FA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 6" - } - }, - "node_modules/pkg-types": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/pkg-types/-/pkg-types-1.3.1.tgz", - "integrity": "sha512-/Jm5M4RvtBFVkKWRu2BLUTNP8/M2a+UwuAX+ae4770q1qVGtfjG+WTCupoZixokjmHiry8uI+dlY8KXYV5HVVQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "confbox": "^0.1.8", - "mlly": "^1.7.4", - "pathe": "^2.0.1" - } - }, - "node_modules/postcss": { - "version": "8.5.6", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", - "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", - "dev": true, - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "tidelift", - "url": "https://tidelift.com/funding/github/npm/postcss" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "nanoid": "^3.3.11", - "picocolors": "^1.1.1", - "source-map-js": "^1.2.1" - }, - "engines": { - "node": "^10 || ^12 || >=14" - } - }, - "node_modules/postcss-load-config": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/postcss-load-config/-/postcss-load-config-6.0.1.tgz", - "integrity": "sha512-oPtTM4oerL+UXmx+93ytZVN82RrlY/wPUV8IeDxFrzIjXOLF1pN+EmKPLbubvKHT2HC20xXsCAH2Z+CKV6Oz/g==", - "dev": true, - "funding": [ - { - "type": "opencollective", - "url": "https://opencollective.com/postcss/" - }, - { - "type": "github", - "url": "https://github.com/sponsors/ai" - } - ], - "license": "MIT", - "dependencies": { - "lilconfig": "^3.1.1" - }, - "engines": { - "node": ">= 18" - }, - "peerDependencies": { - "jiti": ">=1.21.0", - "postcss": ">=8.0.9", - "tsx": "^4.8.1", - "yaml": "^2.4.2" - }, - "peerDependenciesMeta": { - "jiti": { - 
"optional": true - }, - "postcss": { - "optional": true - }, - "tsx": { - "optional": true - }, - "yaml": { - "optional": true - } - } - }, - "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", - "license": "MIT", - "engines": { - "node": ">=4" - } - }, - "node_modules/postgres-bytea": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.1.tgz", - "integrity": "sha512-5+5HqXnsZPE65IJZSMkZtURARZelel2oXUEO8rH83VS/hxH5vv1uHquPg5wZs8yMAfdv971IU+kcPUczi7NVBQ==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "license": "MIT", - "dependencies": { - "xtend": "^4.0.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/process": { - "version": "0.11.10", - "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", - "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", - "license": "MIT", - "engines": { - "node": ">= 0.6.0" - } - }, - "node_modules/protobufjs": { - "version": "7.5.4", - "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.4.tgz", - "integrity": "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg==", - "hasInstallScript": true, - "license": "BSD-3-Clause", - "dependencies": { - "@protobufjs/aspromise": "^1.1.2", - "@protobufjs/base64": "^1.1.2", - "@protobufjs/codegen": "^2.0.4", - "@protobufjs/eventemitter": "^1.1.0", - "@protobufjs/fetch": "^1.1.0", - "@protobufjs/float": "^1.0.2", - "@protobufjs/inquire": "^1.1.0", - "@protobufjs/path": "^1.1.2", - "@protobufjs/pool": "^1.1.0", - "@protobufjs/utf8": "^1.1.0", - "@types/node": ">=13.7.0", - "long": "^5.0.0" - }, - "engines": { - "node": ">=12.0.0" - } - }, - "node_modules/readable-stream": { - "version": "3.6.2", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", - "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", - "license": "MIT", - "dependencies": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/readdirp": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", - "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">= 14.18.0" - }, - "funding": { - "type": "individual", - "url": "https://paulmillr.com/funding/" - } - }, - "node_modules/require-directory": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", - "integrity": 
"sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/resolve-from": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-5.0.0.tgz", - "integrity": "sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=8" - } - }, - "node_modules/retry": { - "version": "0.13.1", - "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", - "integrity": "sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==", - "license": "MIT", - "engines": { - "node": ">= 4" - } - }, - "node_modules/retry-request": { - "version": "7.0.2", - "resolved": "https://registry.npmjs.org/retry-request/-/retry-request-7.0.2.tgz", - "integrity": "sha512-dUOvLMJ0/JJYEn8NrpOaGNE7X3vpI5XlZS/u0ANjqtcZVKnIxP7IgCFwrKTxENw29emmwug53awKtaMm4i9g5w==", - "license": "MIT", - "dependencies": { - "@types/request": "^2.48.8", - "extend": "^3.0.2", - "teeny-request": "^9.0.0" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/rimraf": { - "version": "6.1.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-6.1.2.tgz", - "integrity": "sha512-cFCkPslJv7BAXJsYlK1dZsbP8/ZNLkCAQ0bi1hf5EKX2QHegmDFEFA6QhuYJlk7UDdc+02JjO80YSOrWPpw06g==", - "dev": true, - "license": "BlueOak-1.0.0", - "dependencies": { - "glob": "^13.0.0", - "package-json-from-dist": "^1.0.1" - }, - "bin": { - "rimraf": "dist/esm/bin.mjs" - }, - "engines": { - "node": "20 || >=22" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" - } - }, - "node_modules/rollup": { - "version": "4.57.1", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.57.1.tgz", - "integrity": "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/estree": "1.0.8" - }, - "bin": { - "rollup": "dist/bin/rollup" - }, - "engines": { - "node": ">=18.0.0", - "npm": ">=8.0.0" - }, - "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.57.1", - "@rollup/rollup-android-arm64": "4.57.1", - "@rollup/rollup-darwin-arm64": "4.57.1", - "@rollup/rollup-darwin-x64": "4.57.1", - "@rollup/rollup-freebsd-arm64": "4.57.1", - "@rollup/rollup-freebsd-x64": "4.57.1", - "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", - "@rollup/rollup-linux-arm-musleabihf": "4.57.1", - "@rollup/rollup-linux-arm64-gnu": "4.57.1", - "@rollup/rollup-linux-arm64-musl": "4.57.1", - "@rollup/rollup-linux-loong64-gnu": "4.57.1", - "@rollup/rollup-linux-loong64-musl": "4.57.1", - "@rollup/rollup-linux-ppc64-gnu": "4.57.1", - "@rollup/rollup-linux-ppc64-musl": "4.57.1", - "@rollup/rollup-linux-riscv64-gnu": "4.57.1", - "@rollup/rollup-linux-riscv64-musl": "4.57.1", - "@rollup/rollup-linux-s390x-gnu": "4.57.1", - "@rollup/rollup-linux-x64-gnu": "4.57.1", - "@rollup/rollup-linux-x64-musl": "4.57.1", - "@rollup/rollup-openbsd-x64": "4.57.1", - "@rollup/rollup-openharmony-arm64": "4.57.1", - "@rollup/rollup-win32-arm64-msvc": "4.57.1", - "@rollup/rollup-win32-ia32-msvc": "4.57.1", - "@rollup/rollup-win32-x64-gnu": "4.57.1", - "@rollup/rollup-win32-x64-msvc": "4.57.1", - "fsevents": "~2.3.2" - } - }, - "node_modules/run-applescript": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.1.0.tgz", - "integrity": 
"sha512-DPe5pVFaAsinSaV6QjQ6gdiedWDcRCbUuiQfQa2wmWV7+xC9bGulGI8+TdRmoFkAPaBXk8CrAbnlY2ISniJ47Q==", - "license": "MIT", - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" - }, - "node_modules/safe-stable-stringify": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz", - "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==", - "license": "MIT", - "engines": { - "node": ">=10" - } - }, - "node_modules/safer-buffer": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", - "license": "MIT" - }, - "node_modules/semver": { - "version": "7.7.4", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", - "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", - "license": "ISC", - "bin": { - "semver": "bin/semver.js" - }, - "engines": { - "node": ">=10" - } - }, - "node_modules/seq-queue": { - "version": "0.0.5", - "resolved": "https://registry.npmjs.org/seq-queue/-/seq-queue-0.0.5.tgz", - "integrity": "sha512-hr3Wtp/GZIc/6DAGPDcV4/9WoZhjrkXsi5B/07QgX8tsdc6ilr7BFM6PM6rbdAX1kFSDYeZGLipIZZKyQP0O5Q==" - }, - "node_modules/siginfo": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", - "integrity": "sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==", - "dev": true, - "license": "ISC" - }, - "node_modules/source-map": { - "version": "0.7.6", - "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.7.6.tgz", - "integrity": "sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ==", - "dev": true, - "license": "BSD-3-Clause", - "engines": { - "node": ">= 12" - } - }, - "node_modules/source-map-js": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", - "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", - "dev": true, - "license": "BSD-3-Clause", - "engines": { - "node": ">=0.10.0" - } - }, - "node_modules/split2": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/split2/-/split2-4.2.0.tgz", - "integrity": "sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==", - "license": "ISC", - "engines": { - "node": ">= 10.x" - } - }, - "node_modules/sprintf-js": { - "version": "1.1.3", - "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", - "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==", - "license": "BSD-3-Clause" - }, - "node_modules/sqlstring": { - "version": "2.3.3", - "resolved": 
"https://registry.npmjs.org/sqlstring/-/sqlstring-2.3.3.tgz", - "integrity": "sha512-qC9iz2FlN7DQl3+wjwn3802RTyjCx7sDvfQEXchwa6CWOx07/WVfh91gBmQ9fahw8snwGEWU3xGzOt4tFyHLxg==", - "license": "MIT", - "engines": { - "node": ">= 0.6" - } - }, - "node_modules/stack-trace": { - "version": "0.0.10", - "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", - "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==", - "license": "MIT", - "engines": { - "node": "*" - } - }, - "node_modules/stackback": { - "version": "0.0.2", - "resolved": "https://registry.npmjs.org/stackback/-/stackback-0.0.2.tgz", - "integrity": "sha512-1XMJE5fQo1jGH6Y/7ebnwPOBEkIEnT4QF32d5R1+VXdXveM0IBMJt8zfaxX1P3QhVwrYe+576+jkANtSS2mBbw==", - "dev": true, - "license": "MIT" - }, - "node_modules/std-env": { - "version": "3.10.0", - "resolved": "https://registry.npmjs.org/std-env/-/std-env-3.10.0.tgz", - "integrity": "sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==", - "dev": true, - "license": "MIT" - }, - "node_modules/stream-events": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/stream-events/-/stream-events-1.0.5.tgz", - "integrity": "sha512-E1GUzBSgvct8Jsb3v2X15pjzN1tYebtbLaMg+eBOUOAxgbLoSbT2NS91ckc5lJD1KfLjId+jXJRgo0qnV5Nerg==", - "license": "MIT", - "dependencies": { - "stubs": "^3.0.0" - } - }, - "node_modules/stream-shift": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.3.tgz", - "integrity": "sha512-76ORR0DO1o1hlKwTbi/DM3EXWGf3ZJYO8cXX5RJwnul2DEg2oyoZyjLNoQM8WsvZiFKCRfC1O0J7iCvie3RZmQ==", - "license": "MIT" - }, - "node_modules/string_decoder": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "license": "MIT", - "dependencies": { - "safe-buffer": "~5.2.0" - } - }, - "node_modules/string-width": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", - "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "license": "MIT", - "dependencies": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/strip-ansi": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", - "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "license": "MIT", - "dependencies": { - "ansi-regex": "^5.0.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/strnum": { - "version": "2.1.2", - "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.1.2.tgz", - "integrity": "sha512-l63NF9y/cLROq/yqKXSLtcMeeyOfnSQlfMSlzFt/K73oIaD8DGaQWd7Z34X9GPiKqP5rbSh84Hl4bOlLcjiSrQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/NaturalIntelligence" - } - ], - "license": "MIT" - }, - "node_modules/stubs": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/stubs/-/stubs-3.0.0.tgz", - "integrity": "sha512-PdHt7hHUJKxvTCgbKX9C1V/ftOcjJQgz8BZwNfV5c4B6dcGqlpelTbJ999jBGZ2jYiPAwcX5dP6oBwVlBlUbxw==", - "license": "MIT" - }, - "node_modules/sucrase": { - "version": "3.35.1", - "resolved": 
"https://registry.npmjs.org/sucrase/-/sucrase-3.35.1.tgz", - "integrity": "sha512-DhuTmvZWux4H1UOnWMB3sk0sbaCVOoQZjv8u1rDoTV0HTdGem9hkAZtl4JZy8P2z4Bg0nT+YMeOFyVr4zcG5Tw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@jridgewell/gen-mapping": "^0.3.2", - "commander": "^4.0.0", - "lines-and-columns": "^1.1.6", - "mz": "^2.7.0", - "pirates": "^4.0.1", - "tinyglobby": "^0.2.11", - "ts-interface-checker": "^0.1.9" - }, - "bin": { - "sucrase": "bin/sucrase", - "sucrase-node": "bin/sucrase-node" - }, - "engines": { - "node": ">=16 || 14 >=14.17" - } - }, - "node_modules/supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", - "dev": true, - "license": "MIT", - "dependencies": { - "has-flag": "^4.0.0" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/tagged-tag": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/tagged-tag/-/tagged-tag-1.0.0.tgz", - "integrity": "sha512-yEFYrVhod+hdNyx7g5Bnkkb0G6si8HJurOoOEgC8B/O0uXLHlaey/65KRv6cuWBNhBgHKAROVpc7QyYqE5gFng==", - "license": "MIT", - "engines": { - "node": ">=20" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/tarn": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/tarn/-/tarn-3.0.2.tgz", - "integrity": "sha512-51LAVKUSZSVfI05vjPESNc5vwqqZpbXCsU+/+wxlOrUjk2SnFTt97v9ZgQrD4YmxYW1Px6w2KjaDitCfkvgxMQ==", - "license": "MIT", - "engines": { - "node": ">=8.0.0" - } - }, - "node_modules/tedious": { - "version": "19.2.0", - "resolved": "https://registry.npmjs.org/tedious/-/tedious-19.2.0.tgz", - "integrity": "sha512-2dDjX0KP54riDvJPiiIozv0WRS/giJb3/JG2lWpa2dgM0Gha7mLAxbTR3ltPkGzfoS6M3oDnhYnWuzeaZibHuQ==", - "license": "MIT", - "dependencies": { - "@azure/core-auth": "^1.7.2", - "@azure/identity": "^4.2.1", - "@azure/keyvault-keys": "^4.4.0", - "@js-joda/core": "^5.6.5", - "@types/node": ">=18", - "bl": "^6.1.4", - "iconv-lite": "^0.7.0", - "js-md4": "^0.3.2", - "native-duplexpair": "^1.0.0", - "sprintf-js": "^1.1.3" - }, - "engines": { - "node": ">=18.17" - } - }, - "node_modules/teeny-request": { - "version": "9.0.0", - "resolved": "https://registry.npmjs.org/teeny-request/-/teeny-request-9.0.0.tgz", - "integrity": "sha512-resvxdc6Mgb7YEThw6G6bExlXKkv6+YbuzGg9xuXxSgxJF7Ozs+o8Y9+2R3sArdWdW8nOokoQb1yrpFB0pQK2g==", - "license": "Apache-2.0", - "dependencies": { - "http-proxy-agent": "^5.0.0", - "https-proxy-agent": "^5.0.0", - "node-fetch": "^2.6.9", - "stream-events": "^1.0.5", - "uuid": "^9.0.0" - }, - "engines": { - "node": ">=14" - } - }, - "node_modules/teeny-request/node_modules/agent-base": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", - "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==", - "license": "MIT", - "dependencies": { - "debug": "4" - }, - "engines": { - "node": ">= 6.0.0" - } - }, - "node_modules/teeny-request/node_modules/http-proxy-agent": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", - "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", - "license": "MIT", - "dependencies": { - "@tootallnate/once": "2", - "agent-base": "6", - "debug": "4" - }, - "engines": { - "node": ">= 6" - } - }, - 
"node_modules/teeny-request/node_modules/https-proxy-agent": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", - "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==", - "license": "MIT", - "dependencies": { - "agent-base": "6", - "debug": "4" - }, - "engines": { - "node": ">= 6" - } - }, - "node_modules/teeny-request/node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/text-hex": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", - "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==", - "license": "MIT" - }, - "node_modules/thenify": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/thenify/-/thenify-3.3.1.tgz", - "integrity": "sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==", - "dev": true, - "license": "MIT", - "dependencies": { - "any-promise": "^1.0.0" - } - }, - "node_modules/thenify-all": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/thenify-all/-/thenify-all-1.6.0.tgz", - "integrity": "sha512-RNxQH/qI8/t3thXJDwcstUO4zeqo64+Uy/+sNVRBx4Xn2OX+OZ9oP+iJnNFqplFra2ZUVeKCSa2oVWi3T4uVmA==", - "dev": true, - "license": "MIT", - "dependencies": { - "thenify": ">= 3.1.0 < 4" - }, - "engines": { - "node": ">=0.8" - } - }, - "node_modules/tinybench": { - "version": "2.9.0", - "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", - "integrity": "sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==", - "dev": true, - "license": "MIT" - }, - "node_modules/tinyexec": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/tinyexec/-/tinyexec-0.3.2.tgz", - "integrity": "sha512-KQQR9yN7R5+OSwaK0XQoj22pwHoTlgYqmUscPYoknOoWCWfj/5/ABTMRi69FrKU5ffPVh5QcFikpWJI/P1ocHA==", - "dev": true, - "license": "MIT" - }, - "node_modules/tinyglobby": { - "version": "0.2.15", - "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", - "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "fdir": "^6.5.0", - "picomatch": "^4.0.3" - }, - "engines": { - "node": ">=12.0.0" - }, - "funding": { - "url": "https://github.com/sponsors/SuperchupuDev" - } - }, - "node_modules/tinyrainbow": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.0.3.tgz", - "integrity": "sha512-PSkbLUoxOFRzJYjjxHJt9xro7D+iilgMX/C9lawzVuYiIdcihh9DXmVibBe8lmcFrRi/VzlPjBxbN7rH24q8/Q==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/tr46": { - "version": "0.0.3", - "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", - "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", - "license": "MIT" - }, - "node_modules/tree-kill": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", - "integrity": 
"sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", - "dev": true, - "license": "MIT", - "bin": { - "tree-kill": "cli.js" - } - }, - "node_modules/triple-beam": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", - "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==", - "license": "MIT", - "engines": { - "node": ">= 14.0.0" - } - }, - "node_modules/ts-error": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/ts-error/-/ts-error-1.0.6.tgz", - "integrity": "sha512-tLJxacIQUM82IR7JO1UUkKlYuUTmoY9HBJAmNWFzheSlDS5SPMcNIepejHJa4BpPQLAcbRhRf3GDJzyj6rbKvA==", - "license": "MIT" - }, - "node_modules/ts-interface-checker": { - "version": "0.1.13", - "resolved": "https://registry.npmjs.org/ts-interface-checker/-/ts-interface-checker-0.1.13.tgz", - "integrity": "sha512-Y/arvbn+rrz3JCKl9C4kVNfTfSm2/mEp5FSz5EsZSANGPSlQrpRI5M4PKF+mJnE52jOO90PnPSc3Ur3bTQw0gA==", - "dev": true, - "license": "Apache-2.0" - }, - "node_modules/tslib": { - "version": "2.8.1", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", - "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "license": "0BSD" - }, - "node_modules/tsup": { - "version": "8.5.1", - "resolved": "https://registry.npmjs.org/tsup/-/tsup-8.5.1.tgz", - "integrity": "sha512-xtgkqwdhpKWr3tKPmCkvYmS9xnQK3m3XgxZHwSUjvfTjp7YfXe5tT3GgWi0F2N+ZSMsOeWeZFh7ZZFg5iPhing==", - "dev": true, - "license": "MIT", - "dependencies": { - "bundle-require": "^5.1.0", - "cac": "^6.7.14", - "chokidar": "^4.0.3", - "consola": "^3.4.0", - "debug": "^4.4.0", - "esbuild": "^0.27.0", - "fix-dts-default-cjs-exports": "^1.0.0", - "joycon": "^3.1.1", - "picocolors": "^1.1.1", - "postcss-load-config": "^6.0.1", - "resolve-from": "^5.0.0", - "rollup": "^4.34.8", - "source-map": "^0.7.6", - "sucrase": "^3.35.0", - "tinyexec": "^0.3.2", - "tinyglobby": "^0.2.11", - "tree-kill": "^1.2.2" - }, - "bin": { - "tsup": "dist/cli-default.js", - "tsup-node": "dist/cli-node.js" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@microsoft/api-extractor": "^7.36.0", - "@swc/core": "^1", - "postcss": "^8.4.12", - "typescript": ">=4.5.0" - }, - "peerDependenciesMeta": { - "@microsoft/api-extractor": { - "optional": true - }, - "@swc/core": { - "optional": true - }, - "postcss": { - "optional": true - }, - "typescript": { - "optional": true - } - } - }, - "node_modules/type-fest": { - "version": "5.4.3", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-5.4.3.tgz", - "integrity": "sha512-AXSAQJu79WGc79/3e9/CR77I/KQgeY1AhNvcShIH4PTcGYyC4xv6H4R4AUOwkPS5799KlVDAu8zExeCrkGquiA==", - "license": "(MIT OR CC0-1.0)", - "dependencies": { - "tagged-tag": "^1.0.0" - }, - "engines": { - "node": ">=20" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/typescript": { - "version": "5.9.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", - "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", - "license": "Apache-2.0", - "bin": { - "tsc": "bin/tsc", - "tsserver": "bin/tsserver" - }, - "engines": { - "node": ">=14.17" - } - }, - "node_modules/ufo": { - "version": "1.6.3", - "resolved": "https://registry.npmjs.org/ufo/-/ufo-1.6.3.tgz", - "integrity": 
"sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/undici": { - "version": "6.23.0", - "resolved": "https://registry.npmjs.org/undici/-/undici-6.23.0.tgz", - "integrity": "sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==", - "license": "MIT", - "engines": { - "node": ">=18.17" - } - }, - "node_modules/undici-types": { - "version": "7.16.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", - "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", - "license": "MIT" - }, - "node_modules/util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", - "license": "MIT" - }, - "node_modules/uuid": { - "version": "8.3.2", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-8.3.2.tgz", - "integrity": "sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==", - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/vite": { - "version": "7.3.1", - "resolved": "https://registry.npmjs.org/vite/-/vite-7.3.1.tgz", - "integrity": "sha512-w+N7Hifpc3gRjZ63vYBXA56dvvRlNWRczTdmCBBa+CotUzAPf5b7YMdMR/8CQoeYE5LX3W4wj6RYTgonm1b9DA==", - "dev": true, - "license": "MIT", - "dependencies": { - "esbuild": "^0.27.0", - "fdir": "^6.5.0", - "picomatch": "^4.0.3", - "postcss": "^8.5.6", - "rollup": "^4.43.0", - "tinyglobby": "^0.2.15" - }, - "bin": { - "vite": "bin/vite.js" - }, - "engines": { - "node": "^20.19.0 || >=22.12.0" - }, - "funding": { - "url": "https://github.com/vitejs/vite?sponsor=1" - }, - "optionalDependencies": { - "fsevents": "~2.3.3" - }, - "peerDependencies": { - "@types/node": "^20.19.0 || >=22.12.0", - "jiti": ">=1.21.0", - "less": "^4.0.0", - "lightningcss": "^1.21.0", - "sass": "^1.70.0", - "sass-embedded": "^1.70.0", - "stylus": ">=0.54.8", - "sugarss": "^5.0.0", - "terser": "^5.16.0", - "tsx": "^4.8.1", - "yaml": "^2.4.2" - }, - "peerDependenciesMeta": { - "@types/node": { - "optional": true - }, - "jiti": { - "optional": true - }, - "less": { - "optional": true - }, - "lightningcss": { - "optional": true - }, - "sass": { - "optional": true - }, - "sass-embedded": { - "optional": true - }, - "stylus": { - "optional": true - }, - "sugarss": { - "optional": true - }, - "terser": { - "optional": true - }, - "tsx": { - "optional": true - }, - "yaml": { - "optional": true - } - } - }, - "node_modules/vitest": { - "version": "4.0.18", - "resolved": "https://registry.npmjs.org/vitest/-/vitest-4.0.18.tgz", - "integrity": "sha512-hOQuK7h0FGKgBAas7v0mSAsnvrIgAvWmRFjmzpJ7SwFHH3g1k2u37JtYwOwmEKhK6ZO3v9ggDBBm0La1LCK4uQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/expect": "4.0.18", - "@vitest/mocker": "4.0.18", - "@vitest/pretty-format": "4.0.18", - "@vitest/runner": "4.0.18", - "@vitest/snapshot": "4.0.18", - "@vitest/spy": "4.0.18", - "@vitest/utils": "4.0.18", - "es-module-lexer": "^1.7.0", - "expect-type": "^1.2.2", - "magic-string": "^0.30.21", - "obug": "^2.1.1", - "pathe": "^2.0.3", - "picomatch": "^4.0.3", - "std-env": "^3.10.0", - "tinybench": "^2.9.0", - "tinyexec": "^1.0.2", - "tinyglobby": "^0.2.15", - "tinyrainbow": "^3.0.3", - "vite": "^6.0.0 || ^7.0.0", - "why-is-node-running": "^2.3.0" - 
}, - "bin": { - "vitest": "vitest.mjs" - }, - "engines": { - "node": "^20.0.0 || ^22.0.0 || >=24.0.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - }, - "peerDependencies": { - "@edge-runtime/vm": "*", - "@opentelemetry/api": "^1.9.0", - "@types/node": "^20.0.0 || ^22.0.0 || >=24.0.0", - "@vitest/browser-playwright": "4.0.18", - "@vitest/browser-preview": "4.0.18", - "@vitest/browser-webdriverio": "4.0.18", - "@vitest/ui": "4.0.18", - "happy-dom": "*", - "jsdom": "*" - }, - "peerDependenciesMeta": { - "@edge-runtime/vm": { - "optional": true - }, - "@opentelemetry/api": { - "optional": true - }, - "@types/node": { - "optional": true - }, - "@vitest/browser-playwright": { - "optional": true - }, - "@vitest/browser-preview": { - "optional": true - }, - "@vitest/browser-webdriverio": { - "optional": true - }, - "@vitest/ui": { - "optional": true - }, - "happy-dom": { - "optional": true - }, - "jsdom": { - "optional": true - } - } - }, - "node_modules/vitest/node_modules/tinyexec": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/tinyexec/-/tinyexec-1.0.2.tgz", - "integrity": "sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" - } - }, - "node_modules/weaviate-client": { - "version": "3.11.0", - "resolved": "https://registry.npmjs.org/weaviate-client/-/weaviate-client-3.11.0.tgz", - "integrity": "sha512-pEO+V8OZ84KUKz9ftQnSuooCT4Fdh3SjkDj6FPfxI3Iy6qc+PTTAMFFO0wL2prnf71DIs0tdNw/1RFX4kJkE/w==", - "license": "BSD-3-Clause", - "dependencies": { - "@datastructures-js/deque": "^1.0.8", - "abort-controller-x": "^0.5.0", - "graphql": "^16.12.0", - "graphql-request": "^6.1.0", - "long": "^5.3.2", - "nice-grpc": "^2.1.14", - "nice-grpc-client-middleware-retry": "^3.1.13", - "nice-grpc-common": "^2.0.2", - "uuid": "^9.0.1" - }, - "engines": { - "node": ">=20.0.0" - } - }, - "node_modules/weaviate-client/node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/webidl-conversions": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", - "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", - "license": "BSD-2-Clause" - }, - "node_modules/whatwg-url": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", - "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", - "license": "MIT", - "dependencies": { - "tr46": "~0.0.3", - "webidl-conversions": "^3.0.0" - } - }, - "node_modules/why-is-node-running": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", - "integrity": "sha512-hUrmaWBdVDcxvYqnyh09zunKzROWjbZTiNy8dBEjkS7ehEDQibXJ7XvlmtbwuTclUiIyN+CyXQD4Vmko8fNm8w==", - "dev": true, - "license": "MIT", - "dependencies": { - "siginfo": "^2.0.0", - "stackback": "0.0.2" - }, - "bin": { - "why-is-node-running": "cli.js" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/winston": { - "version": "3.19.0", - "resolved": 
"https://registry.npmjs.org/winston/-/winston-3.19.0.tgz", - "integrity": "sha512-LZNJgPzfKR+/J3cHkxcpHKpKKvGfDZVPS4hfJCc4cCG0CgYzvlD6yE/S3CIL/Yt91ak327YCpiF/0MyeZHEHKA==", - "license": "MIT", - "dependencies": { - "@colors/colors": "^1.6.0", - "@dabh/diagnostics": "^2.0.8", - "async": "^3.2.3", - "is-stream": "^2.0.0", - "logform": "^2.7.0", - "one-time": "^1.0.0", - "readable-stream": "^3.4.0", - "safe-stable-stringify": "^2.3.1", - "stack-trace": "0.0.x", - "triple-beam": "^1.3.0", - "winston-transport": "^4.9.0" - }, - "engines": { - "node": ">= 12.0.0" - } - }, - "node_modules/winston-transport": { - "version": "4.9.0", - "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.9.0.tgz", - "integrity": "sha512-8drMJ4rkgaPo1Me4zD/3WLfI/zPdA9o2IipKODunnGDcuqbHwjsbB79ylv04LCGGzU0xQ6vTznOMpQGaLhhm6A==", - "license": "MIT", - "dependencies": { - "logform": "^2.7.0", - "readable-stream": "^3.6.2", - "triple-beam": "^1.3.0" - }, - "engines": { - "node": ">= 12.0.0" - } - }, - "node_modules/wrap-ansi": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", - "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", - "license": "MIT", - "dependencies": { - "ansi-styles": "^4.0.0", - "string-width": "^4.1.0", - "strip-ansi": "^6.0.0" - }, - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/wrap-ansi?sponsor=1" - } - }, - "node_modules/wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", - "license": "ISC" - }, - "node_modules/ws": { - "version": "8.19.0", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz", - "integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==", - "license": "MIT", - "engines": { - "node": ">=10.0.0" - }, - "peerDependencies": { - "bufferutil": "^4.0.1", - "utf-8-validate": ">=5.0.2" - }, - "peerDependenciesMeta": { - "bufferutil": { - "optional": true - }, - "utf-8-validate": { - "optional": true - } - } - }, - "node_modules/wsl-utils": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz", - "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==", - "license": "MIT", - "dependencies": { - "is-wsl": "^3.1.0" - }, - "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "license": "MIT", - "engines": { - "node": ">=0.4" - } - }, - "node_modules/y18n": { - "version": "5.0.8", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", - "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", - "license": "ISC", - "engines": { - "node": ">=10" - } - }, - "node_modules/yaml": { - "version": "2.8.2", - "resolved": "https://registry.npmjs.org/yaml/-/yaml-2.8.2.tgz", - "integrity": "sha512-mplynKqc1C2hTVYxd0PU2xQAc22TI1vShAYGksCCfxbn/dFwnHTNi1bvYsBTkhdUNtGIf5xNOg938rrSSYvS9A==", - "license": "ISC", - "bin": { - "yaml": "bin.mjs" - }, - "engines": { - "node": 
">= 14.6" - }, - "funding": { - "url": "https://github.com/sponsors/eemeli" - } - }, - "node_modules/yargs": { - "version": "17.7.2", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", - "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", - "license": "MIT", - "dependencies": { - "cliui": "^8.0.1", - "escalade": "^3.1.1", - "get-caller-file": "^2.0.5", - "require-directory": "^2.1.1", - "string-width": "^4.2.3", - "y18n": "^5.0.5", - "yargs-parser": "^21.1.1" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/yargs-parser": { - "version": "21.1.1", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", - "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", - "license": "ISC", - "engines": { - "node": ">=12" - } - }, - "node_modules/yocto-queue": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", - "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/zod": { - "version": "4.3.6", - "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", - "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/colinhacks" - } - }, - "packages/nvisy-core": { - "name": "@nvisy/core", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "magic-bytes.js": "^1.13.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-ai": { - "name": "@nvisy/plugin-ai", - "version": "0.1.0", - "dependencies": { - "@ai-sdk/anthropic": "^3.0.36", - "@ai-sdk/google": "^3.0.20", - "@ai-sdk/openai": "^3.0.25", - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "ai": "^6.0.69", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-core": { - "name": "@nvisy/plugin-core", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "csv-parse": "^6.1.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-markup": { - "name": "@nvisy/plugin-markup", - "version": "0.1.0", - "extraneous": true, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "@nvisy/plugin-core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-nosql": { - "name": "@nvisy/plugin-nosql", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-object": { - "name": "@nvisy/plugin-object", - "version": "0.1.0", - "dependencies": { - "@aws-sdk/client-s3": "^3.750.0", - "@azure/storage-blob": "^12.26.0", - "@google-cloud/storage": "^7.15.0", - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-ocr": { - "name": "@nvisy/plugin-ocr", - "version": "0.1.0", - "extraneous": true, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - 
"packages/nvisy-plugin-pandoc": { - "name": "@nvisy/plugin-pandoc", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-queue": { - "name": "@nvisy/plugin-queue", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-sql": { - "name": "@nvisy/plugin-sql", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "kysely": "^0.28.11", - "mysql2": "^3.16.3", - "pg": "^8.18.0", - "tarn": "^3.0.2", - "tedious": "^19.2.0", - "zod": "^4.3.6" - }, - "devDependencies": { - "@types/pg": "^8.16.0" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-tesseract": { - "name": "@nvisy/plugin-tesseract", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-vector": { - "name": "@nvisy/plugin-vector", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "@pinecone-database/pinecone": "^7.0.0", - "@qdrant/js-client-rest": "^1.13.0", - "@zilliz/milvus2-sdk-node": "^2.5.0", - "pg": "^8.13.0", - "pgvector": "^0.2.0", - "weaviate-client": "^3.5.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-plugin-vector/node_modules/@pinecone-database/pinecone": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/@pinecone-database/pinecone/-/pinecone-7.0.0.tgz", - "integrity": "sha512-/+SzpIJPXhrwv27CCz+sFw/r22Sjk9s+i8nFTryPiQAcwgWmWRWFqT1/LGkTdd9NRhuEA48yaCx/HgM6ugLNJA==", - "license": "Apache-2.0", - "engines": { - "node": ">=20.0.0" - } - }, - "packages/nvisy-runtime": { - "name": "@nvisy/runtime", - "version": "0.1.0", - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "@nvisy/plugin-core": "*", - "effection": "^4.0.2", - "graphology": "^0.26.0", - "graphology-dag": "^0.4.1", - "graphology-types": "^0.24.8", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "packages/nvisy-server": { - "name": "@nvisy/server", - "version": "0.1.0", - "dependencies": { - "@hono/event-emitter": "^2.0.0", - "@hono/node-server": "^1.19.9", - "@hono/node-ws": "^1.3.0", - "@hono/otel": "^1.1.0", - "@hono/zod-openapi": "^1.2.1", - "@hono/zod-validator": "^0.7.6", - "@logtape/hono": "^2.0.2", - "@logtape/logtape": "^2.0.2", - "@logtape/pretty": "^2.0.2", - "@logtape/redaction": "^2.0.2", - "@nvisy/core": "*", - "@nvisy/plugin-ai": "*", - "@nvisy/plugin-nosql": "*", - "@nvisy/plugin-object": "*", - "@nvisy/plugin-pandoc": "*", - "@nvisy/plugin-queue": "*", - "@nvisy/plugin-sql": "*", - "@nvisy/plugin-tesseract": "*", - "@nvisy/plugin-vector": "*", - "@nvisy/runtime": "*", - "@scalar/hono-api-reference": "^0.9.40", - "hono": "^4.11.7", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } - }, - "sdks/nvisy-ts": { - "name": "nvisy", - "version": "0.1.0", - "extraneous": true, - "devDependencies": { - "@biomejs/biome": "^2.3.14", - "tsup": "^8.5.1", - "typescript": "^5.9.3", - "vitest": "^4.0.18" - }, - "engines": { - "node": ">=22.0.0" - } - } - } -} diff --git a/package.json b/package.json deleted file mode 100644 index 3efa9d9..0000000 --- a/package.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": "@nvisy/monorepo", - "private": 
true, - "type": "module", - "engines": { - "node": ">=22.0.0" - }, - "workspaces": [ - "packages/*" - ], - "scripts": { - "build": "npm run build --workspaces", - "test": "vitest", - "test:coverage": "vitest --coverage", - "lint": "biome lint .", - "lint:fix": "biome lint --write .", - "format": "biome format --write .", - "format:check": "biome format .", - "check": "biome check .", - "check:fix": "biome check --write .", - "typecheck": "npm run typecheck --workspaces", - "clean": "npm run clean --workspaces && rimraf node_modules" - }, - "overrides": { - "@zilliz/milvus2-sdk-node": { - "@grpc/grpc-js": "^1.8.22" - } - }, - "devDependencies": { - "@biomejs/biome": "^2.3.14", - "@types/node": "^25.2.0", - "@vitest/coverage-v8": "^4.0.18", - "rimraf": "^6.1.2", - "tsup": "^8.5.1", - "typescript": "^5.9.3", - "vitest": "^4.0.18" - } -} diff --git a/packages/README.md b/packages/README.md deleted file mode 100644 index f375c9f..0000000 --- a/packages/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# Packages - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -## Core infrastructure - -The runtime architecture follows a layered separation of concerns. At the -foundation, a shared core library defines the type system, error taxonomy, and -abstract interfaces that all other packages depend on. Above it, the runtime -engine implements a DAG-based execution model: pipeline definitions are parsed -from declarative JSON graphs, compiled into immutable execution plans, and -evaluated in topological order with structured concurrency, per-node retry -policies, and full lineage tracking across every data item. The server package -exposes this engine over HTTP, providing a REST API for pipeline management, -execution, and observability. - -| Package | Description | -|---------|-------------| -| [`nvisy-core`](nvisy-core/) | Core data types, errors, and utilities | -| [`nvisy-runtime`](nvisy-runtime/) | Graph definition, DAG compiler, execution engine | -| [`nvisy-server`](nvisy-server/) | HTTP execution worker | - -## Provider plugins - -Provider plugins supply the I/O boundary of a pipeline. Each plugin implements -one or more _providers_ — authenticated clients to external systems — and -_streams_ — source or target adapters that read from or write to those systems -using the provider's client. This design decouples credential management from -data flow: a single provider connection can back multiple streams within the -same pipeline, and streams are reusable across providers that share a common -client interface. Provider plugins cover the six major categories of external -storage: relational databases, document stores, object stores, vector -databases, message queues, and AI model endpoints. 
- -| Package | Description | -|---------|-------------| -| [`nvisy-plugin-ai`](nvisy-plugin-ai/) | AI provider integrations (OpenAI, Anthropic, Google) | -| [`nvisy-plugin-nosql`](nvisy-plugin-nosql/) | NoSQL database integrations (MongoDB, DynamoDB, Firestore) | -| [`nvisy-plugin-object`](nvisy-plugin-object/) | Object store integrations (S3, GCS, Azure Blob) | -| [`nvisy-plugin-queue`](nvisy-plugin-queue/) | Message queue integrations (Kafka, RabbitMQ, SQS, Redis Streams) | -| [`nvisy-plugin-sql`](nvisy-plugin-sql/) | SQL database integrations (Postgres, MySQL, MSSQL) | -| [`nvisy-plugin-vector`](nvisy-plugin-vector/) | Vector database integrations (Pinecone, Qdrant, Milvus, Weaviate, pgvector) | - -## Action plugins - -Action plugins operate on data in-flight without requiring external service -credentials. They implement pure transformations: a function from one typed data -item to another, executed locally within the pipeline process. This category -includes format conversion, structured parsing, and content extraction. Because -actions carry no provider dependency, they compose freely between any source and -target and introduce no additional authentication surface. The runtime -guarantees type safety at the graph edges — an action's input and output types -must match the adjacent nodes in the DAG. - -| Package | Description | -|---------|-------------| -| [`nvisy-plugin-core`](nvisy-plugin-core/) | Built-in chunk/partition actions, plaintext/CSV/JSON loaders, core datatype registration | -| [`nvisy-plugin-tesseract`](nvisy-plugin-tesseract/) | Optical character recognition (Tesseract) | -| [`nvisy-plugin-pandoc`](nvisy-plugin-pandoc/) | Document format conversion (Pandoc) | diff --git a/packages/nvisy-ai/pyproject.toml b/packages/nvisy-ai/pyproject.toml new file mode 100644 index 0000000..0065a46 --- /dev/null +++ b/packages/nvisy-ai/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "nvisy-ai" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "openai>=1.0", + "anthropic>=0.30", + "google-generativeai>=0.7", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.backends" + +[tool.hatch.build.targets.wheel] +packages = ["src/nvisy_ai"] diff --git a/packages/nvisy-ai/src/nvisy_ai/__init__.py b/packages/nvisy-ai/src/nvisy_ai/__init__.py new file mode 100644 index 0000000..c1bf9cd --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/__init__.py @@ -0,0 +1,5 @@ +"""Nvisy AI NER detection package.""" + +from nvisy_ai.ner import detect_ner, detect_ner_image + +__all__ = ["detect_ner", "detect_ner_image"] diff --git a/packages/nvisy-ai/src/nvisy_ai/ner.py b/packages/nvisy-ai/src/nvisy_ai/ner.py new file mode 100644 index 0000000..3e4076c --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/ner.py @@ -0,0 +1,145 @@ +"""NER detection functions called from Rust via PyO3.""" + +import json +from typing import Optional + +from .prompts import NER_SYSTEM_PROMPT, NER_IMAGE_SYSTEM_PROMPT +from .providers.base import CompletionClient + + +def _get_client(provider: str, api_key: str, model: str) -> CompletionClient: + """Create a completion client for the given provider.""" + if provider == "openai": + from .providers.openai import OpenAIClient + return OpenAIClient(api_key=api_key, model=model) + elif provider == "anthropic": + from .providers.anthropic import AnthropicClient + return AnthropicClient(api_key=api_key, model=model) + elif provider == "gemini": + from .providers.gemini import GeminiClient + return GeminiClient(api_key=api_key, model=model) + else: + 
raise ValueError(f"Unknown provider: {provider}") + + +def _parse_entities(response_text: str) -> list[dict]: + """Parse the JSON response from the LLM into entity dicts.""" + text = response_text.strip() + # Strip markdown code fences if present + if text.startswith("```"): + lines = text.split("\n") + lines = [l for l in lines if not l.startswith("```")] + text = "\n".join(lines) + + try: + entities = json.loads(text) + except json.JSONDecodeError: + return [] + + if not isinstance(entities, list): + return [] + + return entities + + +def detect_ner( + text: str, + entity_types: Optional[list[str]] = None, + confidence_threshold: float = 0.5, + temperature: float = 0.0, + api_key: str = "", + model: str = "gpt-4", + provider: str = "openai", +) -> list[dict]: + """Detect named entities in text using an LLM. + + Called from Rust via PyO3. + + Args: + text: The text to analyze. + entity_types: Optional list of entity types to detect. + confidence_threshold: Minimum confidence to include. + temperature: LLM temperature parameter. + api_key: API key for the provider. + model: Model name to use. + provider: Provider name ("openai", "anthropic", "gemini"). + + Returns: + List of entity dicts with keys: category, entity_type, value, + confidence, start_offset, end_offset. + """ + import asyncio + + client = _get_client(provider, api_key, model) + + user_prompt = f"Analyze the following text for sensitive data:\n\n{text}" + if entity_types: + user_prompt += f"\n\nOnly detect these entity types: {', '.join(entity_types)}" + + loop = asyncio.new_event_loop() + try: + response = loop.run_until_complete( + client.complete(NER_SYSTEM_PROMPT, user_prompt, temperature) + ) + finally: + loop.close() + + entities = _parse_entities(response) + + # Filter by confidence threshold + return [ + e for e in entities + if e.get("confidence", 0) >= confidence_threshold + ] + + +def detect_ner_image( + image_bytes: bytes, + mime_type: str, + entity_types: Optional[list[str]] = None, + confidence_threshold: float = 0.5, + temperature: float = 0.0, + api_key: str = "", + model: str = "gpt-4", + provider: str = "openai", +) -> list[dict]: + """Detect named entities in an image using a multimodal LLM. + + Called from Rust via PyO3. + + Args: + image_bytes: Raw image bytes. + mime_type: MIME type of the image. + entity_types: Optional list of entity types to detect. + confidence_threshold: Minimum confidence to include. + api_key: API key for the provider. + model: Model name to use. + provider: Provider name ("openai", "anthropic", "gemini"). + + Returns: + List of entity dicts. + """ + import asyncio + + client = _get_client(provider, api_key, model) + + user_prompt = "Analyze this image for any visible sensitive data." 
+ if entity_types: + user_prompt += f"\n\nOnly detect these entity types: {', '.join(entity_types)}" + + loop = asyncio.new_event_loop() + try: + response = loop.run_until_complete( + client.complete_with_image( + NER_IMAGE_SYSTEM_PROMPT, image_bytes, mime_type, user_prompt, temperature + ) + ) + finally: + loop.close() + + entities = _parse_entities(response) + + return [ + e for e in entities + if e.get("confidence", 0) >= confidence_threshold + ] diff --git a/packages/nvisy-ai/src/nvisy_ai/prompts.py b/packages/nvisy-ai/src/nvisy_ai/prompts.py new file mode 100644 index 0000000..fc608cc --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/prompts.py @@ -0,0 +1,37 @@ +"""NER system prompts for AI-based entity detection.""" + +NER_SYSTEM_PROMPT = """You are a Named Entity Recognition (NER) system specialized in detecting sensitive data. + +Given text, identify all instances of sensitive data including: +- PII: names, addresses, dates of birth, Social Security numbers, phone numbers, email addresses +- PHI: medical record numbers, health plan IDs, diagnoses, medications +- Financial: credit card numbers, bank account numbers, tax IDs +- Credentials: API keys, passwords, tokens, secret keys + +For each entity found, provide: +1. category: one of "pii", "phi", "financial", "credentials", "custom" +2. entity_type: specific type (e.g. "name", "ssn", "email", "credit_card") +3. value: the exact text matched +4. confidence: float 0-1 indicating detection confidence +5. start_offset: character offset where entity starts in the text +6. end_offset: character offset where entity ends in the text + +Return a JSON array of objects. If no entities found, return []. +Only return the JSON array, no additional text.""" + +NER_IMAGE_SYSTEM_PROMPT = """You are a Named Entity Recognition (NER) system that analyzes images for sensitive data. + +Examine the provided image and identify any visible sensitive data including: +- PII: names, addresses, dates of birth, Social Security numbers, phone numbers, email addresses +- PHI: medical record numbers, health plan IDs, diagnoses, medications +- Financial: credit card numbers, bank account numbers, tax IDs +- Credentials: API keys, passwords, tokens, secret keys + +For each entity found, provide: +1. category: one of "pii", "phi", "financial", "credentials", "custom" +2. entity_type: specific type (e.g. "name", "ssn", "email", "credit_card") +3. value: the exact text detected +4. confidence: float 0-1 indicating detection confidence + +Return a JSON array of objects. If no entities found, return []. 
+Only return the JSON array, no additional text.""" diff --git a/packages/nvisy-ai/src/nvisy_ai/providers/__init__.py b/packages/nvisy-ai/src/nvisy_ai/providers/__init__.py new file mode 100644 index 0000000..f40c051 --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/providers/__init__.py @@ -0,0 +1 @@ +"""AI provider implementations.""" diff --git a/packages/nvisy-ai/src/nvisy_ai/providers/anthropic.py b/packages/nvisy-ai/src/nvisy_ai/providers/anthropic.py new file mode 100644 index 0000000..0c741c5 --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/providers/anthropic.py @@ -0,0 +1,61 @@ +"""Anthropic completion provider.""" + +import base64 +from anthropic import Anthropic +from .base import CompletionClient + + +class AnthropicClient(CompletionClient): + """Anthropic-based completion client.""" + + def __init__(self, api_key: str, model: str = "claude-sonnet-4-5-20250929"): + self._client = Anthropic(api_key=api_key) + self._model = model + + async def complete( + self, + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + response = self._client.messages.create( + model=self._model, + max_tokens=4096, + temperature=temperature, + system=system_prompt, + messages=[{"role": "user", "content": user_prompt}], + ) + return response.content[0].text if response.content else "" + + async def complete_with_image( + self, + system_prompt: str, + image_bytes: bytes, + mime_type: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + b64 = base64.b64encode(image_bytes).decode("utf-8") + response = self._client.messages.create( + model=self._model, + max_tokens=4096, + temperature=temperature, + system=system_prompt, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": mime_type, + "data": b64, + }, + }, + {"type": "text", "text": user_prompt}, + ], + }, + ], + ) + return response.content[0].text if response.content else "" diff --git a/packages/nvisy-ai/src/nvisy_ai/providers/base.py b/packages/nvisy-ai/src/nvisy_ai/providers/base.py new file mode 100644 index 0000000..51021ec --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/providers/base.py @@ -0,0 +1,30 @@ +"""Abstract base class for AI completion providers.""" + +from abc import ABC, abstractmethod +from typing import Any + + +class CompletionClient(ABC): + """Abstract completion client for LLM providers.""" + + @abstractmethod + async def complete( + self, + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + """Send a completion request and return the response text.""" + ... + + @abstractmethod + async def complete_with_image( + self, + system_prompt: str, + image_bytes: bytes, + mime_type: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + """Send a multimodal completion request with an image.""" + ... 
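
For reference, a new provider module only needs to subclass `CompletionClient` and implement the two async methods defined in `providers/base.py` above. The sketch below is illustrative only and uses a hypothetical `EchoClient` stub that is not part of this patch; the real implementations in `openai.py`, `anthropic.py`, and `gemini.py` follow the same shape.

```python
from nvisy_ai.providers.base import CompletionClient


class EchoClient(CompletionClient):
    """Hypothetical stub provider that always reports 'no entities found'."""

    def __init__(self, api_key: str = "", model: str = "echo-1"):
        # A real provider would construct its SDK client from the credentials here.
        self._model = model

    async def complete(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.0,
    ) -> str:
        # Return an empty JSON array, the shape the NER prompts ask the model for.
        return "[]"

    async def complete_with_image(
        self,
        system_prompt: str,
        image_bytes: bytes,
        mime_type: str,
        user_prompt: str,
        temperature: float = 0.0,
    ) -> str:
        return "[]"
```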
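
A usage sketch for the Rust-facing entry point follows as well. It assumes a valid OpenAI API key (the key and sample text are placeholders) and performs a real completion request; the returned dicts follow the field layout that `NER_SYSTEM_PROMPT` asks the model to emit.

```python
from nvisy_ai import detect_ner

entities = detect_ner(
    text="Contact Jane Doe at jane.doe@example.com.",
    entity_types=["name", "email"],  # restrict detection to these types
    confidence_threshold=0.7,        # drop low-confidence matches
    api_key="sk-...",                # placeholder credential
    model="gpt-4",
    provider="openai",
)

for entity in entities:
    # e.g. {"category": "pii", "entity_type": "email",
    #       "value": "jane.doe@example.com", "confidence": 0.95,
    #       "start_offset": 20, "end_offset": 40}
    print(entity["entity_type"], entity["value"], entity["confidence"])
```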
diff --git a/packages/nvisy-ai/src/nvisy_ai/providers/gemini.py b/packages/nvisy-ai/src/nvisy_ai/providers/gemini.py new file mode 100644 index 0000000..ac0a8c5 --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/providers/gemini.py @@ -0,0 +1,39 @@ +"""Google Gemini completion provider.""" + +import google.generativeai as genai +from .base import CompletionClient + + +class GeminiClient(CompletionClient): + """Google Gemini-based completion client.""" + + def __init__(self, api_key: str, model: str = "gemini-1.5-pro"): + genai.configure(api_key=api_key) + self._model = genai.GenerativeModel(model) + + async def complete( + self, + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + response = self._model.generate_content( + f"{system_prompt}\n\n{user_prompt}", + generation_config=genai.types.GenerationConfig(temperature=temperature), + ) + return response.text or "" + + async def complete_with_image( + self, + system_prompt: str, + image_bytes: bytes, + mime_type: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + image_part = {"mime_type": mime_type, "data": image_bytes} + response = self._model.generate_content( + [f"{system_prompt}\n\n{user_prompt}", image_part], + generation_config=genai.types.GenerationConfig(temperature=temperature), + ) + return response.text or "" diff --git a/packages/nvisy-ai/src/nvisy_ai/providers/openai.py b/packages/nvisy-ai/src/nvisy_ai/providers/openai.py new file mode 100644 index 0000000..60bd501 --- /dev/null +++ b/packages/nvisy-ai/src/nvisy_ai/providers/openai.py @@ -0,0 +1,59 @@ +"""OpenAI completion provider.""" + +import base64 +from openai import OpenAI +from .base import CompletionClient + + +class OpenAIClient(CompletionClient): + """OpenAI-based completion client.""" + + def __init__(self, api_key: str, model: str = "gpt-4"): + self._client = OpenAI(api_key=api_key) + self._model = model + + async def complete( + self, + system_prompt: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + response = self._client.chat.completions.create( + model=self._model, + temperature=temperature, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + ) + return response.choices[0].message.content or "" + + async def complete_with_image( + self, + system_prompt: str, + image_bytes: bytes, + mime_type: str, + user_prompt: str, + temperature: float = 0.0, + ) -> str: + b64 = base64.b64encode(image_bytes).decode("utf-8") + response = self._client.chat.completions.create( + model=self._model, + temperature=temperature, + messages=[ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{mime_type};base64,{b64}", + }, + }, + {"type": "text", "text": user_prompt}, + ], + }, + ], + ) + return response.choices[0].message.content or "" diff --git a/packages/nvisy-core/README.md b/packages/nvisy-core/README.md deleted file mode 100644 index 02407e7..0000000 --- a/packages/nvisy-core/README.md +++ /dev/null @@ -1,186 +0,0 @@ -# @nvisy/core - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -Core primitives and abstractions for the Nvisy runtime platform. 
- -## Features - -- **Data types**: `Document`, `Chunk`, `Embedding`, and `Blob` for pipeline data, with lineage tracking via `Data` base class -- **Document model**: structured `Element` hierarchy with typed subclasses (`ImageElement`, `TableElement`, `FormElement`, `EmailElement`, `CompositeElement`) and provenance metadata -- **Plugin system**: bundle providers, streams, actions, loaders, and custom datatypes under a namespace -- **Provider abstraction**: connection lifecycle management with credential validation -- **Stream contracts**: resumable sources and sinks for external systems -- **Action contracts**: stream transforms with optional client dependencies -- **Loader contracts**: `Blob` to `Document` transforms with file extension and MIME type matching -- **Error taxonomy**: `RuntimeError`, `ValidationError`, `ConnectionError`, `CancellationError`, `TimeoutError` - -## Overview - -This package defines the foundational abstractions that all Nvisy plugins implement: - -- **Data types** (`Data`, `Document`, `Chunk`, `Embedding`, `Blob`): immutable data containers that flow through pipelines. All extend `Data`, which provides `id`, `parentId`, `metadata`, and lineage methods (`deriveFrom`, `withParent`). -- **Elements** (`Element`, `ImageElement`, `TableElement`, etc.): structured content within documents, with typed subclasses for images, tables, forms, and emails. Includes provenance metadata, coordinate systems, and an element type ontology. -- **Plugins** (`Plugin.define`): namespace for grouping providers, streams, actions, loaders, and custom datatypes. -- **Providers** (`Provider.withAuthentication`, `Provider.withoutAuthentication`): external client lifecycle management. -- **Streams** (`Stream.createSource`, `Stream.createTarget`): data I/O layer for reading from and writing to external systems. -- **Actions** (`Action.withClient`, `Action.withoutClient`): stream transforms that process data between sources and targets. -- **Loaders** (`Loader.define`): specialized transforms that convert `Blob` objects into `Document` instances, matched by file extension and MIME type. 
- -## Usage - -### Defining a Provider - -```ts -import { Provider } from "@nvisy/core"; -import { z } from "zod"; - -const credentialSchema = z.object({ - apiKey: z.string(), - endpoint: z.string().url(), -}); - -const myProvider = Provider.withAuthentication("my-provider", { - credentials: credentialSchema, - connect: async (creds) => { - const client = await createClient(creds); - return { - client, - disconnect: () => client.close(), - }; - }, -}); -``` - -### Defining a Stream Source - -```ts -import { Stream, Document } from "@nvisy/core"; -import { z } from "zod"; - -const contextSchema = z.object({ cursor: z.string().optional() }); -const sourceParamSchema = z.object({ limit: z.number() }); - -const mySource = Stream.createSource("my-source", MyClient, { - type: Document, context: contextSchema, params: sourceParamSchema, - reader: async function* (client, ctx, params) { - for await (const item of client.list({ cursor: ctx.cursor, limit: params.limit })) { - yield { data: new Document(item.text), context: { cursor: item.id } }; - } - }, -}); -``` - -### Defining a Stream Target - -```ts -import { Stream, Embedding } from "@nvisy/core"; -import { z } from "zod"; - -const targetParamSchema = z.object({ collection: z.string() }); - -const myTarget = Stream.createTarget("my-target", MyClient, { - type: Embedding, params: targetParamSchema, - writer: (client, params) => async (item) => { - await client.insert(params.collection, item); - }, -}); -``` - -### Defining an Action - -```ts -import { Action, Document, Chunk } from "@nvisy/core"; -import { z } from "zod"; - -const chunkerParamSchema = z.object({ maxLength: z.number() }); - -const myChunker = Action.withoutClient("my-chunker", { - types: [Document, Chunk], - params: chunkerParamSchema, - transform: async function* (stream, params) { - for await (const doc of stream) { - for (let i = 0; i < doc.content.length; i += params.maxLength) { - yield new Chunk(doc.content.slice(i, i + params.maxLength)).deriveFrom(doc); - } - } - }, -}); -``` - -### Defining a Loader - -```ts -import { Loader, Document } from "@nvisy/core"; -import { z } from "zod"; - -const loaderParamSchema = z.object({ - encoding: z.enum(["utf-8", "ascii"]).default("utf-8"), -}); - -const myLoader = Loader.define("markdown", { - extensions: [".md", ".markdown"], - contentTypes: ["text/markdown"], - params: loaderParamSchema, - load: async function* (blob, params) { - const text = blob.data.toString(params.encoding); - yield new Document(text).deriveFrom(blob); - }, -}); -``` - -### Defining a Datatype - -Custom data types extend the `Data` base class and are registered with `Datatype.define`. All `Data` subclasses get a unique `id`, optional `metadata`, and lineage tracking via `deriveFrom` / `withParent`. 
- -```ts -import { Data, Datatype } from "@nvisy/core"; - -class Audio extends Data { - readonly #duration: number; - readonly #sampleRate: number; - - constructor(duration: number, sampleRate: number) { - super(); - this.#duration = duration; - this.#sampleRate = sampleRate; - } - - get duration(): number { - return this.#duration; - } - - get sampleRate(): number { - return this.#sampleRate; - } -} - -const audioDatatype = Datatype.define("audio", Audio); -``` - -### Bundling into a Plugin - -```ts -import { Plugin, Datatype, Document, Chunk } from "@nvisy/core"; - -const myPlugin = Plugin.define("my-plugin") - .withDatatypes(audioDatatype) - .withProviders(myProvider) - .withStreams(mySource, myTarget) - .withActions(myChunker) - .withLoaders(myLoader); -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. - -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-core/package.json b/packages/nvisy-core/package.json deleted file mode 100644 index bc36220..0000000 --- a/packages/nvisy-core/package.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "@nvisy/core", - "version": "0.1.0", - "description": "Core data types, errors, and utilities for the Nvisy runtime", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "magic-bytes.js": "^1.13.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-core/src/action.ts b/packages/nvisy-core/src/action.ts deleted file mode 100644 index 1f3fdfc..0000000 --- a/packages/nvisy-core/src/action.ts +++ /dev/null @@ -1,246 +0,0 @@ -/** - * Action factory and type definitions for stream transforms. - * - * Actions are the intermediate processing steps in a pipeline, - * transforming data between sources and targets. Use - * {@link Action.withoutClient} for rule-based transforms and - * {@link Action.withClient} for transforms that require a provider. - * - * @module - */ - -import type { z } from "zod"; -import type { Data } from "./datatypes/data.js"; -import type { ClassRef } from "./types.js"; - -/** - * Stream transform that operates without a provider client. - * - * @template TIn - Input data type consumed by the transform. - * @template TOut - Output data type produced by the transform. - * @template TParam - Configuration parameters for the transform. - */ -export type ClientlessTransformFn< - TIn extends Data, - TOut extends Data, - TParam, -> = (stream: AsyncIterable<TIn>, params: TParam) => AsyncIterable<TOut>; - -/** - * Stream transform that requires a provider client. - * - * @template TClient - Provider client type (e.g. database connection). - * @template TIn - Input data type consumed by the transform. - * @template TOut - Output data type produced by the transform. - * @template TParam - Configuration parameters for the transform. 
- */ -export type ClientTransformFn< - TClient, - TIn extends Data, - TOut extends Data, - TParam, -> = ( - stream: AsyncIterable<TIn>, - params: TParam, - client: TClient, -) => AsyncIterable<TOut>; - -/** - * Configuration for creating an action that does not require a provider client. - * - * @template TIn - Input data type consumed by the action. - * @template TOut - Output data type produced by the action. - * @template TParam - Configuration parameters for the action. - */ -export interface ClientlessActionConfig< - TIn extends Data, - TOut extends Data, - TParam, -> { - /** Input/output type classes. Single-element array means input equals output. */ - readonly types: - | [inputClass: ClassRef<TIn>, outputClass: ClassRef<TOut>] - | [inputClass: ClassRef<TIn>]; - /** Zod schema for validating action parameters. */ - readonly params: z.ZodType<TParam>; - /** The transform function that processes the stream. */ - readonly transform: ClientlessTransformFn<TIn, TOut, TParam>; -} - -/** - * Configuration for creating an action that requires a provider client. - * - * @template TClient - Provider client type required by the action. - * @template TIn - Input data type consumed by the action. - * @template TOut - Output data type produced by the action. - * @template TParam - Configuration parameters for the action. - */ -export interface ClientActionConfig< - TClient, - TIn extends Data, - TOut extends Data, - TParam, -> { - /** Input/output type classes. Single-element array means input equals output. */ - readonly types: - | [inputClass: ClassRef<TIn>, outputClass: ClassRef<TOut>] - | [inputClass: ClassRef<TIn>]; - /** Zod schema for validating action parameters. */ - readonly params: z.ZodType<TParam>; - /** The transform function that processes the stream with client access. */ - readonly transform: ClientTransformFn<TClient, TIn, TOut, TParam>; -} - -/** - * A registered action instance that can transform data streams. - * - * Actions are the intermediate processing steps in a pipeline, - * transforming data between sources and targets. - * - * @template TClient - Provider client type (void if no client needed). - * @template TIn - Input data type consumed by the action. - * @template TOut - Output data type produced by the action. - * @template TParam - Configuration parameters for the action. - */ -export interface ActionInstance< - TClient = void, - TIn extends Data = Data, - TOut extends Data = Data, - TParam = unknown, -> { - /** Unique identifier for this action. */ - readonly id: string; - /** Client class required by this action (undefined if clientless). */ - readonly clientClass?: ClassRef<TClient>; - /** Class reference for validating input data type. */ - readonly inputClass: ClassRef<TIn>; - /** Class reference for validating output data type. */ - readonly outputClass: ClassRef<TOut>; - /** Zod schema for validating action parameters. */ - readonly schema: z.ZodType<TParam>; - /** Transform an input stream into an output stream. 
*/ - pipe( - stream: AsyncIterable<TIn>, - params: TParam, - client: TClient, - ): AsyncIterable<TOut>; -} - -class ActionImpl<TClient, TIn extends Data, TOut extends Data, TParam> - implements ActionInstance<TClient, TIn, TOut, TParam> -{ - readonly id: string; - readonly clientClass?: ClassRef<TClient>; - readonly inputClass: ClassRef<TIn>; - readonly outputClass: ClassRef<TOut>; - readonly schema: z.ZodType<TParam>; - readonly #transform: ClientTransformFn<TClient, TIn, TOut, TParam>; - - constructor(config: { - id: string; - clientClass?: ClassRef<TClient>; - inputClass: ClassRef<TIn>; - outputClass: ClassRef<TOut>; - schema: z.ZodType<TParam>; - transform: ClientTransformFn<TClient, TIn, TOut, TParam>; - }) { - this.id = config.id; - if (config.clientClass) this.clientClass = config.clientClass; - this.inputClass = config.inputClass; - this.outputClass = config.outputClass; - this.schema = config.schema; - this.#transform = config.transform; - } - - pipe( - stream: AsyncIterable<TIn>, - params: TParam, - client: TClient, - ): AsyncIterable<TOut> { - return this.#transform(stream, params, client); - } -} - -/** Factory for creating action instances. */ -export const Action: { - /** - * Create an action that does not require a provider client. - * - * @param id - Unique identifier for the action. - * @param config - Action configuration including types and transform. - */ - withoutClient<TIn extends Data, TOut extends Data, TParam>( - id: string, - config: ClientlessActionConfig<TIn, TOut, TParam>, - ): ActionInstance<void, TIn, TOut, TParam>; - withoutClient<TIn extends Data, TParam>( - id: string, - config: ClientlessActionConfig<TIn, TIn, TParam>, - ): ActionInstance<void, TIn, TIn, TParam>; - - /** - * Create an action that requires a provider client. - * - * @param id - Unique identifier for the action. - * @param clientClass - Class reference for the required provider client. - * @param config - Action configuration including types and transform. - */ - withClient<TClient, TIn extends Data, TOut extends Data, TParam>( - id: string, - clientClass: ClassRef<TClient>, - config: ClientActionConfig<TClient, TIn, TOut, TParam>, - ): ActionInstance<TClient, TIn, TOut, TParam>; - withClient<TClient, TIn extends Data, TParam>( - id: string, - clientClass: ClassRef<TClient>, - config: ClientActionConfig<TClient, TIn, TIn, TParam>, - ): ActionInstance<TClient, TIn, TIn, TParam>; -} = { - withoutClient( - id: string, - config: { - types: [ClassRef<Data>] | [ClassRef<Data>, ClassRef<Data>]; - params: z.ZodType<unknown>; - transform: (...args: never[]) => AsyncIterable<Data>; - }, - ): ActionInstance<void, Data, Data, unknown> { - const [inputClass, outputClass] = config.types; - return new ActionImpl({ - id, - inputClass, - outputClass: outputClass ?? inputClass, - schema: config.params, - transform: (stream, params, _client) => - ( - config.transform as ( - stream: AsyncIterable<Data>, - params: unknown, - ) => AsyncIterable<Data> - )(stream, params), - }); - }, - - withClient( - id: string, - clientClass: ClassRef<unknown>, - config: { - types: [ClassRef<Data>] | [ClassRef<Data>, ClassRef<Data>]; - params: z.ZodType<unknown>; - transform: (...args: never[]) => AsyncIterable<Data>; - }, - ): ActionInstance<unknown, Data, Data, unknown> { - const [inputClass, outputClass] = config.types; - return new ActionImpl({ - id, - clientClass, - inputClass, - outputClass: outputClass ?? 
inputClass, - schema: config.params, - transform: config.transform as ( - stream: AsyncIterable<Data>, - params: unknown, - client: unknown, - ) => AsyncIterable<Data>, - }); - }, -}; diff --git a/packages/nvisy-core/src/datatypes/blob.test.ts b/packages/nvisy-core/src/datatypes/blob.test.ts deleted file mode 100644 index 85ceef4..0000000 --- a/packages/nvisy-core/src/datatypes/blob.test.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { Blob } from "./blob.js"; - -describe("Blob", () => { - it("stores path and data", () => { - const data = Buffer.from("hello world"); - const blob = new Blob("uploads/file.txt", data); - expect(blob.path).toBe("uploads/file.txt"); - expect(blob.data).toBe(data); - expect(blob.data.toString()).toBe("hello world"); - }); - - it("provided.mime is undefined when no contentType given", () => { - const blob = new Blob("file.bin", Buffer.from([0x00, 0x01])); - expect(blob.provided.mime).toBeUndefined(); - }); - - it("provided.mime reflects constructor contentType", () => { - const blob = new Blob("report.pdf", Buffer.from("pdf content"), { - contentType: "application/pdf", - }); - expect(blob.provided.mime).toBe("application/pdf"); - }); - - it("size returns byte length of data", () => { - const blob = new Blob("file.txt", Buffer.from("abc")); - expect(blob.size).toBe(3); - }); - - it("size handles empty buffer", () => { - const blob = new Blob("empty.bin", Buffer.alloc(0)); - expect(blob.size).toBe(0); - }); - - it("size handles binary data correctly", () => { - const binaryData = Buffer.from([0x00, 0xff, 0x10, 0x20, 0x30]); - const blob = new Blob("binary.bin", binaryData); - expect(blob.size).toBe(5); - }); - - it("extends Data and has id, parentId, metadata", () => { - const blob = new Blob("file.txt", Buffer.from("content")); - expect(blob.id).toMatch( - /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/, - ); - expect(blob.parentId).toBeNull(); - expect(blob.metadata).toBeNull(); - }); - - it("supports deriveFrom for lineage", () => { - const parent = new Blob("parent.txt", Buffer.from("parent")); - const child = new Blob("child.txt", Buffer.from("child")).deriveFrom( - parent, - ); - expect(child.parentId).toBe(parent.id); - expect(child.isDerived).toBe(true); - }); - - it("supports withMetadata", () => { - const blob = new Blob("file.txt", Buffer.from("content")).withMetadata({ - source: "s3", - bucket: "my-bucket", - }); - expect(blob.metadata).toEqual({ source: "s3", bucket: "my-bucket" }); - }); - - describe("createdAt / updatedAt", () => { - it("defaults to undefined when not provided", () => { - const blob = new Blob("file.txt", Buffer.from("")); - expect(blob.createdAt).toBeUndefined(); - expect(blob.updatedAt).toBeUndefined(); - }); - - it("stores and returns the dates when provided", () => { - const created = new Date("2025-01-01T00:00:00Z"); - const updated = new Date("2025-06-15T12:00:00Z"); - const blob = new Blob("file.txt", Buffer.from(""), { - createdAt: created, - updatedAt: updated, - }); - expect(blob.createdAt).toBe(created); - expect(blob.updatedAt).toBe(updated); - }); - }); - - describe("provided", () => { - it("extracts extension from path", () => { - const blob = new Blob("report.pdf", Buffer.from("")); - expect(blob.provided.extension).toBe(".pdf"); - }); - - it("includes mime from contentType", () => { - const blob = new Blob("report.pdf", Buffer.from(""), { - contentType: "application/pdf", - }); - expect(blob.provided.mime).toBe("application/pdf"); - }); - - it("omits 
extension for extensionless path", () => { - const blob = new Blob("Makefile", Buffer.from("")); - expect(blob.provided.extension).toBeUndefined(); - }); - - it("lowercases the extension", () => { - const blob = new Blob("photo.JPG", Buffer.from("")); - expect(blob.provided.extension).toBe(".jpg"); - }); - - it("handles paths with multiple dots", () => { - const blob = new Blob("archive.tar.gz", Buffer.from("")); - expect(blob.provided.extension).toBe(".gz"); - }); - }); - - describe("identified", () => { - it("detects PDF from magic bytes", () => { - const pdfHeader = Buffer.from("%PDF-1.4 ..."); - const blob = new Blob("mystery.bin", pdfHeader); - expect(blob.identified.extension).toBe(".pdf"); - expect(blob.identified.mime).toBe("application/pdf"); - }); - - it("returns empty filetype for unrecognizable bytes (e.g. CSV)", () => { - const blob = new Blob("data.csv", Buffer.from("a,b\n1,2")); - expect(blob.identified.extension).toBeUndefined(); - expect(blob.identified.mime).toBeUndefined(); - }); - }); - - it("handles various path formats", () => { - const s3Blob = new Blob("s3://bucket/key/file.pdf", Buffer.from("")); - expect(s3Blob.path).toBe("s3://bucket/key/file.pdf"); - - const gcsBlob = new Blob("gs://bucket/object", Buffer.from("")); - expect(gcsBlob.path).toBe("gs://bucket/object"); - - const localBlob = new Blob("/var/data/file.txt", Buffer.from("")); - expect(localBlob.path).toBe("/var/data/file.txt"); - }); -}); diff --git a/packages/nvisy-core/src/datatypes/blob.ts b/packages/nvisy-core/src/datatypes/blob.ts deleted file mode 100644 index be03d4c..0000000 --- a/packages/nvisy-core/src/datatypes/blob.ts +++ /dev/null @@ -1,127 +0,0 @@ -/** - * Binary blob data type for files from object storage. - * - * @module - */ - -import { filetypeinfo } from "magic-bytes.js"; -import { Data } from "./data.js"; - -/** Extension and MIME type pair describing a file type. */ -export interface Filetype { - /** File extension including the dot (e.g. `".pdf"`). */ - readonly extension?: string; - /** MIME type (e.g. `"application/pdf"`). */ - readonly mime?: string; -} - -/** Options for constructing a {@link Blob}. */ -export interface BlobOptions { - /** MIME type declared by the source (e.g. `"application/pdf"`). */ - readonly contentType?: string; - /** Timestamp when the object was created in the source store. */ - readonly createdAt?: Date; - /** Timestamp when the object was last modified in the source store. */ - readonly updatedAt?: Date; -} - -/** - * A file or binary blob retrieved from object storage (S3, GCS, Dropbox, etc.). - * - * Wraps raw bytes together with their storage path so downstream - * processors can decide how to parse the content. File-type information - * is available via two {@link Filetype} getters: - * - * - {@link provided} — declared type from the path extension and the - * cloud-provider / caller-supplied `contentType`. - * - {@link identified} — detected type from the actual bytes via - * magic-bytes signatures (lazy, cached on first access). 
- * - * @example - * ```ts - * const blob = new Blob("uploads/report.pdf", pdfBytes, { - * contentType: "application/pdf", - * }); - * blob.provided; // { extension: ".pdf", mime: "application/pdf" } - * blob.identified; // { extension: ".pdf", mime: "application/pdf" } - * ``` - */ -export class Blob extends Data { - readonly #path: string; - readonly #data: Buffer; - readonly #filetype: Filetype; - readonly #createdAt?: Date | undefined; - readonly #updatedAt?: Date | undefined; - - // Lazy magic-bytes cache — `false` means "not yet computed" - #identified: false | Filetype = false; - - constructor(path: string, data: Buffer, options?: BlobOptions) { - super(); - this.#path = path; - this.#data = data; - this.#createdAt = options?.createdAt; - this.#updatedAt = options?.updatedAt; - - const ext = Blob.#parseExtension(path); - this.#filetype = { - ...(ext && { extension: ext }), - ...(options?.contentType && { mime: options.contentType }), - }; - } - - /** Storage path or key (e.g. `"s3://bucket/file.pdf"`). */ - get path(): string { - return this.#path; - } - - /** Raw binary content. */ - get data(): Buffer { - return this.#data; - } - - /** Size of the raw data in bytes. */ - get size(): number { - return this.#data.byteLength; - } - - /** Timestamp when the object was created in the source store. */ - get createdAt(): Date | undefined { - return this.#createdAt; - } - - /** Timestamp when the object was last modified in the source store. */ - get updatedAt(): Date | undefined { - return this.#updatedAt; - } - - /** Declared file type derived from path extension and constructor contentType. */ - get provided(): Filetype { - return this.#filetype; - } - - /** File type detected from magic bytes. Fields are absent when bytes are not recognizable. */ - get identified(): Filetype { - return this.#identify(); - } - - #identify(): Filetype { - if (this.#identified === false) { - const detected = filetypeinfo(this.#data); - const first = detected[0]; - this.#identified = first - ? 
{ - ...(first.extension && { extension: `.${first.extension}` }), - ...(first.mime && { mime: first.mime }), - } - : {}; - } - return this.#identified; - } - - static #parseExtension(path: string): string | undefined { - const lastDot = path.lastIndexOf("."); - if (lastDot === -1 || lastDot === path.length - 1) return undefined; - return path.slice(lastDot).toLowerCase(); - } -} diff --git a/packages/nvisy-core/src/datatypes/chunk.test.ts b/packages/nvisy-core/src/datatypes/chunk.test.ts deleted file mode 100644 index e81178e..0000000 --- a/packages/nvisy-core/src/datatypes/chunk.test.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { Chunk } from "./chunk.js"; - -describe("Chunk", () => { - it("stores content", () => { - const chunk = new Chunk("Hello, world!"); - expect(chunk.content).toBe("Hello, world!"); - }); - - it("defaults chunkIndex and chunkTotal to undefined", () => { - const chunk = new Chunk("text"); - expect(chunk.chunkIndex).toBeUndefined(); - expect(chunk.chunkTotal).toBeUndefined(); - }); - - it("accepts chunkIndex and chunkTotal via options", () => { - const chunk = new Chunk("text", { chunkIndex: 2, chunkTotal: 10 }); - expect(chunk.chunkIndex).toBe(2); - expect(chunk.chunkTotal).toBe(10); - }); - - it("accepts partial options", () => { - const indexOnly = new Chunk("a", { chunkIndex: 0 }); - expect(indexOnly.chunkIndex).toBe(0); - expect(indexOnly.chunkTotal).toBeUndefined(); - - const totalOnly = new Chunk("b", { chunkTotal: 5 }); - expect(totalOnly.chunkIndex).toBeUndefined(); - expect(totalOnly.chunkTotal).toBe(5); - }); - - it("handles empty content", () => { - const chunk = new Chunk(""); - expect(chunk.content).toBe(""); - }); - - it("extends Data and has id, parentId, metadata", () => { - const chunk = new Chunk("content"); - expect(chunk.id).toMatch( - /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/, - ); - expect(chunk.parentId).toBeNull(); - expect(chunk.metadata).toBeNull(); - }); - - it("has a unique id per instance", () => { - const a = new Chunk("same"); - const b = new Chunk("same"); - expect(a.id).not.toBe(b.id); - }); - - it("supports deriveFrom for lineage", () => { - const parent = new Chunk("parent text"); - const child = new Chunk("child text").deriveFrom(parent); - expect(child.parentId).toBe(parent.id); - expect(child.isDerived).toBe(true); - }); - - it("deriveFrom copies metadata from parent", () => { - const parent = new Chunk("parent").withMetadata({ source: "pdf" }); - const child = new Chunk("child").deriveFrom(parent); - expect(child.metadata).toEqual({ source: "pdf" }); - }); - - it("supports withMetadata", () => { - const chunk = new Chunk("text").withMetadata({ - page: 3, - section: "intro", - }); - expect(chunk.metadata).toEqual({ page: 3, section: "intro" }); - }); - - it("supports withParent", () => { - const chunk = new Chunk("text").withParent("custom-parent-id"); - expect(chunk.parentId).toBe("custom-parent-id"); - }); -}); diff --git a/packages/nvisy-core/src/datatypes/chunk.ts b/packages/nvisy-core/src/datatypes/chunk.ts deleted file mode 100644 index e671046..0000000 --- a/packages/nvisy-core/src/datatypes/chunk.ts +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Text chunk data type produced by chunking steps. - * - * @module - */ - -import { Data } from "./data.js"; - -/** Options for constructing a {@link Chunk}. */ -export interface ChunkOptions { - readonly chunkIndex?: number; - readonly chunkTotal?: number; -} - -/** - * A text segment produced by a chunking step. 
- * - * Represents a portion of a larger {@link Document} after splitting. - * Carries optional provenance fields ({@link chunkIndex}, - * {@link chunkTotal}) so downstream steps can trace chunks back to their - * origin. Use {@link Data.withParent | withParent} to set the source document ID. - * - * @example - * ```ts - * const chunk = new Chunk("First paragraph…", { - * chunkIndex: 0, - * chunkTotal: 5, - * }).deriveFrom(doc); - * ``` - */ -export class Chunk extends Data { - readonly #content: string; - readonly #chunkIndex?: number | undefined; - readonly #chunkTotal?: number | undefined; - - constructor(content: string, options?: ChunkOptions) { - super(); - this.#content = content; - this.#chunkIndex = options?.chunkIndex; - this.#chunkTotal = options?.chunkTotal; - } - - /** Text content of this chunk. */ - get content(): string { - return this.#content; - } - - /** Zero-based index of this chunk within the source document. */ - get chunkIndex(): number | undefined { - return this.#chunkIndex; - } - - /** Total number of chunks the source document was split into. */ - get chunkTotal(): number | undefined { - return this.#chunkTotal; - } -} diff --git a/packages/nvisy-core/src/datatypes/data.test.ts b/packages/nvisy-core/src/datatypes/data.test.ts deleted file mode 100644 index 2d98a8e..0000000 --- a/packages/nvisy-core/src/datatypes/data.test.ts +++ /dev/null @@ -1,88 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { Data } from "./data.js"; - -class TestData extends Data {} - -describe("Data", () => { - it("auto-generates a UUID id", () => { - const a = new TestData(); - const b = new TestData(); - expect(a.id).toMatch( - /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/, - ); - expect(a.id).not.toBe(b.id); - }); - - it("defaults: parentId is null, metadata is null, isDerived is false", () => { - const data = new TestData(); - expect(data.parentId).toBeNull(); - expect(data.metadata).toBeNull(); - expect(data.isDerived).toBe(false); - }); - - describe("deriveFrom", () => { - it("copies parentId and metadata from parent", () => { - const parent = new TestData().withMetadata({ key: "value" }); - const child = new TestData().deriveFrom(parent); - expect(child.parentId).toBe(parent.id); - expect(child.metadata).toEqual({ key: "value" }); - expect(child.isDerived).toBe(true); - }); - - it("copies null metadata from parent", () => { - const parent = new TestData(); - const child = new TestData().deriveFrom(parent); - expect(child.parentId).toBe(parent.id); - expect(child.metadata).toBeNull(); - }); - - it("returns this for chaining", () => { - const parent = new TestData(); - const child = new TestData(); - expect(child.deriveFrom(parent)).toBe(child); - }); - }); - - describe("withParent", () => { - it("sets parentId", () => { - const data = new TestData().withParent("parent-123"); - expect(data.parentId).toBe("parent-123"); - expect(data.isDerived).toBe(true); - }); - - it("accepts null to clear", () => { - const data = new TestData().withParent("p-1").withParent(null); - expect(data.parentId).toBeNull(); - expect(data.isDerived).toBe(false); - }); - - it("returns this for chaining", () => { - const data = new TestData(); - expect(data.withParent("x")).toBe(data); - }); - }); - - describe("withMetadata", () => { - it("sets metadata", () => { - const data = new TestData().withMetadata({ key: "value" }); - expect(data.metadata).toEqual({ key: "value" }); - }); - - it("accepts null to clear", () => { - const data = new TestData().withMetadata({ a: 1 
}).withMetadata(null); - expect(data.metadata).toBeNull(); - }); - - it("returns this for chaining", () => { - const data = new TestData(); - expect(data.withMetadata({ a: 1 })).toBe(data); - }); - }); - - it("deriveFrom then withMetadata overrides metadata", () => { - const parent = new TestData().withMetadata({ old: 1 }); - const child = new TestData().deriveFrom(parent).withMetadata({ new: 2 }); - expect(child.parentId).toBe(parent.id); - expect(child.metadata).toEqual({ new: 2 }); - }); -}); diff --git a/packages/nvisy-core/src/datatypes/data.ts b/packages/nvisy-core/src/datatypes/data.ts deleted file mode 100644 index 1343f8b..0000000 --- a/packages/nvisy-core/src/datatypes/data.ts +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Abstract base data class for all pipeline data types. - * - * @module - */ - -import type { Metadata } from "../types.js"; - -/** - * Abstract base class for all data types flowing through the pipeline. - * - * Every piece of data in the system — documents, embeddings, database rows, - * storage objects — extends this class, guaranteeing a unique {@link id} and - * optional key-value {@link metadata}. - * - * Use {@link deriveFrom} to set lineage and copy metadata from a parent in - * one call. Use {@link withParent} and {@link withMetadata} for manual - * control. All fluent setters return `this` for chaining. - */ -export abstract class Data { - readonly #id: string = crypto.randomUUID(); - #parentId: string | null = null; - #metadata: Metadata | null = null; - - /** Unique identifier for this data item. */ - get id(): string { - return this.#id; - } - - /** ID of the parent data item this was derived from. `null` when this is a root item. */ - get parentId(): string | null { - return this.#parentId; - } - - /** `true` when this item was derived from another (i.e. {@link parentId} is set). */ - get isDerived(): boolean { - return this.#parentId !== null; - } - - /** Key-value metadata attached to this data item. `null` when unset. */ - get metadata(): Metadata | null { - return this.#metadata; - } - - /** - * Mark this item as derived from `parent`, copying its {@link id} as - * {@link parentId} and its {@link metadata}. Returns `this` for chaining. - */ - deriveFrom(parent: Data): this { - this.#parentId = parent.#id; - this.#metadata = parent.#metadata; - return this; - } - - /** Set the parent ID for lineage tracking. Returns `this` for chaining. */ - withParent(id: string | null): this { - this.#parentId = id; - return this; - } - - /** Set or replace metadata. Returns `this` for chaining. 
*/ - withMetadata(metadata: Metadata | null): this { - this.#metadata = metadata; - return this; - } -} diff --git a/packages/nvisy-core/src/datatypes/document.test.ts b/packages/nvisy-core/src/datatypes/document.test.ts deleted file mode 100644 index 940b4ad..0000000 --- a/packages/nvisy-core/src/datatypes/document.test.ts +++ /dev/null @@ -1,242 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { Element } from "../documents/elements.js"; -import { Document } from "./document.js"; - -describe("Document", () => { - it("stores content and has no elements by default", () => { - const doc = new Document("hello world"); - expect(doc.content).toBe("hello world"); - expect(doc.elements).toBeUndefined(); - }); - - it("constructor accepts elements in options", () => { - const el = new Element({ - type: "narrative-text", - text: "Hello", - }); - const doc = new Document("Hello", { elements: [el] }); - expect(doc.content).toBe("Hello"); - expect(doc.elements).toHaveLength(1); - expect(doc.elements![0]!.text).toBe("Hello"); - }); - - describe("title", () => { - it("is undefined by default", () => { - const doc = new Document("text"); - expect(doc.title).toBeUndefined(); - }); - - it("is set via constructor options", () => { - const doc = new Document("text", { title: "Quarterly Report" }); - expect(doc.title).toBe("Quarterly Report"); - }); - - it("is preserved by fromElements", () => { - const el = new Element({ - type: "narrative-text", - text: "hi", - }); - const doc = Document.fromElements([el], { - title: "My Page", - }); - expect(doc.title).toBe("My Page"); - }); - }); - - describe("languages", () => { - it("is empty when there are no elements", () => { - const doc = new Document("text"); - expect(doc.languages).toEqual([]); - }); - - it("is empty when no elements have languages", () => { - const doc = new Document("text", { - elements: [ - new Element({ - type: "narrative-text", - text: "hello", - }), - ], - }); - expect(doc.languages).toEqual([]); - }); - - it("collects unique languages from all elements", () => { - const doc = new Document("text", { - elements: [ - new Element({ - type: "narrative-text", - text: "hello", - languages: ["en"], - }), - new Element({ - type: "narrative-text", - text: "hallo", - languages: ["de", "en"], - }), - new Element({ - type: "narrative-text", - text: "bonjour", - languages: ["fr"], - }), - ], - }); - expect(doc.languages).toEqual(["en", "de", "fr"]); - }); - - it("skips elements without languages", () => { - const doc = new Document("text", { - elements: [ - new Element({ - type: "narrative-text", - text: "no lang", - }), - new Element({ - type: "narrative-text", - text: "has lang", - languages: ["es"], - }), - ], - }); - expect(doc.languages).toEqual(["es"]); - }); - }); - - describe("fromElements", () => { - it("derives content from element texts joined with \\n\\n", () => { - const elements = [ - new Element({ - type: "title", - text: "Title", - level: 1, - }), - new Element({ - type: "narrative-text", - text: "First paragraph.", - }), - new Element({ - type: "narrative-text", - text: "Second paragraph.", - }), - ]; - const doc = Document.fromElements(elements); - expect(doc.content).toBe( - "Title\n\nFirst paragraph.\n\nSecond paragraph.", - ); - expect(doc.elements).toHaveLength(3); - }); - - it("produces empty content from empty elements array", () => { - const doc = Document.fromElements([]); - expect(doc.content).toBe(""); - expect(doc.elements).toEqual([]); - }); - }); - - describe("getElementsByPage", () => { - it("returns empty 
map when there are no elements", () => { - const doc = new Document("text"); - expect(doc.getElementsByPage().size).toBe(0); - }); - - it("groups elements by pageNumber", () => { - const doc = new Document("text", { - elements: [ - new Element({ type: "title", text: "Title", pageNumber: 1 }), - new Element({ type: "narrative-text", text: "p1", pageNumber: 1 }), - new Element({ type: "narrative-text", text: "p2", pageNumber: 2 }), - ], - }); - const pages = doc.getElementsByPage(); - expect(pages.size).toBe(2); - expect(pages.get(1)).toHaveLength(2); - expect(pages.get(2)).toHaveLength(1); - expect(pages.get(2)![0].text).toBe("p2"); - }); - - it("collects elements without pageNumber under key 0", () => { - const doc = new Document("text", { - elements: [ - new Element({ type: "title", text: "Title" }), - new Element({ type: "narrative-text", text: "p1", pageNumber: 1 }), - ], - }); - const pages = doc.getElementsByPage(); - expect(pages.get(0)).toHaveLength(1); - expect(pages.get(0)![0].text).toBe("Title"); - expect(pages.get(1)).toHaveLength(1); - }); - - it("preserves element order within each page", () => { - const doc = new Document("text", { - elements: [ - new Element({ type: "title", text: "A", pageNumber: 1 }), - new Element({ type: "narrative-text", text: "B", pageNumber: 2 }), - new Element({ type: "narrative-text", text: "C", pageNumber: 1 }), - ], - }); - const page1 = doc.getElementsByPage().get(1)!; - expect(page1.map((e) => e.text)).toEqual(["A", "C"]); - }); - }); - - describe("Element", () => { - it("auto-generates a unique id", () => { - const a = new Element({ - type: "narrative-text", - text: "a", - }); - const b = new Element({ - type: "narrative-text", - text: "b", - }); - expect(a.id).toBeTruthy(); - expect(b.id).toBeTruthy(); - expect(a.id).not.toBe(b.id); - }); - - it("carries parentId for hierarchy", () => { - const table = new Element({ type: "table", text: "" }); - const child = new Element({ - type: "narrative-text", - text: "Revenue", - parentId: table.id, - }); - expect(child.parentId).toBe(table.id); - }); - - it("carries pageNumber", () => { - const el = new Element({ - type: "title", - text: "Intro", - pageNumber: 2, - }); - expect(el.pageNumber).toBe(2); - }); - - it("carries optional enrichment fields", () => { - const el = new Element({ - type: "table", - text: "A | B", - languages: ["en"], - provenance: { confidence: 0.95, isContinuation: false }, - }); - expect(el.provenance?.confidence).toBe(0.95); - expect(el.languages).toEqual(["en"]); - expect(el.provenance?.isContinuation).toBe(false); - }); - - it("accepts various element types", () => { - const elements = [ - new Element({ type: "formula", text: "E = mc²" }), - new Element({ type: "list-item", text: "First item" }), - new Element({ type: "page-break", text: "" }), - ]; - expect(elements.map((e) => e.type)).toEqual([ - "formula", - "list-item", - "page-break", - ]); - }); - }); -}); diff --git a/packages/nvisy-core/src/datatypes/document.ts b/packages/nvisy-core/src/datatypes/document.ts deleted file mode 100644 index 27998a8..0000000 --- a/packages/nvisy-core/src/datatypes/document.ts +++ /dev/null @@ -1,112 +0,0 @@ -/** - * Document data type with optional structured elements. - * - * @module - */ - -import type { Element } from "../documents/elements.js"; -import { Data } from "./data.js"; - -/** Options for constructing a {@link Document}. */ -export interface DocumentOptions { - /** Document title (e.g. HTML `<title>`, PDF metadata). 
*/ - readonly title?: string; - /** Pre-extracted structural elements. */ - readonly elements?: readonly Element[]; -} - -/** - * A parsed human-readable text representation of a document. - * - * Represents extracted text from a partition step — the raw bytes have - * already been converted into plain text that can be chunked, enriched, - * or embedded. - * - * Structural detail is carried as a flat array of {@link Element} - * instances. Hierarchy is expressed via `parentId` references and page - * membership via `pageNumber` on each element. - * - * @example - * ```ts - * const doc = Document.fromElements([ - * new Element({ type: "title", text: "Quarterly Report", pageNumber: 1 }), - * new Element({ type: "narrative-text", text: "Revenue increased…", pageNumber: 1 }), - * ]); - * ``` - */ -export class Document extends Data { - readonly #content: string; - readonly #title?: string | undefined; - readonly #elements?: readonly Element[] | undefined; - - constructor(content: string, options?: DocumentOptions) { - super(); - this.#content = content; - this.#title = options?.title; - this.#elements = options?.elements; - } - - /** Text content of the document. */ - get content(): string { - return this.#content; - } - - /** Document title (e.g. HTML `<title>`, PDF metadata). */ - get title(): string | undefined { - return this.#title; - } - - /** Unique BCP-47 language tags collected from all elements. */ - get languages(): readonly string[] { - if (this.#elements == null) return []; - const uniqueLanguages = new Set<string>(); - for (const element of this.#elements) { - if (element.languages != null) { - for (const language of element.languages) { - uniqueLanguages.add(language); - } - } - } - return [...uniqueLanguages]; - } - - /** Flat ordered list of structural elements. */ - get elements(): readonly Element[] | undefined { - return this.#elements; - } - - /** - * Group elements by their 1-based page number. - * - * Returns a `Map` keyed by page number with each value being the - * ordered array of elements on that page. Elements without a - * `pageNumber` are collected under key `0`. - */ - getElementsByPage(): Map<number, Element[]> { - const map = new Map<number, Element[]>(); - if (this.#elements == null) return map; - for (const el of this.#elements) { - const page = el.pageNumber ?? 0; - let bucket = map.get(page); - if (bucket == null) { - bucket = []; - map.set(page, bucket); - } - bucket.push(el); - } - return map; - } - - /** - * Create a Document by deriving `content` from the element texts. - * - * Element texts are joined with `\n\n` separators. 
- */ - static fromElements( - elements: readonly Element[], - options?: Omit<DocumentOptions, "elements">, - ): Document { - const content = elements.map((el) => el.text).join("\n\n"); - return new Document(content, { ...options, elements }); - } -} diff --git a/packages/nvisy-core/src/datatypes/embedding.test.ts b/packages/nvisy-core/src/datatypes/embedding.test.ts deleted file mode 100644 index 1e0db5f..0000000 --- a/packages/nvisy-core/src/datatypes/embedding.test.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { Embedding } from "./embedding.js"; - -describe("Embedding", () => { - it("constructs from a number array", () => { - const e = new Embedding([0.1, -0.2, 0.3]); - expect(e.dimensions).toBe(3); - expect(e.vector).toBeInstanceOf(Float32Array); - }); - - it("constructs from a Float32Array", () => { - const arr = new Float32Array([1.0, 2.0]); - const e = new Embedding(arr); - expect(e.dimensions).toBe(2); - expect(e.vector).toBe(arr); - }); - - it("preserves approximate values from number array", () => { - const e = new Embedding([0.5, -0.5]); - expect(e.vector[0]).toBeCloseTo(0.5); - expect(e.vector[1]).toBeCloseTo(-0.5); - }); - - it("handles zero-length vector", () => { - const e = new Embedding([]); - expect(e.dimensions).toBe(0); - expect(e.vector).toHaveLength(0); - }); - - it("has a unique id", () => { - const a = new Embedding([1]); - const b = new Embedding([1]); - expect(a.id).not.toBe(b.id); - }); - - it("supports lineage via deriveFrom", () => { - const parent = new Embedding([1, 2]); - const child = new Embedding([3, 4]); - child.deriveFrom(parent); - - expect(child.parentId).toBe(parent.id); - expect(child.isDerived).toBe(true); - }); - - it("supports metadata", () => { - const e = new Embedding([0.1]); - e.withMetadata({ model: "text-embedding-3-small" }); - expect(e.metadata).toEqual({ model: "text-embedding-3-small" }); - }); -}); diff --git a/packages/nvisy-core/src/datatypes/embedding.ts b/packages/nvisy-core/src/datatypes/embedding.ts deleted file mode 100644 index 7de32c4..0000000 --- a/packages/nvisy-core/src/datatypes/embedding.ts +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Dense vector embedding data type for similarity search. - * - * @module - */ - -import { Data } from "./data.js"; - -/** - * A dense vector embedding produced by an embedding model. - * - * Stores the vector as a `Float32Array` for memory efficiency and fast - * math operations. Use {@link dimensions} to inspect the vector size - * without accessing the underlying array. - * - * @example - * ```ts - * const e = new Embedding([0.12, -0.34, 0.56]); - * console.log(e.dimensions); // 3 - * ``` - */ -export class Embedding extends Data { - readonly #vector: Float32Array; - - constructor(vector: Float32Array | number[]) { - super(); - this.#vector = - vector instanceof Float32Array ? vector : new Float32Array(vector); - } - - /** The dense embedding vector. */ - get vector(): Float32Array { - return this.#vector; - } - - /** Dimensionality of the embedding vector. */ - get dimensions(): number { - return this.#vector.length; - } -} diff --git a/packages/nvisy-core/src/datatypes/index.ts b/packages/nvisy-core/src/datatypes/index.ts deleted file mode 100644 index 0573345..0000000 --- a/packages/nvisy-core/src/datatypes/index.ts +++ /dev/null @@ -1,38 +0,0 @@ -/** - * @module datatypes - * - * Base data model and built-in types for the Nvisy pipeline. 
- */ - -export type { BlobOptions, Filetype } from "./blob.js"; -export { Blob } from "./blob.js"; -export type { ChunkOptions } from "./chunk.js"; -export { Chunk } from "./chunk.js"; -export { Data } from "./data.js"; -export type { DocumentOptions } from "./document.js"; -export { Document } from "./document.js"; -export { Embedding } from "./embedding.js"; - -import type { ClassRef } from "../types.js"; -import type { Data } from "./data.js"; - -/** - * A custom data type registered by a plugin. - * - * Plugins use this to extend the type system with new {@link Data} - * subclasses without modifying nvisy-core. - */ -export interface DatatypeDescriptor { - /** Unique identifier for this data type (e.g. "audio", "image"). */ - readonly id: string; - /** Class reference for the custom data type. */ - readonly dataClass: ClassRef<Data>; -} - -/** Factory for creating data type entries. */ -export const Datatype = { - /** Create a DatatypeDescriptor for registering a custom data type with a plugin. */ - define(id: string, dataClass: ClassRef<Data>): DatatypeDescriptor { - return { id, dataClass }; - }, -} as const; diff --git a/packages/nvisy-core/src/documents/coordinates.test.ts b/packages/nvisy-core/src/documents/coordinates.test.ts deleted file mode 100644 index 059cb13..0000000 --- a/packages/nvisy-core/src/documents/coordinates.test.ts +++ /dev/null @@ -1,148 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { CoordinateSystem, Orientations } from "./coordinates.js"; - -describe("Orientations", () => { - it("SCREEN is [1, -1]", () => { - expect(Orientations.SCREEN).toEqual([1, -1]); - }); - - it("CARTESIAN is [1, 1]", () => { - expect(Orientations.CARTESIAN).toEqual([1, 1]); - }); -}); - -describe("CoordinateSystem", () => { - describe("static factories", () => { - it("pixel() creates a screen-oriented system", () => { - const sys = CoordinateSystem.pixel(1920, 1080); - expect(sys.width).toBe(1920); - expect(sys.height).toBe(1080); - expect(sys.orientation).toEqual(Orientations.SCREEN); - }); - - it("point() creates a cartesian-oriented system", () => { - const sys = CoordinateSystem.point(612, 792); - expect(sys.width).toBe(612); - expect(sys.height).toBe(792); - expect(sys.orientation).toEqual(Orientations.CARTESIAN); - }); - - it("relative() creates a 1x1 cartesian system", () => { - const sys = CoordinateSystem.relative(); - expect(sys.width).toBe(1); - expect(sys.height).toBe(1); - expect(sys.orientation).toEqual(Orientations.CARTESIAN); - }); - }); - - describe("toRelative / fromRelative", () => { - it("pixel origin (0,0) maps to relative (0,1)", () => { - const px = CoordinateSystem.pixel(100, 100); - const rel = px.toRelative({ x: 0, y: 0 }); - expect(rel.x).toBeCloseTo(0); - expect(rel.y).toBeCloseTo(1); - }); - - it("pixel bottom-right maps to relative (1,0)", () => { - const px = CoordinateSystem.pixel(100, 100); - const rel = px.toRelative({ x: 100, y: 100 }); - expect(rel.x).toBeCloseTo(1); - expect(rel.y).toBeCloseTo(0); - }); - - it("point origin (0,0) maps to relative (0,0)", () => { - const pt = CoordinateSystem.point(612, 792); - const rel = pt.toRelative({ x: 0, y: 0 }); - expect(rel.x).toBeCloseTo(0); - expect(rel.y).toBeCloseTo(0); - }); - - it("fromRelative is the inverse of toRelative", () => { - const px = CoordinateSystem.pixel(200, 300); - const original = { x: 50, y: 75 }; - const rel = px.toRelative(original); - const back = px.fromRelative(rel); - expect(back.x).toBeCloseTo(original.x); - expect(back.y).toBeCloseTo(original.y); - }); - }); - 
- describe("convertTo", () => { - it("converts pixel top-left to point bottom-left", () => { - const px = CoordinateSystem.pixel(100, 100); - const pt = CoordinateSystem.point(100, 100); - const result = px.convertTo(pt, { x: 0, y: 0 }); - expect(result.x).toBeCloseTo(0); - expect(result.y).toBeCloseTo(100); - }); - - it("converts pixel center to point center", () => { - const px = CoordinateSystem.pixel(200, 200); - const pt = CoordinateSystem.point(200, 200); - const result = px.convertTo(pt, { x: 100, y: 100 }); - expect(result.x).toBeCloseTo(100); - expect(result.y).toBeCloseTo(100); - }); - - it("handles different dimensions between systems", () => { - const px = CoordinateSystem.pixel(1920, 1080); - const pt = CoordinateSystem.point(612, 792); - const result = px.convertTo(pt, { x: 960, y: 540 }); - expect(result.x).toBeCloseTo(306); - expect(result.y).toBeCloseTo(396); - }); - - it("round-trips through relative", () => { - const a = CoordinateSystem.pixel(800, 600); - const b = CoordinateSystem.point(400, 300); - const p = { x: 200, y: 150 }; - const converted = a.convertTo(b, p); - const back = b.convertTo(a, converted); - expect(back.x).toBeCloseTo(p.x); - expect(back.y).toBeCloseTo(p.y); - }); - }); - - describe("convertAllTo", () => { - it("converts all corner points at once", () => { - const px = CoordinateSystem.pixel(100, 100); - const pt = CoordinateSystem.point(100, 100); - const corners = [ - { x: 10, y: 20 }, - { x: 10, y: 80 }, - { x: 90, y: 80 }, - { x: 90, y: 20 }, - ]; - const result = px.convertAllTo(pt, corners); - expect(result).toHaveLength(4); - expect(result[0]!.x).toBeCloseTo(10); - expect(result[0]!.y).toBeCloseTo(80); - }); - - it("returns empty array for empty input", () => { - const px = CoordinateSystem.pixel(100, 100); - const pt = CoordinateSystem.point(100, 100); - expect(px.convertAllTo(pt, [])).toEqual([]); - }); - }); - - describe("equals", () => { - it("returns true for identical systems", () => { - const a = CoordinateSystem.pixel(100, 200); - const b = CoordinateSystem.pixel(100, 200); - expect(a.equals(b)).toBe(true); - }); - - it("returns false for different orientations", () => { - const px = CoordinateSystem.pixel(100, 100); - const pt = CoordinateSystem.point(100, 100); - expect(px.equals(pt)).toBe(false); - }); - - it("returns false for different dimensions", () => { - const a = CoordinateSystem.pixel(100, 100); - const b = CoordinateSystem.pixel(200, 100); - expect(a.equals(b)).toBe(false); - }); - }); -}); diff --git a/packages/nvisy-core/src/documents/coordinates.ts b/packages/nvisy-core/src/documents/coordinates.ts deleted file mode 100644 index 5cc378f..0000000 --- a/packages/nvisy-core/src/documents/coordinates.ts +++ /dev/null @@ -1,174 +0,0 @@ -/** - * Spatial coordinate systems and element positioning for document elements. - * - * Coordinate systems differ by origin and axis direction: - * - **Pixel space** — origin at top-left, y increases downward (images, OCR). - * - **Point space** — origin at bottom-left, y increases upward (PDF, PostScript). - * - **Relative** — unit square (0–1 on both axes), y increases upward. - * - * Element positions are stored as an array of corner {@link Point | points} - * rather than an axis-aligned bounding box, so rotated and skewed regions - * are represented without loss. - * - * Use {@link CoordinateSystem.convertTo} to transform points between systems. 
- * - * @example - * ```ts - * const px = CoordinateSystem.pixel(1920, 1080); - * const pt = CoordinateSystem.point(612, 792); - * const result = px.convertTo(pt, { x: 960, y: 540 }); - * ``` - * - * @module - */ - -/** A point in 2D space. */ -export interface Point { - readonly x: number; - readonly y: number; -} - -/** - * Axis orientation as an `[xSign, ySign]` tuple. - * - * - `1` — value grows in the standard (rightward / upward) direction. - * - `-1` — axis is inverted (e.g. y grows downward for screen coordinates). - */ -export type Orientation = readonly [x: 1 | -1, y: 1 | -1]; - -/** - * Built-in orientation presets. - * - * - `Orientations.SCREEN` — origin top-left, y increases downward. - * - `Orientations.CARTESIAN` — origin bottom-left, y increases upward. - */ -export const Orientations = { - /** Screen orientation — origin top-left, y increases downward. */ - SCREEN: [1, -1] as Orientation, - /** Cartesian orientation — origin bottom-left, y increases upward. */ - CARTESIAN: [1, 1] as Orientation, -} as const; - -/** Convert a single coordinate along one axis via a linear transformation. */ -function convertAxis( - value: number, - fromMax: number, - toMax: number, - sign: 1 | -1, -): number { - const t = value / fromMax; - return (((1 - t) * (1 - sign)) / 2 + (t * (1 + sign)) / 2) * toMax; -} - -/** - * A finite coordinate plane with a given width, height, and orientation. - * - * Instances are immutable value objects. Use the static factories - * {@link CoordinateSystem.pixel}, {@link CoordinateSystem.point}, and - * {@link CoordinateSystem.relative} for the common coordinate spaces. - */ -export class CoordinateSystem { - /** Width of the coordinate plane. */ - readonly width: number; - - /** Height of the coordinate plane. */ - readonly height: number; - - /** Axis orientation of this coordinate system. */ - readonly orientation: Orientation; - - constructor(width: number, height: number, orientation: Orientation) { - this.width = width; - this.height = height; - this.orientation = orientation; - } - - /** Pixel-space system (origin top-left, y down). */ - static pixel(width: number, height: number): CoordinateSystem { - return new CoordinateSystem(width, height, Orientations.SCREEN); - } - - /** Point-space system (origin bottom-left, y up). */ - static point(width: number, height: number): CoordinateSystem { - return new CoordinateSystem(width, height, Orientations.CARTESIAN); - } - - /** Unit-square relative coordinate system (0–1, Cartesian). */ - static relative(): CoordinateSystem { - return new CoordinateSystem(1, 1, Orientations.CARTESIAN); - } - - /** Convert a point from this system to the 0–1 relative system. */ - toRelative(p: Point): Point { - const [xSign, ySign] = this.orientation; - return { - x: convertAxis(p.x, this.width, 1, xSign), - y: convertAxis(p.y, this.height, 1, ySign), - }; - } - - /** Convert a point from the 0–1 relative system into this system. */ - fromRelative(p: Point): Point { - const [xSign, ySign] = this.orientation; - return { - x: convertAxis(p.x, 1, this.width, xSign), - y: convertAxis(p.y, 1, this.height, ySign), - }; - } - - /** Convert a point from this system into `target`. */ - convertTo(target: CoordinateSystem, p: Point): Point { - return target.fromRelative(this.toRelative(p)); - } - - /** - * Convert an array of points from this system into `target`. - * - * Convenience wrapper around {@link convertTo} for transforming - * all corners of an {@link ElementCoordinates.points} array at once. 
- */ - convertAllTo(target: CoordinateSystem, points: readonly Point[]): Point[] { - return points.map((p) => this.convertTo(target, p)); - } - - /** Structural equality. */ - equals(other: CoordinateSystem): boolean { - return ( - this.width === other.width && - this.height === other.height && - this.orientation[0] === other.orientation[0] && - this.orientation[1] === other.orientation[1] - ); - } -} - -/** - * Spatial coordinates for a document element. - * - * Corner points specify the bounding region of the element, starting - * from the top-left corner and proceeding counter-clockwise. Using - * points rather than an axis-aligned box naturally handles rotated - * and skewed regions. - * - * @example - * ```ts - * const coords: ElementCoordinates = { - * points: [ - * { x: 10, y: 20 }, // top-left - * { x: 10, y: 120 }, // bottom-left - * { x: 210, y: 120 }, // bottom-right - * { x: 210, y: 20 }, // top-right - * ], - * system: CoordinateSystem.pixel(1920, 1080), - * }; - * ``` - */ -export interface ElementCoordinates { - /** - * Corner points of the bounding region, counter-clockwise - * from top-left. - */ - readonly points: readonly Point[]; - /** The coordinate system the points were measured in. */ - readonly system: CoordinateSystem; -} diff --git a/packages/nvisy-core/src/documents/elements.test.ts b/packages/nvisy-core/src/documents/elements.test.ts deleted file mode 100644 index 85fdc38..0000000 --- a/packages/nvisy-core/src/documents/elements.test.ts +++ /dev/null @@ -1,310 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { - CompositeElement, - Element, - EmailElement, - FormElement, - ImageElement, - TableElement, -} from "./elements.js"; - -describe("Element", () => { - it("auto-generates a unique id", () => { - const a = new Element({ type: "title", text: "a" }); - const b = new Element({ type: "title", text: "b" }); - expect(a.id).toBeTruthy(); - expect(b.id).toBeTruthy(); - expect(a.id).not.toBe(b.id); - }); - - it("assigns type and text from options", () => { - const el = new Element({ type: "narrative-text", text: "Hello world" }); - expect(el.type).toBe("narrative-text"); - expect(el.text).toBe("Hello world"); - }); - - it("carries all base fields", () => { - const el = new Element({ - type: "title", - text: "Intro", - parentId: "parent-1", - pageNumber: 2, - level: 1, - languages: ["en"], - metadata: { key: "value" }, - }); - expect(el.parentId).toBe("parent-1"); - expect(el.pageNumber).toBe(2); - expect(el.level).toBe(1); - expect(el.languages).toEqual(["en"]); - expect(el.metadata).toEqual({ key: "value" }); - }); - - it("all optional fields default to undefined", () => { - const el = new Element({ type: "title", text: "" }); - expect(el.parentId).toBeUndefined(); - expect(el.pageNumber).toBeUndefined(); - expect(el.pageName).toBeUndefined(); - expect(el.level).toBeUndefined(); - expect(el.languages).toBeUndefined(); - expect(el.metadata).toBeUndefined(); - expect(el.sourceTag).toBeUndefined(); - expect(el.textAsHtml).toBeUndefined(); - expect(el.links).toBeUndefined(); - expect(el.emphasizedTexts).toBeUndefined(); - expect(el.provenance).toBeUndefined(); - }); - - describe("links and emphasizedTexts", () => { - it("carries links with startIndex", () => { - const el = new Element({ - type: "narrative-text", - text: "Visit example.com for details.", - links: [ - { text: "example.com", url: "https://example.com", startIndex: 6 }, - ], - }); - expect(el.links).toHaveLength(1); - expect(el.links![0].startIndex).toBe(6); - 
expect(el.links![0].url).toBe("https://example.com"); - }); - - it("carries emphasizedTexts", () => { - const el = new Element({ - type: "title", - text: "Important notice", - emphasizedTexts: [{ text: "Important", tag: "strong" }], - }); - expect(el.emphasizedTexts).toHaveLength(1); - expect(el.emphasizedTexts![0].tag).toBe("strong"); - }); - - it("available on any element type, not just text", () => { - const el = new TableElement({ - type: "table", - text: "A table with links", - links: [{ text: "link", url: "https://example.com", startIndex: 0 }], - emphasizedTexts: [{ text: "table", tag: "b" }], - }); - expect(el.links).toHaveLength(1); - expect(el.emphasizedTexts).toHaveLength(1); - }); - }); - - describe("table fields", () => { - it("carries cells with row, column, isHeader", () => { - const el = new TableElement({ - type: "table", - text: "", - cells: [ - { row: 0, column: 0, text: "Name", isHeader: true }, - { row: 0, column: 1, text: "Age", isHeader: true }, - { row: 1, column: 0, text: "Alice" }, - { row: 1, column: 1, text: "30" }, - ], - }); - expect(el).toBeInstanceOf(Element); - expect(el.cells).toHaveLength(4); - expect(el.cells![0].isHeader).toBe(true); - expect(el.cells![2].text).toBe("Alice"); - }); - - it("cells defaults to undefined", () => { - const el = new TableElement({ type: "table", text: "" }); - expect(el.cells).toBeUndefined(); - }); - }); - - describe("image fields", () => { - it("carries imageBase64, imageMimeType, imageUrl, imagePath", () => { - const el = new ImageElement({ - type: "image", - text: "A photo", - imageBase64: "abc123==", - imageMimeType: "image/png", - imageUrl: "https://example.com/photo.png", - imagePath: "/tmp/photo.png", - }); - expect(el.imageBase64).toBe("abc123=="); - expect(el.imageMimeType).toBe("image/png"); - expect(el.imageUrl).toBe("https://example.com/photo.png"); - expect(el.imagePath).toBe("/tmp/photo.png"); - }); - - it("is an instance of Element", () => { - const el = new ImageElement({ type: "image", text: "photo" }); - expect(el).toBeInstanceOf(Element); - expect(el).toBeInstanceOf(ImageElement); - }); - - it("image fields default to undefined", () => { - const el = new ImageElement({ type: "image", text: "" }); - expect(el.imageBase64).toBeUndefined(); - expect(el.imageMimeType).toBeUndefined(); - expect(el.imageUrl).toBeUndefined(); - expect(el.imagePath).toBeUndefined(); - }); - - it("image fields only live on ImageElement", () => { - const base = new Element({ type: "image", text: "" }); - expect("imageBase64" in base).toBe(false); - expect("imageMimeType" in base).toBe(false); - expect("imageUrl" in base).toBe(false); - expect("imagePath" in base).toBe(false); - }); - }); - - describe("form fields", () => { - it("carries checked and value", () => { - const el = new FormElement({ - type: "checkbox", - text: "Accept terms", - checked: true, - value: "yes", - }); - expect(el).toBeInstanceOf(Element); - expect(el.checked).toBe(true); - expect(el.value).toBe("yes"); - }); - - it("checked and value default to undefined", () => { - const el = new FormElement({ type: "checkbox", text: "" }); - expect(el.checked).toBeUndefined(); - expect(el.value).toBeUndefined(); - }); - }); - - describe("email fields", () => { - it("carries all email envelope fields", () => { - const el = new EmailElement({ - type: "email-message", - text: "Hello from email", - sentFrom: ["alice@example.com"], - sentTo: ["bob@example.com"], - ccRecipient: ["carol@example.com"], - bccRecipient: ["dave@example.com"], - subject: "Meeting notes", - signature: 
"— Alice", - emailMessageId: "<msg-001@example.com>", - }); - expect(el).toBeInstanceOf(Element); - expect(el.sentFrom).toEqual(["alice@example.com"]); - expect(el.sentTo).toEqual(["bob@example.com"]); - expect(el.ccRecipient).toEqual(["carol@example.com"]); - expect(el.bccRecipient).toEqual(["dave@example.com"]); - expect(el.subject).toBe("Meeting notes"); - expect(el.signature).toBe("— Alice"); - expect(el.emailMessageId).toBe("<msg-001@example.com>"); - }); - - it("email fields default to undefined", () => { - const el = new EmailElement({ type: "email-message", text: "" }); - expect(el.sentFrom).toBeUndefined(); - expect(el.sentTo).toBeUndefined(); - expect(el.ccRecipient).toBeUndefined(); - expect(el.bccRecipient).toBeUndefined(); - expect(el.subject).toBeUndefined(); - expect(el.signature).toBeUndefined(); - expect(el.emailMessageId).toBeUndefined(); - }); - }); - - describe("provenance fields", () => { - it("carries detectionOrigin via provenance", () => { - const el = new Element({ - type: "title", - text: "Hello", - provenance: { detectionOrigin: "tesseract-v5" }, - }); - expect(el.provenance?.detectionOrigin).toBe("tesseract-v5"); - }); - - it("carries headerFooterType via provenance", () => { - const el = new Element({ - type: "header", - text: "Page 1", - provenance: { headerFooterType: "page-header" }, - }); - expect(el.provenance?.headerFooterType).toBe("page-header"); - }); - }); - - describe("source fidelity fields", () => { - it("carries sourceTag for format-specific origin", () => { - const el = new Element({ - type: "narrative-text", - text: "To be or not to be", - sourceTag: "blockquote", - }); - expect(el.sourceTag).toBe("blockquote"); - }); - - it("carries textAsHtml on base Element", () => { - const el = new Element({ - type: "narrative-text", - text: "bold text", - textAsHtml: "<p><strong>bold</strong> text</p>", - }); - expect(el.textAsHtml).toBe("<p><strong>bold</strong> text</p>"); - }); - - it("carries textAsHtml on TableElement", () => { - const el = new TableElement({ - type: "table", - text: "Name\tAge\nAlice\t30", - textAsHtml: "<table><tr><td>Name</td><td>Age</td></tr></table>", - }); - expect(el.textAsHtml).toBe( - "<table><tr><td>Name</td><td>Age</td></tr></table>", - ); - }); - - it("carries pageName for worksheet-based sources", () => { - const el = new Element({ - type: "table", - text: "data", - pageName: "Sheet1", - }); - expect(el.pageName).toBe("Sheet1"); - }); - }); - - describe("form keyValuePairs", () => { - it("carries structured key-value pairs", () => { - const el = new FormElement({ - type: "form-keys-values", - text: "Name: Alice", - keyValuePairs: [ - { key: "Name", value: "Alice", confidence: 0.99 }, - { key: "Age", value: "30" }, - ], - }); - expect(el.keyValuePairs).toHaveLength(2); - expect(el.keyValuePairs![0].key).toBe("Name"); - expect(el.keyValuePairs![0].value).toBe("Alice"); - expect(el.keyValuePairs![0].confidence).toBe(0.99); - expect(el.keyValuePairs![1].confidence).toBeUndefined(); - }); - - it("keyValuePairs defaults to undefined", () => { - const el = new FormElement({ type: "form-keys-values", text: "" }); - expect(el.keyValuePairs).toBeUndefined(); - }); - }); - - describe("composite fields", () => { - it("carries origElements", () => { - const orig1 = new Element({ type: "narrative-text", text: "Part 1" }); - const orig2 = new Element({ type: "narrative-text", text: "Part 2" }); - const composite = new CompositeElement({ - type: "narrative-text", - text: "Part 1 Part 2", - origElements: [orig1, orig2], - }); - 
expect(composite).toBeInstanceOf(Element); - expect(composite.origElements).toHaveLength(2); - expect(composite.origElements[0].text).toBe("Part 1"); - expect(composite.origElements[1].text).toBe("Part 2"); - }); - }); -}); diff --git a/packages/nvisy-core/src/documents/elements.ts b/packages/nvisy-core/src/documents/elements.ts deleted file mode 100644 index 9544c56..0000000 --- a/packages/nvisy-core/src/documents/elements.ts +++ /dev/null @@ -1,374 +0,0 @@ -/** - * Document element model. - * - * Every structural piece of a parsed document — paragraphs, headings, - * tables, images, etc. — is represented as an {@link Element} instance. - * The {@link Element.type | type} field (one of the {@link ElementType} - * string literals defined in `ontology.ts`) is the primary discriminator. - * - * Type-specific fields live on dedicated subclasses: - * - * | Subclass | Category | Extra fields | - * | ------------------- | -------- | ----------------------------------------- | - * | {@link ImageElement} | media | base64, mime type, URL, path | - * | {@link TableElement} | table | structured cells | - * | {@link FormElement} | form | checkbox state, value, key-value pairs | - * | {@link EmailElement} | email | envelope (from, to, cc, bcc, subject, …) | - * | {@link CompositeElement} | any | pre-chunking original elements | - * - * Extraction / OCR provenance fields are bundled in - * {@link ElementProvenance} rather than scattered across the base class. - * - * Source-format fidelity is preserved via {@link Element.sourceTag} (the - * original HTML tag or format-specific type name) and - * {@link Element.textAsHtml} (original markup for round-tripping). - * - * @module - */ - -import type { Metadata } from "../types.js"; -import type { ElementCoordinates } from "./coordinates.js"; -import type { - ElementType, - EmailType, - FormType, - MediaType, - TableType, -} from "./ontology.js"; - -/** An inline hyperlink within element text. */ -export interface Link { - /** The visible link text. */ - readonly text: string; - /** The target URL. */ - readonly url: string; - /** 0-based character offset of the link text within the element's {@link Element.text}. */ - readonly startIndex: number; -} - -/** An inline formatting span within element text. */ -export interface EmphasizedText { - /** The formatted text content. */ - readonly text: string; - /** HTML tag name — `"b"`, `"i"`, `"em"`, `"strong"`, etc. */ - readonly tag: string; -} - -/** A single cell within a table structure. */ -export interface TableCellData { - /** 0-based row index. */ - readonly row: number; - /** 0-based column index. */ - readonly column: number; - /** Plain-text content of the cell. */ - readonly text: string; - /** `true` when this cell is part of the table header. */ - readonly isHeader?: boolean; -} - -/** Extraction / OCR provenance fields bundled into a single object. */ -export interface ElementProvenance { - /** Spatial position on the source page (OCR, PDF). */ - readonly coordinates?: ElementCoordinates; - /** Extraction confidence score (0–1). */ - readonly confidence?: number; - /** Which model or system produced this element. */ - readonly detectionOrigin?: string; - /** `true` when this element continues from a previous page or chunk. */ - readonly isContinuation?: boolean; - /** Distinguishes page-header vs document-header, etc. */ - readonly headerFooterType?: string; -} - -/** A structured key-value pair extracted from a form. */ -export interface FormKeyValuePair { - /** The field label. 
*/ - readonly key: string; - /** The field value, if present. */ - readonly value?: string; - /** Extraction confidence score (0–1). */ - readonly confidence?: number; -} - -/** Options for constructing an {@link Element}. */ -export interface ElementOptions { - /** The element's structural type. */ - readonly type: ElementType; - /** Extracted text content. May be empty for non-textual elements. */ - readonly text: string; - /** ID of the parent element (e.g. a table cell's parent row). */ - readonly parentId?: string; - /** 1-based page number this element belongs to. */ - readonly pageNumber?: number; - /** Named page or sheet (e.g. XLSX worksheet name). */ - readonly pageName?: string; - /** Nesting depth — 1–6 for headings, 1+ for nested lists. */ - readonly level?: number; - /** BCP-47 language tags detected for this element. */ - readonly languages?: readonly string[]; - /** Element-scoped metadata (e.g. table caption, alt text). */ - readonly metadata?: Metadata; - /** Original source tag or format-specific type name (e.g. `"blockquote"`, `"dl"`). */ - readonly sourceTag?: string; - /** Original markup for round-tripping (e.g. the HTML of a table row). */ - readonly textAsHtml?: string; - /** Inline hyperlinks embedded in {@link text}. */ - readonly links?: readonly Link[]; - /** Bold / italic formatting spans embedded in {@link text}. */ - readonly emphasizedTexts?: readonly EmphasizedText[]; - /** Extraction / OCR provenance data. */ - readonly provenance?: ElementProvenance; -} - -/** - * A single structural element extracted from a document. - * - * Every element carries an {@link id}, a {@link type} discriminator, - * and its extracted {@link text}. Type-specific fields live on - * dedicated subclasses; provenance data is in {@link provenance}. - * - * Hierarchy is expressed via {@link parentId} references rather than - * nesting, keeping the element array flat and easy to iterate. - */ -export class Element { - /** Unique identifier for this element. */ - readonly id: string = crypto.randomUUID(); - /** The element's structural type. */ - readonly type: ElementType; - /** Extracted text content. May be empty for non-textual elements. */ - readonly text: string; - /** ID of the parent element (e.g. a table cell's parent row). */ - readonly parentId?: string | undefined; - /** 1-based page number this element belongs to. */ - readonly pageNumber?: number | undefined; - /** Named page or sheet (e.g. XLSX worksheet name). */ - readonly pageName?: string | undefined; - /** Nesting depth — 1–6 for headings, 1+ for nested lists. */ - readonly level?: number | undefined; - /** BCP-47 language tags detected for this element. */ - readonly languages?: readonly string[] | undefined; - /** Element-scoped metadata (e.g. table caption, alt text). */ - readonly metadata?: Metadata | undefined; - /** Original source tag or format-specific type name (e.g. `"blockquote"`, `"dl"`). */ - readonly sourceTag?: string | undefined; - /** Original markup for round-tripping (e.g. the HTML of a table row). */ - readonly textAsHtml?: string | undefined; - /** Inline hyperlinks embedded in {@link text}. */ - readonly links?: readonly Link[] | undefined; - /** Bold / italic formatting spans embedded in {@link text}. */ - readonly emphasizedTexts?: readonly EmphasizedText[] | undefined; - /** Extraction / OCR provenance data. 
*/ - readonly provenance?: ElementProvenance | undefined; - - constructor(options: ElementOptions) { - this.type = options.type; - this.text = options.text; - this.parentId = options.parentId; - this.pageNumber = options.pageNumber; - this.pageName = options.pageName; - this.level = options.level; - this.languages = options.languages; - this.metadata = options.metadata; - this.sourceTag = options.sourceTag; - this.textAsHtml = options.textAsHtml; - this.links = options.links; - this.emphasizedTexts = options.emphasizedTexts; - this.provenance = options.provenance; - } -} - -/** - * Options for constructing an {@link ImageElement}. - * - * Narrows {@link ElementOptions.type | type} to {@link MediaType} and - * adds fields for carrying image data in various forms. - */ -export interface ImageElementOptions extends ElementOptions { - readonly type: MediaType; - /** Base64-encoded image content. */ - readonly imageBase64?: string; - /** MIME type of the image (e.g. `"image/png"`). */ - readonly imageMimeType?: string; - /** Remote URL where the image can be fetched. */ - readonly imageUrl?: string; - /** Local filesystem path to the image file. */ - readonly imagePath?: string; -} - -/** - * An element representing an image extracted from a document. - * - * Image data may be provided in one or more forms — inline base64, - * a remote URL, or a local file path. Use `instanceof ImageElement` - * for runtime type narrowing. - */ -export class ImageElement extends Element { - /** Base64-encoded image content. */ - readonly imageBase64?: string | undefined; - /** MIME type of the image (e.g. `"image/png"`). */ - readonly imageMimeType?: string | undefined; - /** Remote URL where the image can be fetched. */ - readonly imageUrl?: string | undefined; - /** Local filesystem path to the image file. */ - readonly imagePath?: string | undefined; - - constructor(options: ImageElementOptions) { - super(options); - this.imageBase64 = options.imageBase64; - this.imageMimeType = options.imageMimeType; - this.imageUrl = options.imageUrl; - this.imagePath = options.imagePath; - } -} - -/** - * Options for constructing a {@link TableElement}. - * - * Narrows {@link ElementOptions.type | type} to {@link TableType} and - * adds structured cell data. - */ -export interface TableElementOptions extends ElementOptions { - readonly type: TableType; - /** Structured cell data for the table. */ - readonly cells?: readonly TableCellData[]; -} - -/** - * An element representing a table extracted from a document. - * - * Structured cell data is in {@link cells}. The inherited - * {@link Element.textAsHtml | textAsHtml} field can carry the - * original `<table>` markup for lossless round-tripping. - */ -export class TableElement extends Element { - /** Structured cell data for the table. */ - readonly cells?: readonly TableCellData[] | undefined; - - constructor(options: TableElementOptions) { - super(options); - this.cells = options.cells; - } -} - -/** - * Options for constructing a {@link FormElement}. - * - * Narrows {@link ElementOptions.type | type} to {@link FormType} and - * adds checkbox / form-field state. - */ -export interface FormElementOptions extends ElementOptions { - readonly type: FormType; - /** Checkbox checked state. */ - readonly checked?: boolean; - /** Scalar form-field value. */ - readonly value?: string; - /** Structured key-value pairs extracted from a form. */ - readonly keyValuePairs?: readonly FormKeyValuePair[]; -} - -/** - * An element representing a form field or checkbox. 
- * - * Simple checkboxes use {@link checked}; richer forms use - * {@link keyValuePairs} for structured key-value extraction. - */ -export class FormElement extends Element { - /** Checkbox checked state. */ - readonly checked?: boolean | undefined; - /** Scalar form-field value. */ - readonly value?: string | undefined; - /** Structured key-value pairs extracted from a form. */ - readonly keyValuePairs?: readonly FormKeyValuePair[] | undefined; - - constructor(options: FormElementOptions) { - super(options); - this.checked = options.checked; - this.value = options.value; - this.keyValuePairs = options.keyValuePairs; - } -} - -/** - * Options for constructing an {@link EmailElement}. - * - * Narrows {@link ElementOptions.type | type} to {@link EmailType} and - * adds standard email envelope fields. - */ -export interface EmailElementOptions extends ElementOptions { - readonly type: EmailType; - /** Sender address(es). */ - readonly sentFrom?: readonly string[]; - /** Primary recipient address(es). */ - readonly sentTo?: readonly string[]; - /** CC recipient address(es). */ - readonly ccRecipient?: readonly string[]; - /** BCC recipient address(es). */ - readonly bccRecipient?: readonly string[]; - /** Email subject line. */ - readonly subject?: string; - /** Email signature block. */ - readonly signature?: string; - /** RFC 2822 Message-ID header value. */ - readonly emailMessageId?: string; -} - -/** - * An element representing an email message. - * - * Carries standard envelope fields (from, to, cc, bcc, subject) plus - * optional signature and message-id for threading. - */ -export class EmailElement extends Element { - /** Sender address(es). */ - readonly sentFrom?: readonly string[] | undefined; - /** Primary recipient address(es). */ - readonly sentTo?: readonly string[] | undefined; - /** CC recipient address(es). */ - readonly ccRecipient?: readonly string[] | undefined; - /** BCC recipient address(es). */ - readonly bccRecipient?: readonly string[] | undefined; - /** Email subject line. */ - readonly subject?: string | undefined; - /** Email signature block. */ - readonly signature?: string | undefined; - /** RFC 2822 Message-ID header value. */ - readonly emailMessageId?: string | undefined; - - constructor(options: EmailElementOptions) { - super(options); - this.sentFrom = options.sentFrom; - this.sentTo = options.sentTo; - this.ccRecipient = options.ccRecipient; - this.bccRecipient = options.bccRecipient; - this.subject = options.subject; - this.signature = options.signature; - this.emailMessageId = options.emailMessageId; - } -} - -/** - * Options for constructing a {@link CompositeElement}. - * - * Requires the original pre-chunking elements that were merged to - * form this composite. - */ -export interface CompositeElementOptions extends ElementOptions { - /** The original elements that were merged during chunking. */ - readonly origElements: readonly Element[]; -} - -/** - * A composite element formed by merging multiple elements during chunking. - * - * Preserves the original pre-chunking elements in {@link origElements} - * so downstream consumers can access fine-grained structure if needed. - */ -export class CompositeElement extends Element { - /** The original elements that were merged during chunking. 
*/ - readonly origElements: readonly Element[]; - - constructor(options: CompositeElementOptions) { - super(options); - this.origElements = options.origElements; - } -} diff --git a/packages/nvisy-core/src/documents/index.ts b/packages/nvisy-core/src/documents/index.ts deleted file mode 100644 index bbf56e6..0000000 --- a/packages/nvisy-core/src/documents/index.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * @module documents - * - * Element ontology, coordinate types, and element class - * for structured document representations. - * - * @example - * ```ts - * import { - * CoordinateSystem, - * ElementType, - * TextType, - * categoryOf, - * } from "@nvisy/core"; - * - * // Use the const object for type-safe element type checks - * if (el.type === ElementType.Title) { … } - * - * // Look up which category an element belongs to - * categoryOf("title"); // => "text" - * - * // Convert coordinates between pixel and point space - * const px = CoordinateSystem.pixel(1920, 1080); - * const pt = CoordinateSystem.point(612, 792); - * const result = px.convertTo(pt, { x: 960, y: 540 }); - * ``` - */ - -export type { - ElementCoordinates, - Orientation, - Point, -} from "./coordinates.js"; -export { CoordinateSystem, Orientations } from "./coordinates.js"; -export type { - CompositeElementOptions, - ElementOptions, - ElementProvenance, - EmailElementOptions, - EmphasizedText, - FormElementOptions, - FormKeyValuePair, - ImageElementOptions, - Link, - TableCellData, - TableElementOptions, -} from "./elements.js"; -export { - CompositeElement, - Element, - EmailElement, - FormElement, - ImageElement, - TableElement, -} from "./elements.js"; -export type { ElementCategory } from "./ontology.js"; -export { - CodeType, - categoryOf, - ElementType, - EmailType, - FormType, - LayoutType, - MathType, - MediaType, - ontology, - TableType, - TextType, -} from "./ontology.js"; diff --git a/packages/nvisy-core/src/documents/ontology.test.ts b/packages/nvisy-core/src/documents/ontology.test.ts deleted file mode 100644 index 6170722..0000000 --- a/packages/nvisy-core/src/documents/ontology.test.ts +++ /dev/null @@ -1,137 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { - CodeType, - categoryOf, - ElementType, - EmailType, - FormType, - LayoutType, - MathType, - MediaType, - ontology, - TableType, - TextType, -} from "./ontology.js"; - -describe("per-category const objects", () => { - it("TextType has 8 entries", () => { - expect(Object.values(TextType)).toHaveLength(8); - expect(TextType.Title).toBe("title"); - expect(TextType.NarrativeText).toBe("narrative-text"); - expect(TextType.ListItem).toBe("list-item"); - expect(TextType.Header).toBe("header"); - expect(TextType.Footer).toBe("footer"); - expect(TextType.FigureCaption).toBe("figure-caption"); - expect(TextType.Address).toBe("address"); - expect(TextType.UncategorizedText).toBe("uncategorized-text"); - }); - - it("TableType has 1 entry", () => { - expect(Object.values(TableType)).toEqual(["table"]); - }); - - it("MediaType has 1 entry", () => { - expect(Object.values(MediaType)).toEqual(["image"]); - }); - - it("CodeType has 1 entry", () => { - expect(Object.values(CodeType)).toEqual(["code-snippet"]); - }); - - it("MathType has 1 entry", () => { - expect(Object.values(MathType)).toEqual(["formula"]); - }); - - it("FormType has 2 entries", () => { - expect(Object.values(FormType)).toHaveLength(2); - expect(FormType.CheckBox).toBe("checkbox"); - expect(FormType.FormKeysValues).toBe("form-keys-values"); - }); - - it("LayoutType has 2 entries", () 
=> { - expect(Object.values(LayoutType)).toHaveLength(2); - expect(LayoutType.PageBreak).toBe("page-break"); - expect(LayoutType.PageNumber).toBe("page-number"); - }); - - it("EmailType has 1 entry", () => { - expect(Object.values(EmailType)).toEqual(["email-message"]); - }); -}); - -describe("ElementType", () => { - it("has all 17 values", () => { - const allValues = Object.values(ElementType); - expect(allValues).toHaveLength(17); - }); - - it("includes values from every category", () => { - expect(ElementType.Title).toBe("title"); - expect(ElementType.Table).toBe("table"); - expect(ElementType.Image).toBe("image"); - expect(ElementType.CodeSnippet).toBe("code-snippet"); - expect(ElementType.Formula).toBe("formula"); - expect(ElementType.CheckBox).toBe("checkbox"); - expect(ElementType.PageBreak).toBe("page-break"); - }); -}); - -describe("ontology", () => { - it("maps every category to a non-empty array", () => { - for (const [category, types] of Object.entries(ontology)) { - expect(types.length, `${category} should have types`).toBeGreaterThan(0); - } - }); - - it("has 8 categories", () => { - expect(Object.keys(ontology)).toHaveLength(8); - }); - - it("has no duplicate element types across categories", () => { - const seen = new Map<string, string>(); - for (const [category, types] of Object.entries(ontology)) { - for (const t of types) { - expect( - seen.has(t), - `"${t}" appears in both "${seen.get(t)}" and "${category}"`, - ).toBe(false); - seen.set(t, category); - } - } - }); - - it("total entries across all categories equals 17", () => { - const total = Object.values(ontology).reduce( - (sum, arr) => sum + arr.length, - 0, - ); - expect(total).toBe(17); - }); -}); - -describe("categoryOf", () => { - it("returns the correct category for known types", () => { - expect(categoryOf(ElementType.Title)).toBe("text"); - expect(categoryOf(ElementType.NarrativeText)).toBe("text"); - expect(categoryOf(ElementType.ListItem)).toBe("text"); - expect(categoryOf(ElementType.Header)).toBe("text"); - expect(categoryOf(ElementType.Footer)).toBe("text"); - expect(categoryOf(ElementType.FigureCaption)).toBe("text"); - expect(categoryOf(ElementType.Address)).toBe("text"); - expect(categoryOf(ElementType.UncategorizedText)).toBe("text"); - expect(categoryOf(ElementType.Table)).toBe("table"); - expect(categoryOf(ElementType.Image)).toBe("media"); - expect(categoryOf(ElementType.CodeSnippet)).toBe("code"); - expect(categoryOf(ElementType.Formula)).toBe("math"); - expect(categoryOf(ElementType.CheckBox)).toBe("form"); - expect(categoryOf(ElementType.FormKeysValues)).toBe("form"); - expect(categoryOf(ElementType.PageBreak)).toBe("layout"); - expect(categoryOf(ElementType.PageNumber)).toBe("layout"); - expect(categoryOf(ElementType.EmailMessage)).toBe("email"); - }); - - it("returns undefined for unknown types", () => { - expect(categoryOf("unknown")).toBeUndefined(); - expect(categoryOf("")).toBeUndefined(); - }); -}); diff --git a/packages/nvisy-core/src/documents/ontology.ts b/packages/nvisy-core/src/documents/ontology.ts deleted file mode 100644 index 0d168d6..0000000 --- a/packages/nvisy-core/src/documents/ontology.ts +++ /dev/null @@ -1,125 +0,0 @@ -/** - * Element ontology — hierarchical categories for document elements. - * - * Every concrete {@link ElementType} belongs to exactly one - * {@link ElementCategory}. Categories let downstream consumers handle - * broad groups of elements (e.g. all text, all media) without matching - * individual types. 
- * - * Per-category const objects ({@link TextType}, {@link TableType}, etc.) - * are the single source of truth. The master {@link ElementType} is - * derived by spreading all category objects. - * - * @example - * ```ts - * import { categoryOf, ElementType, TextType } from "@nvisy/core"; - * - * categoryOf("title"); // => "text" - * categoryOf("table"); // => "table" - * ElementType.Title; // => "title" - * TextType.NarrativeText; // => "narrative-text" - * ``` - * - * @module - */ - -export const TextType = { - Title: "title", - NarrativeText: "narrative-text", - ListItem: "list-item", - Header: "header", - Footer: "footer", - FigureCaption: "figure-caption", - Address: "address", - UncategorizedText: "uncategorized-text", -} as const; -export type TextType = (typeof TextType)[keyof typeof TextType]; - -export const TableType = { Table: "table" } as const; -export type TableType = (typeof TableType)[keyof typeof TableType]; - -export const MediaType = { Image: "image" } as const; -export type MediaType = (typeof MediaType)[keyof typeof MediaType]; - -export const CodeType = { CodeSnippet: "code-snippet" } as const; -export type CodeType = (typeof CodeType)[keyof typeof CodeType]; - -export const MathType = { Formula: "formula" } as const; -export type MathType = (typeof MathType)[keyof typeof MathType]; - -export const FormType = { - CheckBox: "checkbox", - FormKeysValues: "form-keys-values", -} as const; -export type FormType = (typeof FormType)[keyof typeof FormType]; - -export const LayoutType = { - PageBreak: "page-break", - PageNumber: "page-number", -} as const; -export type LayoutType = (typeof LayoutType)[keyof typeof LayoutType]; - -export const EmailType = { EmailMessage: "email-message" } as const; -export type EmailType = (typeof EmailType)[keyof typeof EmailType]; - -/** Union of all per-category element type values. */ -export const ElementType = { - ...TextType, - ...TableType, - ...MediaType, - ...CodeType, - ...MathType, - ...FormType, - ...LayoutType, - ...EmailType, -} as const; -export type ElementType = (typeof ElementType)[keyof typeof ElementType]; - -export type ElementCategory = - | "text" - | "table" - | "media" - | "code" - | "math" - | "form" - | "layout" - | "email"; - -/** - * Map from {@link ElementCategory} to the element types it contains. - * - * This is the single source of truth for which types belong to which - * category. Use {@link categoryOf} for reverse lookups. - */ -export const ontology: Record<ElementCategory, readonly ElementType[]> = { - text: Object.values(TextType), - table: Object.values(TableType), - media: Object.values(MediaType), - code: Object.values(CodeType), - math: Object.values(MathType), - form: Object.values(FormType), - layout: Object.values(LayoutType), - email: Object.values(EmailType), -}; - -const reverseMap = new Map<string, ElementCategory>(); -for (const [category, types] of Object.entries(ontology)) { - for (const t of types) { - reverseMap.set(t, category as ElementCategory); - } -} - -/** - * Return the {@link ElementCategory} for a given element type string. - * - * @returns The category, or `undefined` for unrecognised types. 
- * - * @example - * ```ts - * categoryOf("title"); // => "text" - * categoryOf("unknown"); // => undefined - * ``` - */ -export function categoryOf(type: string): ElementCategory | undefined { - return reverseMap.get(type); -} diff --git a/packages/nvisy-core/src/errors/cancellation.ts b/packages/nvisy-core/src/errors/cancellation.ts deleted file mode 100644 index 9fd3c72..0000000 --- a/packages/nvisy-core/src/errors/cancellation.ts +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Cancellation error for intentionally aborted operations. - * - * @module - */ - -import { RuntimeError, type RuntimeErrorOptions } from "./runtime.js"; - -/** - * The operation was explicitly cancelled. - * - * Cancellation errors are not retryable by default since the - * cancellation was intentional. - * - * @example - * ```ts - * if (signal.aborted) { - * throw new CancellationError("Operation cancelled by user"); - * } - * ``` - */ -export class CancellationError extends RuntimeError { - constructor(message = "Operation cancelled", options?: RuntimeErrorOptions) { - super(message, { retryable: false, ...options }); - } -} diff --git a/packages/nvisy-core/src/errors/connection.ts b/packages/nvisy-core/src/errors/connection.ts deleted file mode 100644 index 2c449e5..0000000 --- a/packages/nvisy-core/src/errors/connection.ts +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Connection error for unreachable external services. - * - * @module - */ - -import { type ErrorContext, RuntimeError } from "./runtime.js"; - -/** - * Could not reach an external service, storage backend, or database. - * - * Also covers missing or unregistered connections. Connection errors - * are retryable by default since network issues are often transient. - * - * @example - * ```ts - * throw new ConnectionError("Database connection timeout", { - * source: "postgres", - * details: { host: "db.example.com", port: 5432 }, - * }); - * - * // Wrap provider connection failures - * catch (error) { - * throw ConnectionError.wrap(error, { source: "postgres" }); - * } - * ``` - */ -export class ConnectionError extends RuntimeError { - /** - * Wrap an unknown error as a ConnectionError. - * - * If the error is already a ConnectionError, returns it unchanged. - * Otherwise, creates a new ConnectionError with the original as cause. - * - * @param error - The error to wrap. - * @param context - Optional context (source, details). - */ - static override wrap( - error: unknown, - context?: ErrorContext, - ): ConnectionError { - if (error instanceof ConnectionError) return error; - const message = error instanceof Error ? error.message : String(error); - const cause = error instanceof Error ? error : undefined; - return new ConnectionError(`Connection failed: ${message}`, { - ...context, - ...(cause && { cause }), - }); - } - - /** - * Create a connection error for missing connections. - * - * @param connectionId - The connection ID that wasn't found. - * @param source - The component that raised the error. - */ - static notFound(connectionId: string, source?: string): ConnectionError { - return new ConnectionError(`Connection not found: ${connectionId}`, { - ...(source && { source }), - retryable: false, - }); - } -} diff --git a/packages/nvisy-core/src/errors/index.ts b/packages/nvisy-core/src/errors/index.ts deleted file mode 100644 index 38daf1a..0000000 --- a/packages/nvisy-core/src/errors/index.ts +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Structured error hierarchy for the Nvisy runtime. 
- * - * All errors extend {@link RuntimeError} which provides: - * - `source` — component that raised the error - * - `details` — machine-readable context - * - `retryable` — whether the operation can be retried - * - * Default retryability: - * - {@link RuntimeError} — `true` (transient failures) - * - {@link ValidationError} — `false` (bad input won't fix itself) - * - {@link ConnectionError} — `true` (network issues are transient) - * - {@link TimeoutError} — `true` (timeouts are transient) - * - {@link CancellationError} — `false` (intentional cancellation) - * - * @module - */ - -export { CancellationError } from "./cancellation.js"; -export { ConnectionError } from "./connection.js"; -export type { ErrorContext, RuntimeErrorOptions } from "./runtime.js"; -export { RuntimeError } from "./runtime.js"; -export { TimeoutError } from "./timeout.js"; -export { ValidationError } from "./validation.js"; diff --git a/packages/nvisy-core/src/errors/runtime.ts b/packages/nvisy-core/src/errors/runtime.ts deleted file mode 100644 index 2b7cea8..0000000 --- a/packages/nvisy-core/src/errors/runtime.ts +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Base error class and shared error interfaces for the Nvisy runtime. - * - * @module - */ - -/** Structured context attached to runtime errors. */ -export interface ErrorContext { - /** Which component or subsystem produced the error. */ - readonly source?: string; - /** Machine-readable details about the failure (IDs, paths, limits, etc.). */ - readonly details?: Record<string, unknown>; -} - -/** Options for constructing a RuntimeError. */ -export interface RuntimeErrorOptions extends ErrorContext { - /** Whether the caller may safely retry the operation. */ - readonly retryable?: boolean; - /** The underlying error that caused this one. */ - readonly cause?: Error; -} - -/** - * Base class for all Nvisy runtime errors. - * - * Every error carries structured context for logging and debugging. - * Extends the built-in `Error` so `instanceof RuntimeError` works everywhere. - * - * By default, runtime errors are retryable. Subclasses like `ValidationError` - * override this to `false` since validation failures won't succeed on retry. - * - * @example - * ```ts - * throw new RuntimeError("Operation failed", { - * source: "engine", - * details: { nodeId: "abc" }, - * }); - * - * // Wrap unknown errors - * catch (error) { - * throw RuntimeError.wrap(error, { source: "provider" }); - * } - * ``` - */ -export class RuntimeError extends Error { - readonly #source: string | undefined; - readonly #details: Record<string, unknown> | undefined; - readonly #retryable: boolean; - - constructor(message: string, options?: RuntimeErrorOptions) { - super(message, options?.cause ? { cause: options.cause } : undefined); - this.name = this.constructor.name; - this.#source = options?.source; - this.#details = options?.details; - this.#retryable = options?.retryable ?? true; - } - - /** Which component or subsystem produced the error. */ - get source(): string | undefined { - return this.#source; - } - - /** Machine-readable details about the failure (IDs, paths, limits, etc.). */ - get details(): Record<string, unknown> | undefined { - return this.#details; - } - - /** Whether the caller may safely retry the operation. */ - get retryable(): boolean { - return this.#retryable; - } - - /** - * Wrap an unknown error as a RuntimeError. - * - * If the error is already a RuntimeError, returns it unchanged. - * Otherwise, creates a new RuntimeError with the original as cause. 
- */ - static wrap(error: unknown, context?: ErrorContext): RuntimeError { - if (error instanceof RuntimeError) return error; - const message = error instanceof Error ? error.message : String(error); - const cause = error instanceof Error ? error : undefined; - return new RuntimeError(message, { - ...context, - ...(cause && { cause }), - }); - } -} diff --git a/packages/nvisy-core/src/errors/timeout.ts b/packages/nvisy-core/src/errors/timeout.ts deleted file mode 100644 index 33f0012..0000000 --- a/packages/nvisy-core/src/errors/timeout.ts +++ /dev/null @@ -1,21 +0,0 @@ -/** - * Timeout error for operations that exceed their time limit. - * - * @module - */ - -import type { RuntimeErrorOptions } from "./runtime.js"; -import { RuntimeError } from "./runtime.js"; - -/** - * Thrown when an operation exceeds its time limit. - * - * Defaults to `retryable: true` because timeouts are typically transient. - * The engine handles retry timing via its backoff policies — this class - * does not carry a `retryAfterMs` field. - */ -export class TimeoutError extends RuntimeError { - constructor(message: string, options?: RuntimeErrorOptions) { - super(message, { retryable: true, ...options }); - } -} diff --git a/packages/nvisy-core/src/errors/validation.ts b/packages/nvisy-core/src/errors/validation.ts deleted file mode 100644 index 9bd3446..0000000 --- a/packages/nvisy-core/src/errors/validation.ts +++ /dev/null @@ -1,57 +0,0 @@ -/** - * Validation error for input that fails schema or business rules. - * - * @module - */ - -import { RuntimeError, type RuntimeErrorOptions } from "./runtime.js"; - -/** - * Input did not pass schema or business rules. - * - * Also covers invalid workflow and pipeline definitions. - * Validation errors are not retryable by default since the same - * input will fail validation again. - * - * @example - * ```ts - * throw new ValidationError("Invalid graph definition", { source: "compiler" }); - * - * // Use factory for common "not found" pattern - * throw ValidationError.notFound("myAction", "action", "registry"); - * ``` - */ -export class ValidationError extends RuntimeError { - constructor(message: string, options?: RuntimeErrorOptions) { - super(message, { retryable: false, ...options }); - } - - /** - * Create a "not found" validation error. - * - * @param name - The name that wasn't found. - * @param type - The type of thing (e.g., "action", "provider", "stream"). - * @param source - The component that raised the error. - */ - static notFound( - name: string, - type: string, - source?: string, - ): ValidationError { - return new ValidationError(`Unknown ${type}: ${name}`, { - ...(source && { source }), - }); - } - - /** - * Create a validation error for parse failures. - * - * @param message - The parse error message. - * @param source - The component that raised the error. - */ - static parse(message: string, source?: string): ValidationError { - return new ValidationError(`Parse error: ${message}`, { - ...(source && { source }), - }); - } -} diff --git a/packages/nvisy-core/src/index.ts b/packages/nvisy-core/src/index.ts deleted file mode 100644 index 10bcd05..0000000 --- a/packages/nvisy-core/src/index.ts +++ /dev/null @@ -1,98 +0,0 @@ -/** - * @module @nvisy/core - * - * Public API surface for the nvisy core library. 
- */ - -export type { ActionInstance } from "./action.js"; -export { Action } from "./action.js"; -export type { - BlobOptions, - ChunkOptions, - DatatypeDescriptor, - DocumentOptions, - Filetype, -} from "./datatypes/index.js"; -export { - Blob, - Chunk, - Data, - Datatype, - Document, - Embedding, -} from "./datatypes/index.js"; -export type { - CompositeElementOptions, - ElementOptions, - ElementProvenance, - EmailElementOptions, - EmphasizedText, - FormElementOptions, - FormKeyValuePair, - ImageElementOptions, - Link, - TableCellData, - TableElementOptions, -} from "./documents/elements.js"; -export { - CompositeElement, - Element, - EmailElement, - FormElement, - ImageElement, - TableElement, -} from "./documents/elements.js"; -export type { - ElementCategory, - ElementCoordinates, - Orientation, - Point, -} from "./documents/index.js"; -export { - CodeType, - CoordinateSystem, - categoryOf, - ElementType, - EmailType, - FormType, - LayoutType, - MathType, - MediaType, - Orientations, - ontology, - TableType, - TextType, -} from "./documents/index.js"; -export type { ErrorContext } from "./errors/index.js"; -export { - CancellationError, - ConnectionError, - RuntimeError, - TimeoutError, - ValidationError, -} from "./errors/index.js"; -export type { LoaderConfig, LoaderInstance, LoadFn } from "./loader.js"; -export { Loader } from "./loader.js"; -export type { - AnyActionInstance, - AnyLoaderInstance, - AnyProviderFactory, - AnyStreamSource, - AnyStreamTarget, - PluginInstance, -} from "./plugin.js"; -export { Plugin } from "./plugin.js"; -export type { - ConnectedInstance, - ProviderFactory, - ProviderInstance, -} from "./provider.js"; -export { Provider } from "./provider.js"; -export type { - Resumable, - StreamSource, - StreamTarget, - WriterFn, -} from "./stream.js"; -export { Stream } from "./stream.js"; -export type { ClassRef, JsonValue, Metadata } from "./types.js"; diff --git a/packages/nvisy-core/src/loader.ts b/packages/nvisy-core/src/loader.ts deleted file mode 100644 index 330729d..0000000 --- a/packages/nvisy-core/src/loader.ts +++ /dev/null @@ -1,109 +0,0 @@ -/** - * Loaders that transform {@link Blob | Blobs} into {@link Document | Documents}. - * - * Each loader declares the file extensions and MIME types it handles, - * so the engine can automatically select the right loader for a given - * blob. Use {@link Loader.define} to create new loaders. - * - * @module - */ - -import type { z } from "zod"; -import type { Blob } from "./datatypes/blob.js"; -import type { Document } from "./datatypes/document.js"; - -/** - * Function that transforms a Blob into one or more Documents. - * - * @template TParam - Configuration parameters for the loader. - */ -export type LoadFn<TParam> = ( - blob: Blob, - params: TParam, -) => AsyncIterable<Document>; - -/** - * Configuration for creating a loader. - * - * @template TParam - Configuration parameters for the loader. - */ -export interface LoaderConfig<TParam> { - /** File extensions this loader handles (e.g. [".pdf"], [".md", ".markdown"]). */ - readonly extensions: string[]; - /** MIME types this loader handles (e.g. ["application/pdf"], ["text/plain"]). */ - readonly contentTypes: string[]; - /** Zod schema for validating loader parameters. */ - readonly params: z.ZodType<TParam>; - /** The load function that transforms a Blob into Documents. */ - readonly load: LoadFn<TParam>; -} - -/** - * A registered loader instance that transforms Blobs into Documents. 
- * - * Loaders are specialized transforms that convert binary objects - * (files from object storage) into structured Document instances - * that can be processed by the pipeline. - * - * @template TParam - Configuration parameters for the loader. - */ -export interface LoaderInstance<TParam = unknown> { - /** Unique identifier for this loader (e.g. "pdf", "docx"). */ - readonly id: string; - /** File extensions this loader handles. */ - readonly extensions: readonly string[]; - /** MIME types this loader handles. */ - readonly contentTypes: readonly string[]; - /** Zod schema for validating loader parameters. */ - readonly schema: z.ZodType<TParam>; - /** Transform a Blob into one or more Documents. */ - load(blob: Blob, params: TParam): AsyncIterable<Document>; -} - -class LoaderImpl<TParam> implements LoaderInstance<TParam> { - readonly id: string; - readonly extensions: readonly string[]; - readonly contentTypes: readonly string[]; - readonly schema: z.ZodType<TParam>; - readonly #load: LoadFn<TParam>; - - constructor(config: { - id: string; - extensions: string[]; - contentTypes: string[]; - schema: z.ZodType<TParam>; - load: LoadFn<TParam>; - }) { - this.id = config.id; - this.extensions = config.extensions; - this.contentTypes = config.contentTypes; - this.schema = config.schema; - this.#load = config.load; - } - - load(blob: Blob, params: TParam): AsyncIterable<Document> { - return this.#load(blob, params); - } -} - -/** Factory for creating loader instances. */ -export const Loader = { - /** - * Create a loader that transforms Blobs into Documents. - * - * @param id - Unique identifier for the loader (e.g. "pdf", "docx"). - * @param config - Loader configuration including match criteria and load function. - */ - define<TParam>( - id: string, - config: LoaderConfig<TParam>, - ): LoaderInstance<TParam> { - return new LoaderImpl({ - id, - extensions: config.extensions, - contentTypes: config.contentTypes, - schema: config.params, - load: config.load, - }); - }, -} as const; diff --git a/packages/nvisy-core/src/plugin.ts b/packages/nvisy-core/src/plugin.ts deleted file mode 100644 index 5f7cb11..0000000 --- a/packages/nvisy-core/src/plugin.ts +++ /dev/null @@ -1,115 +0,0 @@ -/** - * Plugin system for bundling providers, streams, actions, loaders, - * and custom datatypes under a single namespace. - * - * Plugins are the unit of registration with the engine. Use - * {@link Plugin.define} to create a new plugin, then chain - * `.withProviders()`, `.withActions()`, etc. to populate it. 
- * - * @module - */ - -import type { ActionInstance } from "./action.js"; -import type { DatatypeDescriptor } from "./datatypes/index.js"; -import type { LoaderInstance } from "./loader.js"; -import type { ProviderFactory } from "./provider.js"; -import type { StreamSource, StreamTarget } from "./stream.js"; - -// biome-ignore lint/suspicious/noExplicitAny: existential type alias -export type AnyProviderFactory = ProviderFactory<any, any>; - -// biome-ignore lint/suspicious/noExplicitAny: existential type alias -export type AnyActionInstance = ActionInstance<any, any, any, any>; - -// biome-ignore lint/suspicious/noExplicitAny: existential type alias -export type AnyLoaderInstance = LoaderInstance<any>; - -// biome-ignore lint/suspicious/noExplicitAny: existential type alias -export type AnyStreamSource = StreamSource<any, any, any, any>; - -// biome-ignore lint/suspicious/noExplicitAny: existential type alias -export type AnyStreamTarget = StreamTarget<any, any, any>; - -/** - * A plugin bundles providers, streams, actions, and loaders under a namespace. - * - * Plugins are the unit of registration with the engine. All entries - * are namespaced as `"pluginId/name"` to avoid collisions. - */ -export interface PluginInstance { - /** Unique identifier for the plugin (e.g. "sql", "openai"). */ - readonly id: string; - /** Provider factories keyed by their ID. */ - readonly providers: Readonly<Record<string, AnyProviderFactory>>; - /** Stream sources and targets keyed by their ID. */ - readonly streams: Readonly<Record<string, AnyStreamSource | AnyStreamTarget>>; - /** Actions keyed by their ID. */ - readonly actions: Readonly<Record<string, AnyActionInstance>>; - /** Loaders keyed by their ID. */ - readonly loaders: Readonly<Record<string, AnyLoaderInstance>>; - /** Custom data types keyed by their ID. */ - readonly datatypes: Readonly<Record<string, DatatypeDescriptor>>; -} - -class PluginBuilder implements PluginInstance { - readonly id: string; - readonly providers: Readonly<Record<string, AnyProviderFactory>> = {}; - readonly streams: Readonly< - Record<string, AnyStreamSource | AnyStreamTarget> - > = {}; - readonly actions: Readonly<Record<string, AnyActionInstance>> = {}; - readonly loaders: Readonly<Record<string, AnyLoaderInstance>> = {}; - readonly datatypes: Readonly<Record<string, DatatypeDescriptor>> = {}; - - constructor(id: string) { - this.id = id; - } - - /** Add providers to this plugin. */ - withProviders(...providers: AnyProviderFactory[]): this { - const record = { ...this.providers }; - for (const p of providers) record[p.id] = p; - (this as { providers: typeof record }).providers = record; - return this; - } - - /** Add streams to this plugin. */ - withStreams(...streams: (AnyStreamSource | AnyStreamTarget)[]): this { - const record = { ...this.streams }; - for (const s of streams) record[s.id] = s; - (this as { streams: typeof record }).streams = record; - return this; - } - - /** Add actions to this plugin. */ - withActions(...actions: AnyActionInstance[]): this { - const record = { ...this.actions }; - for (const a of actions) record[a.id] = a; - (this as { actions: typeof record }).actions = record; - return this; - } - - /** Add loaders to this plugin. */ - withLoaders(...loaders: AnyLoaderInstance[]): this { - const record = { ...this.loaders }; - for (const l of loaders) record[l.id] = l; - (this as { loaders: typeof record }).loaders = record; - return this; - } - - /** Add custom data types to this plugin. 
*/ - withDatatypes(...datatypes: DatatypeDescriptor[]): this { - const record = { ...this.datatypes }; - for (const d of datatypes) record[d.id] = d; - (this as { datatypes: typeof record }).datatypes = record; - return this; - } -} - -/** Factory for creating plugin definitions. */ -export const Plugin = { - /** Create a new plugin with the given ID. */ - define(id: string): PluginBuilder { - return new PluginBuilder(id); - }, -} as const; diff --git a/packages/nvisy-core/src/provider.ts b/packages/nvisy-core/src/provider.ts deleted file mode 100644 index d336e80..0000000 --- a/packages/nvisy-core/src/provider.ts +++ /dev/null @@ -1,197 +0,0 @@ -/** - * Provider lifecycle management for external service connections. - * - * Providers abstract credentials, connection setup, and teardown - * for databases, APIs, and other external systems. Use - * {@link Provider.withAuthentication} or - * {@link Provider.withoutAuthentication} to define new providers. - * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import { z } from "zod"; -import { ConnectionError } from "./errors/index.js"; - -const logger = getLogger(["nvisy", "provider"]); - -/** - * Configuration for creating a provider that requires credentials. - * - * @template TCred - Credential type required for authentication. - * @template TClient - Client type returned after successful connection. - */ -export interface AuthenticatedProviderConfig<TCred, TClient> { - /** Zod schema for validating credentials. */ - readonly credentials: z.ZodType<TCred>; - /** Verify connectivity without establishing a persistent connection. */ - readonly verify?: (credentials: TCred) => Promise<void>; - /** Factory function that establishes a connection using credentials. */ - readonly connect: (credentials: TCred) => Promise<ProviderInstance<TClient>>; -} - -/** - * Configuration for creating a provider that does not require credentials. - * - * @template TClient - Client type returned after successful connection. - */ -export interface UnauthenticatedProviderConfig<TClient> { - /** Factory function that establishes a connection. */ - readonly connect: () => Promise<ProviderInstance<TClient>>; -} - -/** - * Raw provider instance returned from a connect function. - * - * This is the internal representation before wrapping with lifecycle management. - * - * @template TClient - Client type provided by this instance. - */ -export interface ProviderInstance<TClient = void> { - /** The connected client ready for use. */ - readonly client: TClient; - /** Optional cleanup function called when disconnecting. */ - disconnect?(): Promise<void>; -} - -/** - * A connected provider instance with lifecycle management. - * - * Wraps a raw {@link ProviderInstance} with idempotent disconnect handling - * and logging. - * - * @template TClient - Client type provided by this instance. - */ -export interface ConnectedInstance<TClient = void> { - /** Identifier of the provider that created this instance. */ - readonly id: string; - /** The connected client ready for use. */ - readonly client: TClient; - /** Disconnect and release resources (idempotent). */ - disconnect(): Promise<void>; -} - -/** - * Factory for creating provider connections. - * - * Providers manage the lifecycle of external clients (databases, APIs, etc.). - * Each provider defines a credential schema and a connect function that - * returns a managed {@link ConnectedInstance}. - * - * @template TCred - Credential type required for authentication. 
- * @template TClient - Client type returned after successful connection. - */ -export interface ProviderFactory<TCred = unknown, TClient = unknown> { - /** Unique identifier for this provider. */ - readonly id: string; - /** Zod schema for validating credentials. */ - readonly credentialSchema: z.ZodType<TCred>; - /** Verify connectivity without establishing a persistent connection. */ - verify(credentials: TCred): Promise<void>; - /** Create a new connection using the provided credentials. */ - connect(credentials: TCred): Promise<ConnectedInstance<TClient>>; -} - -const noop = async () => {}; - -class ConnectedInstanceImpl<TClient> implements ConnectedInstance<TClient> { - readonly id: string; - readonly client: TClient; - readonly #disconnect: () => Promise<void>; - #disconnected = false; - - constructor(id: string, instance: ProviderInstance<TClient>) { - this.id = id; - this.client = instance.client; - this.#disconnect = instance.disconnect ?? noop; - } - - async disconnect(): Promise<void> { - if (this.#disconnected) return; - this.#disconnected = true; - - try { - await this.#disconnect(); - logger.debug("Provider disconnected", { provider: this.id }); - } catch (error) { - logger.warn("Provider disconnect failed", { - provider: this.id, - error: String(error), - }); - throw error; - } - } -} - -class ProviderFactoryImpl<TCred, TClient> - implements ProviderFactory<TCred, TClient> -{ - readonly id: string; - readonly credentialSchema: z.ZodType<TCred>; - readonly #connect: (credentials: TCred) => Promise<ProviderInstance<TClient>>; - readonly #verify: (credentials: TCred) => Promise<void>; - - constructor( - id: string, - credentialSchema: z.ZodType<TCred>, - connect: (credentials: TCred) => Promise<ProviderInstance<TClient>>, - verify?: (credentials: TCred) => Promise<void>, - ) { - this.id = id; - this.credentialSchema = credentialSchema; - this.#connect = connect; - this.#verify = verify ?? noop; - } - - async verify(credentials: TCred): Promise<void> { - await this.#verify(credentials); - } - - async connect(credentials: TCred): Promise<ConnectedInstance<TClient>> { - try { - const instance = await this.#connect(credentials); - logger.debug("Provider connected", { provider: this.id }); - return new ConnectedInstanceImpl(this.id, instance); - } catch (error) { - logger.warn("Provider connection failed", { - provider: this.id, - error: error instanceof Error ? error.message : String(error), - }); - throw ConnectionError.wrap(error, { source: this.id }); - } - } -} - -/** Factory for creating provider definitions. */ -export const Provider = { - /** - * Create a provider that requires authentication credentials. - * - * @param id - Unique identifier for the provider. - * @param config - Provider configuration including credential schema and connect function. - */ - withAuthentication<TClient, TCred>( - id: string, - config: AuthenticatedProviderConfig<TCred, TClient>, - ): ProviderFactory<TCred, TClient> { - return new ProviderFactoryImpl( - id, - config.credentials, - config.connect, - config.verify, - ); - }, - - /** - * Create a provider that does not require authentication. - * - * @param id - Unique identifier for the provider. - * @param config - Provider configuration including connect function. 
- */ - withoutAuthentication<TClient>( - id: string, - config: UnauthenticatedProviderConfig<TClient>, - ): ProviderFactory<void, TClient> { - return new ProviderFactoryImpl(id, z.void(), () => config.connect()); - }, -}; diff --git a/packages/nvisy-core/src/stream.ts b/packages/nvisy-core/src/stream.ts deleted file mode 100644 index d9a079e..0000000 --- a/packages/nvisy-core/src/stream.ts +++ /dev/null @@ -1,270 +0,0 @@ -/** - * Stream sources and targets for reading from and writing to external systems. - * - * Sources are pipeline entry points that emit {@link Resumable} items - * for crash recovery. Targets are pipeline exit points that persist - * processed data. Use {@link Stream.createSource} and - * {@link Stream.createTarget} to define new endpoints. - * - * @module - */ - -import type { z } from "zod"; -import type { Data } from "./datatypes/index.js"; -import type { ClassRef } from "./types.js"; - -/** - * A data item paired with resumption context. - * - * Stream sources emit resumables so that the engine can persist - * context after each item, enabling crash recovery. - * - * @template TData - The data type being streamed. - * @template TCtx - Context type for resumption (e.g. cursor, offset). - */ -export interface Resumable<TData extends Data = Data, TCtx = void> { - /** The data item being streamed. */ - readonly data: TData; - /** Context for resuming from this point. */ - readonly context: TCtx; -} - -/** - * Function that reads data from an external system. - * - * @template TClient - Provider client type for connecting to the source. - * @template TData - Data type produced by the reader. - * @template TCtx - Context type for resumption (e.g. cursor, offset). - * @template TParam - Configuration parameters for the reader. - */ -export type ReaderFn<TClient, TData extends Data, TCtx, TParam> = ( - client: TClient, - ctx: TCtx, - params: TParam, -) => AsyncIterable<Resumable<TData, TCtx>>; - -/** - * Function that returns a writer for persisting data items. - * - * @template TClient - Provider client type for connecting to the target. - * @template TData - Data type consumed by the writer. - * @template TParam - Configuration parameters for the writer. - */ -export type WriterFn<TClient, TData extends Data, TParam> = ( - client: TClient, - params: TParam, -) => (item: TData) => Promise<void>; - -/** - * Configuration for creating a stream source. - * - * @template TClient - Provider client type for connecting to the source. - * @template TData - Data type produced by the source. - * @template TCtx - Context type for resumption. - * @template TParam - Configuration parameters for the source. - */ -export interface SourceConfig<TClient, TData extends Data, TCtx, TParam> { - /** Class reference for the data type produced. */ - readonly type: ClassRef<TData>; - /** Zod schema for validating and parsing resumption context. */ - readonly context: z.ZodType<TCtx>; - /** Zod schema for validating stream parameters. */ - readonly params: z.ZodType<TParam>; - /** The reader function that produces data items. */ - readonly reader: ReaderFn<TClient, TData, TCtx, TParam>; -} - -/** - * Configuration for creating a stream target. - * - * @template TClient - Provider client type for connecting to the target. - * @template TData - Data type consumed by the target. - * @template TParam - Configuration parameters for the target. - */ -export interface TargetConfig<TClient, TData extends Data, TParam> { - /** Class reference for the data type consumed. 
*/ - readonly type: ClassRef<TData>; - /** Zod schema for validating stream parameters. */ - readonly params: z.ZodType<TParam>; - /** The writer function that persists data items. */ - readonly writer: WriterFn<TClient, TData, TParam>; -} - -/** - * A stream source that reads data from an external system. - * - * Sources are the entry points of a pipeline, producing data items - * that flow through actions to targets. - * - * @template TClient - Provider client type for connecting to the source. - * @template TData - Data type produced by the source. - * @template TCtx - Context type for resumption. - * @template TParam - Configuration parameters for the source. - */ -export interface StreamSource< - TClient, - TData extends Data, - TCtx, - TParam = void, -> { - /** Discriminator for runtime type checking. */ - readonly kind: "source"; - /** Unique identifier for this stream source. */ - readonly id: string; - /** Class reference for the required provider client. */ - readonly clientClass: ClassRef<TClient>; - /** Class reference for the data type produced. */ - readonly dataClass: ClassRef<TData>; - /** Zod schema for validating and parsing resumption context. */ - readonly contextSchema: z.ZodType<TCtx>; - /** Zod schema for validating stream parameters. */ - readonly paramSchema: z.ZodType<TParam>; - /** Read data from the source, yielding resumable items. */ - read( - client: TClient, - ctx: TCtx, - params: TParam, - ): AsyncIterable<Resumable<TData, TCtx>>; -} - -/** - * A stream target that writes data to an external system. - * - * Targets are the exit points of a pipeline, persisting data items - * that have flowed from sources through actions. - * - * @template TClient - Provider client type for connecting to the target. - * @template TData - Data type consumed by the target. - * @template TParam - Configuration parameters for the target. - */ -export interface StreamTarget<TClient, TData extends Data, TParam = void> { - /** Discriminator for runtime type checking. */ - readonly kind: "target"; - /** Unique identifier for this stream target. */ - readonly id: string; - /** Class reference for the required provider client. */ - readonly clientClass: ClassRef<TClient>; - /** Class reference for the data type consumed. */ - readonly dataClass: ClassRef<TData>; - /** Zod schema for validating stream parameters. */ - readonly paramSchema: z.ZodType<TParam>; - /** Create a writer function for persisting items. 
*/ - write(client: TClient, params: TParam): (item: TData) => Promise<void>; -} - -class StreamSourceImpl<TClient, TData extends Data, TCtx, TParam> - implements StreamSource<TClient, TData, TCtx, TParam> -{ - readonly kind = "source" as const; - readonly id: string; - readonly clientClass: ClassRef<TClient>; - readonly dataClass: ClassRef<TData>; - readonly contextSchema: z.ZodType<TCtx>; - readonly paramSchema: z.ZodType<TParam>; - readonly #read: ReaderFn<TClient, TData, TCtx, TParam>; - - constructor(config: { - id: string; - clientClass: ClassRef<TClient>; - dataClass: ClassRef<TData>; - contextSchema: z.ZodType<TCtx>; - paramSchema: z.ZodType<TParam>; - read: ReaderFn<TClient, TData, TCtx, TParam>; - }) { - this.id = config.id; - this.clientClass = config.clientClass; - this.dataClass = config.dataClass; - this.contextSchema = config.contextSchema; - this.paramSchema = config.paramSchema; - this.#read = config.read; - } - - read( - client: TClient, - ctx: TCtx, - params: TParam, - ): AsyncIterable<Resumable<TData, TCtx>> { - return this.#read(client, ctx, params); - } -} - -class StreamTargetImpl<TClient, TData extends Data, TParam> - implements StreamTarget<TClient, TData, TParam> -{ - readonly kind = "target" as const; - readonly id: string; - readonly clientClass: ClassRef<TClient>; - readonly dataClass: ClassRef<TData>; - readonly paramSchema: z.ZodType<TParam>; - readonly #writer: WriterFn<TClient, TData, TParam>; - - constructor(config: { - id: string; - clientClass: ClassRef<TClient>; - dataClass: ClassRef<TData>; - paramSchema: z.ZodType<TParam>; - writer: WriterFn<TClient, TData, TParam>; - }) { - this.id = config.id; - this.clientClass = config.clientClass; - this.dataClass = config.dataClass; - this.paramSchema = config.paramSchema; - this.#writer = config.writer; - } - - write(client: TClient, params: TParam): (item: TData) => Promise<void> { - return this.#writer(client, params); - } -} - -/** Factory for creating stream sources and targets. */ -export const Stream = { - /** - * Create a stream source for reading data from an external system. - * - * @param id - Unique identifier for the stream source. - * @param clientClass - Class reference for the required provider client. - * @param config - Source configuration including types and reader function. - */ - createSource<TClient, TData extends Data, TCtx, TParam>( - id: string, - clientClass: ClassRef<TClient>, - config: SourceConfig<TClient, TData, TCtx, TParam>, - ): StreamSource<TClient, TData, TCtx, TParam> { - const { - type: dataClass, - context: contextSchema, - params: paramSchema, - } = config; - return new StreamSourceImpl({ - id, - clientClass, - dataClass, - contextSchema, - paramSchema, - read: config.reader, - }); - }, - - /** - * Create a stream target for writing data to an external system. - * - * @param id - Unique identifier for the stream target. - * @param clientClass - Class reference for the required provider client. - * @param config - Target configuration including types and writer function. 
- */ - createTarget<TClient, TData extends Data, TParam>( - id: string, - clientClass: ClassRef<TClient>, - config: TargetConfig<TClient, TData, TParam>, - ): StreamTarget<TClient, TData, TParam> { - const { type: dataClass, params: paramSchema } = config; - return new StreamTargetImpl({ - id, - clientClass, - dataClass, - paramSchema, - writer: config.writer, - }); - }, -} as const; diff --git a/packages/nvisy-core/src/types.ts b/packages/nvisy-core/src/types.ts deleted file mode 100644 index 139ec9d..0000000 --- a/packages/nvisy-core/src/types.ts +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Shared type aliases used across the core library. - * - * @module - */ - -/** - * A JSON-compatible value. - * - * Mirrors the types that `JSON.parse` can return and `JSON.stringify` - * can accept, making it safe for serialisation boundaries (APIs, - * databases, message queues). - */ -export type JsonValue = - | string - | number - | boolean - | null - | JsonValue[] - | { [key: string]: JsonValue }; - -/** - * Key-value metadata bag attached to {@link Data} items. - * - * All values must be JSON-serialisable so metadata can travel across - * process boundaries without lossy conversion. - */ -export type Metadata = Record<string, JsonValue>; - -/** Constructor reference for runtime `instanceof` checks and generic type inference. */ -export type ClassRef<T> = abstract new (...args: never[]) => T; diff --git a/packages/nvisy-core/test/action.fixtures.ts b/packages/nvisy-core/test/action.fixtures.ts deleted file mode 100644 index 241ef6c..0000000 --- a/packages/nvisy-core/test/action.fixtures.ts +++ /dev/null @@ -1,64 +0,0 @@ -import { z } from "zod"; -import { Action } from "../src/action.js"; -import type { JsonValue } from "../src/datatypes/data.js"; -import { Data } from "../src/datatypes/data.js"; - -/** Minimal row-like data type for testing. */ -export class TestRow extends Data { - readonly #columns: Readonly<Record<string, JsonValue>>; - - constructor(columns: Record<string, JsonValue>) { - super(); - this.#columns = columns; - } - - get columns(): Readonly<Record<string, JsonValue>> { - return this.#columns; - } - - get(column: string): JsonValue | undefined { - return this.#columns[column]; - } -} - -export const FilterParams = z.object({ - column: z.string(), - value: z.string(), -}); -export type FilterParams = z.infer<typeof FilterParams>; - -export const ExampleFilter = Action.withoutClient("filter", { - types: [TestRow], - params: FilterParams, - transform: async function* (stream, params) { - for await (const row of stream) { - if (row.get(params.column) === params.value) yield row; - } - }, -}); - -export const MapParams = z.object({ - column: z.string(), - fn: z.enum(["uppercase", "lowercase"]), -}); -export type MapParams = z.infer<typeof MapParams>; - -export const ExampleMap = Action.withoutClient("map", { - types: [TestRow], - params: MapParams, - transform: async function* (stream, params) { - for await (const row of stream) { - const val = row.get(params.column); - if (typeof val !== "string") { - yield row; - } else { - const mapped = - params.fn === "uppercase" ? 
val.toUpperCase() : val.toLowerCase(); - yield new TestRow({ - ...row.columns, - [params.column]: mapped, - }).deriveFrom(row); - } - } - }, -}); diff --git a/packages/nvisy-core/test/action.test.ts b/packages/nvisy-core/test/action.test.ts deleted file mode 100644 index 5b7d433..0000000 --- a/packages/nvisy-core/test/action.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { describe, expect, it } from "vitest"; -import type { ActionInstance } from "../src/action.js"; -import type { Data } from "../src/datatypes/data.js"; -import { ExampleFilter, ExampleMap, TestRow } from "./action.fixtures.js"; - -async function collect<T>(iter: AsyncIterable<T>): Promise<T[]> { - const result: T[] = []; - for await (const item of iter) result.push(item); - return result; -} - -async function* fromArray<T>(items: ReadonlyArray<T>): AsyncIterable<T> { - yield* items; -} - -async function runAction<TIn extends Data, TOut extends Data>( - // biome-ignore lint/suspicious/noExplicitAny: generic test helper - action: ActionInstance<void, TIn, TOut, any>, - items: ReadonlyArray<TIn>, - params: unknown, -): Promise<ReadonlyArray<TOut>> { - const stream = action.pipe(fromArray(items), params, undefined as undefined); - return collect(stream); -} - -const rows = [ - new TestRow({ id: "1", name: "Alice" }), - new TestRow({ id: "2", name: "Bob" }), - new TestRow({ id: "3", name: "Charlie" }), -]; - -describe("ExampleFilter", () => { - it("keeps rows matching the predicate", async () => { - const result = await runAction(ExampleFilter, rows, { - column: "name", - value: "Bob", - }); - - expect(result).toHaveLength(1); - expect(result[0]!.get("name")).toBe("Bob"); - }); - - it("returns empty array when nothing matches", async () => { - const result = await runAction(ExampleFilter, rows, { - column: "name", - value: "Nobody", - }); - - expect(result).toHaveLength(0); - }); -}); - -describe("ExampleMap", () => { - it("transforms column values to uppercase", async () => { - const result = await runAction(ExampleMap, rows, { - column: "name", - fn: "uppercase", - }); - - expect(result).toHaveLength(3); - expect(result[0]!.get("name")).toBe("ALICE"); - expect(result[1]!.get("name")).toBe("BOB"); - expect(result[2]!.get("name")).toBe("CHARLIE"); - }); - - it("transforms column values to lowercase", async () => { - const result = await runAction(ExampleMap, rows, { - column: "name", - fn: "lowercase", - }); - - expect(result[0]!.get("name")).toBe("alice"); - }); - - it("leaves non-string columns unchanged", async () => { - const result = await runAction(ExampleMap, rows, { - column: "id", - fn: "uppercase", - }); - - expect(result[0]!.get("id")).toBe("1"); - expect(result[0]!.get("name")).toBe("Alice"); - }); -}); diff --git a/packages/nvisy-core/test/provider.fixtures.ts b/packages/nvisy-core/test/provider.fixtures.ts deleted file mode 100644 index a073848..0000000 --- a/packages/nvisy-core/test/provider.fixtures.ts +++ /dev/null @@ -1,81 +0,0 @@ -import { z } from "zod"; -import type { JsonValue } from "../src/datatypes/data.js"; -import { Data } from "../src/datatypes/data.js"; -import { Provider } from "../src/provider.js"; -import type { Resumable } from "../src/stream.js"; -import { Stream } from "../src/stream.js"; - -/** Minimal row-like data type for testing. 
*/ -export class TestRow extends Data { - readonly #columns: Readonly<Record<string, JsonValue>>; - - constructor(columns: Record<string, JsonValue>) { - super(); - this.#columns = columns; - } - - get columns(): Readonly<Record<string, JsonValue>> { - return this.#columns; - } - - get(column: string): JsonValue | undefined { - return this.#columns[column]; - } -} - -export const Credentials = z.object({ - host: z.string(), - port: z.number(), -}); -export type Credentials = z.infer<typeof Credentials>; - -export const Params = z.object({ - table: z.string(), -}); -export type Params = z.infer<typeof Params>; - -export const Cursor = z.object({ - offset: z.number(), -}); -export type Cursor = z.infer<typeof Cursor>; - -export class ExampleClient { - readonly rows: ReadonlyArray<Record<string, JsonValue>> = [ - { id: "1", name: "Alice" }, - { id: "2", name: "Bob" }, - { id: "3", name: "Charlie" }, - ]; -} - -async function* readStream( - client: ExampleClient, - ctx: Cursor, - _params: Params, -): AsyncIterable<Resumable<TestRow, Cursor>> { - const items = client.rows.slice(ctx.offset).map((row, i) => ({ - data: new TestRow(row), - context: { offset: ctx.offset + i + 1 }, - })); - yield* items; -} - -export const ExampleProvider = Provider.withAuthentication("example", { - credentials: Credentials, - connect: async (_credentials) => ({ - client: new ExampleClient(), - disconnect: async () => {}, - }), -}); - -export const ExampleSource = Stream.createSource("read", ExampleClient, { - type: TestRow, - context: Cursor, - params: Params, - reader: (client, ctx, params) => readStream(client, ctx, params), -}); - -export const ExampleTarget = Stream.createTarget("write", ExampleClient, { - type: TestRow, - params: Params, - writer: (_client, _params) => async (_item) => {}, -}); diff --git a/packages/nvisy-core/test/provider.test.ts b/packages/nvisy-core/test/provider.test.ts deleted file mode 100644 index 60df76a..0000000 --- a/packages/nvisy-core/test/provider.test.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { beforeAll, describe, expect, it } from "vitest"; -import { - ExampleClient, - ExampleProvider, - ExampleSource, - ExampleTarget, - TestRow, -} from "./provider.fixtures.js"; - -async function collect<T>(iter: AsyncIterable<T>): Promise<T[]> { - const result: T[] = []; - for await (const item of iter) result.push(item); - return result; -} - -describe("ExampleProvider", () => { - it("connect returns a managed instance with a client", async () => { - const instance = await ExampleProvider.connect({ - host: "localhost", - port: 5432, - }); - expect(instance.id).toBe("example"); - expect(instance.client).toBeInstanceOf(ExampleClient); - }); -}); - -describe("ExampleSource", () => { - let client: ExampleClient; - - beforeAll(async () => { - const instance = await ExampleProvider.connect({ - host: "localhost", - port: 5432, - }); - client = instance.client; - }); - - it("reads all rows from offset 0", async () => { - const collected = await collect( - ExampleSource.read(client, { offset: 0 }, { table: "users" }), - ); - - expect(collected).toHaveLength(3); - expect(collected[0]!.data.columns).toEqual({ id: "1", name: "Alice" }); - expect(collected[2]!.data.columns).toEqual({ id: "3", name: "Charlie" }); - }); - - it("resumes from a given offset", async () => { - const collected = await collect( - ExampleSource.read(client, { offset: 2 }, { table: "users" }), - ); - - expect(collected).toHaveLength(1); - expect(collected[0]!.data.columns).toEqual({ id: "3", name: "Charlie" }); - }); - - 
it("yields correct resumption context", async () => { - const collected = await collect( - ExampleSource.read(client, { offset: 0 }, { table: "users" }), - ); - const contexts = collected.map((r) => r.context); - - expect(contexts).toEqual([{ offset: 1 }, { offset: 2 }, { offset: 3 }]); - }); -}); - -describe("ExampleTarget", () => { - let client: ExampleClient; - - beforeAll(async () => { - const instance = await ExampleProvider.connect({ - host: "localhost", - port: 5432, - }); - client = instance.client; - }); - - it("writes a row without error", async () => { - const row = new TestRow({ id: "4", name: "Diana" }); - const writer = ExampleTarget.write(client, { table: "users" }); - await writer(row); - }); -}); diff --git a/packages/nvisy-core/tsconfig.json b/packages/nvisy-core/tsconfig.json deleted file mode 100644 index 054a6c8..0000000 --- a/packages/nvisy-core/tsconfig.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist", "src/**/*.test.ts", "src/**/*.spec.ts"] -} diff --git a/packages/nvisy-core/tsup.config.ts b/packages/nvisy-core/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-core/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-ai/README.md b/packages/nvisy-plugin-ai/README.md deleted file mode 100644 index 573ca1f..0000000 --- a/packages/nvisy-plugin-ai/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# @nvisy/plugin-ai - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -AI provider plugin for the Nvisy runtime, backed by the [Vercel AI SDK](https://sdk.vercel.ai). - -## Features - -- **Embedding generation** — batch-embed documents into vectors -- **Chunking** — character, section, page, embedding-similarity, and LLM-contextual strategies -- **Partitioning** — extract text from blobs and documents (auto-detect or regex rules) -- **Enrichment** — metadata extraction, NER, image/table description, and table-to-HTML via LLM - -## Overview - -Provides LLM and embedding model integrations for AI-powered data pipelines. 
The plugin exposes: - -- **Providers**: - - `ai/openai-completion` — OpenAI completion (language model) - - `ai/openai-embedding` — OpenAI embedding - - `ai/anthropic-completion` — Anthropic completion - - `ai/gemini-completion` — Gemini completion - - `ai/gemini-embedding` — Gemini embedding -- **Actions**: - - `ai/embed` — generate embeddings from documents (batched) - - `ai/chunk` — split documents by character, section, or page boundaries - - `ai/chunk_similarity` — split using embedding cosine-similarity thresholds - - `ai/chunk_contextual` — split using an LLM to find natural boundaries - - `ai/partition` — extract text from blobs/documents (auto or regex rules) - - `ai/partition_contextual` — AI-based contextual partitioning (stub, not yet implemented) - - `ai/enrich` — extract metadata, entities, image/table descriptions, or convert tables to HTML via LLM - -## Usage - -```ts -import { aiPlugin } from "@nvisy/plugin-ai"; - -engine.register(aiPlugin); -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. - -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-plugin-ai/package.json b/packages/nvisy-plugin-ai/package.json deleted file mode 100644 index 35dc224..0000000 --- a/packages/nvisy-plugin-ai/package.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "@nvisy/plugin-ai", - "version": "0.1.0", - "description": "AI provider integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@ai-sdk/anthropic": "^3.0.36", - "@ai-sdk/google": "^3.0.20", - "@ai-sdk/openai": "^3.0.25", - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "ai": "^6.0.69", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts b/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts deleted file mode 100644 index 1209890..0000000 --- a/packages/nvisy-plugin-ai/src/actions/chunk-contextual.ts +++ /dev/null @@ -1,71 +0,0 @@ -import { Action, Chunk, Document } from "@nvisy/core"; -import { z } from "zod"; -import { AICompletionClient } from "../providers/client.js"; - -const ChunkContextualParams = z.object({ - /** Maximum characters per chunk. Defaults to 2000. */ - maxChunkSize: z.number().default(2000), -}); - -/** - * Split documents into semantically meaningful chunks using an LLM. - * - * Uses a language model to determine natural chunk boundaries. 
- */ -export const chunkContextual = Action.withClient( - "chunk_contextual", - AICompletionClient, - { - types: [Document, Chunk], - params: ChunkContextualParams, - transform: transformChunkContextual, - }, -); - -async function* transformChunkContextual( - stream: AsyncIterable<Document>, - params: z.infer<typeof ChunkContextualParams>, - client: AICompletionClient, -) { - for await (const doc of stream) { - const texts = await chunkByContext(doc.content, params, client); - - for (let i = 0; i < texts.length; i++) { - yield new Chunk(texts[i]!, { - chunkIndex: i, - chunkTotal: texts.length, - }).deriveFrom(doc); - } - } -} - -/** Use an LLM to determine natural chunk boundaries. */ -async function chunkByContext( - text: string, - params: { maxChunkSize: number }, - client: AICompletionClient, -): Promise<string[]> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: `You are a text segmentation assistant. Split the following text into semantically coherent chunks. Each chunk should be at most ${params.maxChunkSize} characters. Return ONLY a JSON array of strings, where each string is one chunk. Do not add any explanation.`, - }, - { - role: "user", - content: text, - }, - ], - }); - - try { - const parsed = JSON.parse(result.content) as unknown; - if (Array.isArray(parsed) && parsed.every((c) => typeof c === "string")) { - return parsed as string[]; - } - } catch { - // Fall back to returning the whole text as a single chunk - } - - return [text]; -} diff --git a/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts b/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts deleted file mode 100644 index 2b1940b..0000000 --- a/packages/nvisy-plugin-ai/src/actions/chunk-similarity.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { Action, Chunk, Document } from "@nvisy/core"; -import { z } from "zod"; -import { EmbeddingClient } from "../providers/client.js"; - -const ChunkSimilarityParams = z.object({ - /** Cosine similarity threshold for splitting (0-1). Defaults to 0.5. */ - threshold: z.number().min(0).max(1).default(0.5), -}); - -/** - * Split documents into semantically meaningful chunks using embedding similarity. - * - * Computes embeddings for sentences and splits where cosine - * similarity drops below a threshold. - */ -export const chunkSimilarity = Action.withClient( - "chunk_similarity", - EmbeddingClient, - { - types: [Document, Chunk], - params: ChunkSimilarityParams, - transform: transformChunkSimilarity, - }, -); - -async function* transformChunkSimilarity( - stream: AsyncIterable<Document>, - params: z.infer<typeof ChunkSimilarityParams>, - client: EmbeddingClient, -) { - for await (const doc of stream) { - const texts = await chunkBySimilarity(doc.content, params, client); - - for (let i = 0; i < texts.length; i++) { - yield new Chunk(texts[i]!, { - chunkIndex: i, - chunkTotal: texts.length, - }).deriveFrom(doc); - } - } -} - -/** Split sentences into semantic groups by embedding similarity. 
*/ -async function chunkBySimilarity( - text: string, - params: { threshold: number }, - client: EmbeddingClient, -): Promise<string[]> { - const sentences = splitSentences(text); - if (sentences.length <= 1) return [text]; - - const vectors = await client.embed(sentences, {}); - - const chunks: string[] = []; - let current: string[] = [sentences[0]!]; - - for (let i = 1; i < sentences.length; i++) { - const sim = cosineSimilarity(vectors[i - 1]!, vectors[i]!); - if (sim < params.threshold) { - chunks.push(current.join(" ")); - current = []; - } - current.push(sentences[i]!); - } - if (current.length > 0) { - chunks.push(current.join(" ")); - } - - return chunks; -} - -function splitSentences(text: string): string[] { - return text - .split(/(?<=[.!?])\s+/) - .map((s) => s.trim()) - .filter((s) => s.length > 0); -} - -function cosineSimilarity(a: Float32Array, b: Float32Array): number { - let dot = 0; - let normA = 0; - let normB = 0; - for (let i = 0; i < a.length; i++) { - dot += a[i]! * b[i]!; - normA += a[i]! * a[i]!; - normB += b[i]! * b[i]!; - } - const denom = Math.sqrt(normA) * Math.sqrt(normB); - return denom === 0 ? 0 : dot / denom; -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts deleted file mode 100644 index 523147c..0000000 --- a/packages/nvisy-plugin-ai/src/actions/enrich-by-description.ts +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Content description enrichment strategy. - * - * Generates AI descriptions for image or table content. - * - * @module - */ - -import type { Metadata } from "@nvisy/core"; -import type { AICompletionClient } from "../providers/client.js"; -import { parseJsonResponse } from "../providers/client.js"; - -/** Description enrichment parameters. */ -export interface DescriptionEnrichParams { - /** The kind of content to describe. */ - readonly contentKind: "image" | "table"; -} - -/** Describe content using AI. */ -export async function enrichByDescription( - text: string, - params: DescriptionEnrichParams, - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Describe the following ${params.contentKind} content in detail. Return ONLY a JSON object with a "description" field containing your description.`, - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "description"); -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts deleted file mode 100644 index d0a4210..0000000 --- a/packages/nvisy-plugin-ai/src/actions/enrich-by-metadata.ts +++ /dev/null @@ -1,35 +0,0 @@ -/** - * Metadata extraction enrichment strategy. - * - * Extracts structured fields from document content using an AI model. - * - * @module - */ - -import type { Metadata } from "@nvisy/core"; -import type { AICompletionClient } from "../providers/client.js"; -import { parseJsonResponse } from "../providers/client.js"; - -/** Metadata enrichment parameters. */ -export interface MetadataEnrichParams { - /** Field names to extract from the document. */ - readonly fields: string[]; -} - -/** Extract structured metadata fields from text using AI. 
*/ -export async function enrichByMetadata( - text: string, - params: MetadataEnrichParams, - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Extract the following fields from the document: ${params.fields.join(", ")}. Return ONLY a JSON object with these fields as keys. If a field cannot be determined, set it to null.`, - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "extracted"); -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts deleted file mode 100644 index f7fe9d4..0000000 --- a/packages/nvisy-plugin-ai/src/actions/enrich-by-ner.ts +++ /dev/null @@ -1,39 +0,0 @@ -/** - * Named entity recognition enrichment strategy. - * - * Extracts named entities from document content using an AI model. - * - * @module - */ - -import type { Metadata } from "@nvisy/core"; -import type { AICompletionClient } from "../providers/client.js"; -import { parseJsonResponse } from "../providers/client.js"; - -/** NER enrichment parameters. */ -export interface NerEnrichParams { - /** Entity types to extract (e.g. ["PERSON", "ORG"]). If omitted, extract all. */ - readonly entityTypes?: string[] | undefined; -} - -/** Extract named entities from text using AI. */ -export async function enrichByNer( - text: string, - params: NerEnrichParams, - client: AICompletionClient, -): Promise<Metadata> { - const typeClause = params.entityTypes - ? `Focus on these entity types: ${params.entityTypes.join(", ")}.` - : "Extract all entity types you can identify."; - - const result = await client.complete({ - messages: [ - { - role: "system", - content: `Perform named entity recognition on the following text. ${typeClause} Return ONLY a JSON object where keys are entity types and values are arrays of extracted entities.`, - }, - { role: "user", content: text }, - ], - }); - return { entities: parseJsonResponse(result.content, "entities") }; -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts b/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts deleted file mode 100644 index 9263b1c..0000000 --- a/packages/nvisy-plugin-ai/src/actions/enrich-by-table-html.ts +++ /dev/null @@ -1,33 +0,0 @@ -/** - * Table-to-HTML enrichment strategy. - * - * Converts table content into HTML markup using an AI model. - * - * @module - */ - -import type { Metadata } from "@nvisy/core"; -import type { AICompletionClient } from "../providers/client.js"; -import { parseJsonResponse } from "../providers/client.js"; - -/** Table-to-HTML enrichment parameters (no additional options). */ -export type TableHtmlEnrichParams = Record<string, never>; - -/** Convert table content to HTML using AI. */ -export async function enrichByTableHtml( - text: string, - _params: TableHtmlEnrichParams, - client: AICompletionClient, -): Promise<Metadata> { - const result = await client.complete({ - messages: [ - { - role: "system", - content: - 'Convert the following table content into clean HTML. 
Return ONLY a JSON object with an "html" field containing the HTML table markup.', - }, - { role: "user", content: text }, - ], - }); - return parseJsonResponse(result.content, "tableHtml"); -} diff --git a/packages/nvisy-plugin-ai/src/actions/enrich.ts b/packages/nvisy-plugin-ai/src/actions/enrich.ts deleted file mode 100644 index 055bdda..0000000 --- a/packages/nvisy-plugin-ai/src/actions/enrich.ts +++ /dev/null @@ -1,129 +0,0 @@ -/** - * AI-powered enrich action that extracts metadata, entities, - * descriptions, or HTML from documents. - * - * @module - */ - -import type { Metadata } from "@nvisy/core"; -import { Action, Document } from "@nvisy/core"; -import { z } from "zod"; -import { AICompletionClient } from "../providers/client.js"; -import { enrichByDescription } from "./enrich-by-description.js"; -import { enrichByMetadata } from "./enrich-by-metadata.js"; -import { enrichByNer } from "./enrich-by-ner.js"; -import { enrichByTableHtml } from "./enrich-by-table-html.js"; - -export type { DescriptionEnrichParams } from "./enrich-by-description.js"; -export type { MetadataEnrichParams } from "./enrich-by-metadata.js"; -export type { NerEnrichParams } from "./enrich-by-ner.js"; -export type { TableHtmlEnrichParams } from "./enrich-by-table-html.js"; - -const BaseMetadata = z.object({ - fields: z.array(z.string()), -}); - -const BaseNer = z.object({ - entityTypes: z.array(z.string()).optional(), -}); - -const BaseImageDescription = z.object({}); - -const BaseTableDescription = z.object({}); - -const BaseTableToHtml = z.object({}); - -const MetadataType = BaseMetadata.extend({ - type: z.literal("metadata"), -}); - -const NerType = BaseNer.extend({ - type: z.literal("ner"), -}); - -const ImageDescriptionType = BaseImageDescription.extend({ - type: z.literal("image_description"), -}); - -const TableDescriptionType = BaseTableDescription.extend({ - type: z.literal("table_description"), -}); - -const TableToHtmlType = BaseTableToHtml.extend({ - type: z.literal("table_to_html"), -}); - -const EnrichParams = z.discriminatedUnion("type", [ - MetadataType, - NerType, - ImageDescriptionType, - TableDescriptionType, - TableToHtmlType, -]); - -/** - * Enrich documents with AI-extracted metadata. 
- * - * - `"metadata"`: extract structured fields from content - * - `"ner"`: named entity recognition - * - `"image_description"`: describe image content - * - `"table_description"`: describe table content - * - `"table_to_html"`: convert table content to HTML - */ -export const enrich = Action.withClient("enrich", AICompletionClient, { - types: [Document], - params: EnrichParams, - transform: transformEnrich, -}); - -async function* transformEnrich( - stream: AsyncIterable<Document>, - params: z.infer<typeof EnrichParams>, - client: AICompletionClient, -): AsyncGenerator<Document> { - for await (const doc of stream) { - let enrichedMeta: Metadata; - - switch (params.type) { - case "metadata": { - const { type: _, ...rest } = params; - enrichedMeta = await enrichByMetadata(doc.content, rest, client); - break; - } - case "ner": { - const { type: _, ...rest } = params; - enrichedMeta = await enrichByNer(doc.content, rest, client); - break; - } - case "image_description": { - const { type: _, ...rest } = params; - enrichedMeta = await enrichByDescription( - doc.content, - { ...rest, contentKind: "image" }, - client, - ); - break; - } - case "table_description": { - const { type: _, ...rest } = params; - enrichedMeta = await enrichByDescription( - doc.content, - { ...rest, contentKind: "table" }, - client, - ); - break; - } - case "table_to_html": { - const { type: _, ...rest } = params; - enrichedMeta = await enrichByTableHtml(doc.content, rest, client); - break; - } - } - - yield new Document(doc.content, { - ...(doc.elements != null ? { elements: doc.elements } : {}), - }) - .deriveFrom(doc) - .withMetadata({ ...(doc.metadata ?? {}), ...enrichedMeta }); - } -} diff --git a/packages/nvisy-plugin-ai/src/actions/generate-embedding.ts b/packages/nvisy-plugin-ai/src/actions/generate-embedding.ts deleted file mode 100644 index e5ba913..0000000 --- a/packages/nvisy-plugin-ai/src/actions/generate-embedding.ts +++ /dev/null @@ -1,60 +0,0 @@ -import { Action, Document, Embedding } from "@nvisy/core"; -import { z } from "zod"; -import { EmbeddingClient } from "../providers/client.js"; - -const EmbedParams = z.object({ - /** Desired embedding dimensions (if supported by the model). */ - dimensions: z.number().optional(), - /** Number of documents to embed per API call. */ - batchSize: z.number().default(64), -}); - -/** - * Generate embeddings for documents using an AI provider. - * - * Consumes {@link Document} items, batches their content, calls the - * provider's embedding API, and yields one {@link Embedding} per document. - */ -export const embed = Action.withClient("embed", EmbeddingClient, { - types: [Document, Embedding], - params: EmbedParams, - transform: transformEmbed, -}); - -async function* transformEmbed( - stream: AsyncIterable<Document>, - params: z.infer<typeof EmbedParams>, - client: EmbeddingClient, -) { - let batch: Document[] = []; - - for await (const doc of stream) { - batch.push(doc); - if (batch.length >= params.batchSize) { - yield* emitBatch(batch, client, params.dimensions); - batch = []; - } - } - - if (batch.length > 0) { - yield* emitBatch(batch, client, params.dimensions); - } -} - -async function* emitBatch( - batch: Document[], - client: EmbeddingClient, - dimensions: number | undefined, -): AsyncIterable<Embedding> { - const texts = batch.map((doc) => doc.content); - - const vectors = await client.embed(texts, { - ...(dimensions != null ? 
{ dimensions } : {}), - }); - - for (let i = 0; i < batch.length; i++) { - const doc = batch[i]!; - const vector = vectors[i]!; - yield new Embedding(vector).deriveFrom(doc); - } -} diff --git a/packages/nvisy-plugin-ai/src/actions/index.ts b/packages/nvisy-plugin-ai/src/actions/index.ts deleted file mode 100644 index b9d6b05..0000000 --- a/packages/nvisy-plugin-ai/src/actions/index.ts +++ /dev/null @@ -1,5 +0,0 @@ -export { chunkContextual } from "./chunk-contextual.js"; -export { chunkSimilarity } from "./chunk-similarity.js"; -export { enrich } from "./enrich.js"; -export { embed } from "./generate-embedding.js"; -export { partitionContextual } from "./partition-contextual.js"; diff --git a/packages/nvisy-plugin-ai/src/actions/partition-contextual.ts b/packages/nvisy-plugin-ai/src/actions/partition-contextual.ts deleted file mode 100644 index 51b7233..0000000 --- a/packages/nvisy-plugin-ai/src/actions/partition-contextual.ts +++ /dev/null @@ -1,33 +0,0 @@ -import { Action, Document, RuntimeError } from "@nvisy/core"; -import { z } from "zod"; -import { AICompletionClient } from "../providers/client.js"; - -const PartitionContextualParams = z.object({}); - -/** - * Partition documents and blobs using an AI model for contextual analysis. - * - * This action is a placeholder — it throws "not yet implemented" - * until AI-based contextual partitioning support is added. - */ -export const partitionContextual = Action.withClient( - "partition_contextual", - AICompletionClient, - { - types: [Document], - params: PartitionContextualParams, - transform: transformPartitionContextual, - }, -); - -// biome-ignore lint/correctness/useYield: stub action throws before yielding -async function* transformPartitionContextual( - _stream: AsyncIterable<Document>, - _params: z.infer<typeof PartitionContextualParams>, - _client: AICompletionClient, -) { - throw new RuntimeError("partition_contextual is not yet implemented", { - source: "ai/partition_contextual", - retryable: false, - }); -} diff --git a/packages/nvisy-plugin-ai/src/datatypes/index.ts b/packages/nvisy-plugin-ai/src/datatypes/index.ts deleted file mode 100644 index e69de29..0000000 diff --git a/packages/nvisy-plugin-ai/src/index.ts b/packages/nvisy-plugin-ai/src/index.ts deleted file mode 100644 index 5ea7561..0000000 --- a/packages/nvisy-plugin-ai/src/index.ts +++ /dev/null @@ -1,53 +0,0 @@ -/** - * @module @nvisy/plugin-ai - * - * AI provider plugin for the Nvisy runtime. - * - * Exposes LLM providers (OpenAI, Anthropic, Gemini), embedding generation, - * chunking, partitioning, and enrichment actions for AI-powered pipelines. - * - * Backed by the Vercel AI SDK for unified provider access. - * - * @example - * ```ts - * import { aiPlugin } from "@nvisy/plugin-ai"; - * - * engine.register(aiPlugin); - * ``` - */ - -import { Plugin } from "@nvisy/core"; -import { - chunkContextual, - chunkSimilarity, - embed, - enrich, - partitionContextual, -} from "./actions/index.js"; -import { - anthropicCompletion, - geminiCompletion, - geminiEmbedding, - openaiCompletion, - openaiEmbedding, -} from "./providers/index.js"; - -/** The AI plugin: register this with the runtime to enable all AI providers and actions. 
*/ -export const aiPlugin = Plugin.define("ai") - .withProviders( - openaiCompletion, - openaiEmbedding, - anthropicCompletion, - geminiCompletion, - geminiEmbedding, - ) - .withActions( - embed, - chunkSimilarity, - chunkContextual, - partitionContextual, - enrich, - ); - -export type { ChunkOptions } from "@nvisy/core"; -export { Chunk, Embedding } from "@nvisy/core"; diff --git a/packages/nvisy-plugin-ai/src/providers/anthropic.ts b/packages/nvisy-plugin-ai/src/providers/anthropic.ts deleted file mode 100644 index fb46c5e..0000000 --- a/packages/nvisy-plugin-ai/src/providers/anthropic.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { createAnthropic } from "@ai-sdk/anthropic"; -import { generateText } from "ai"; -import { - makeProvider, - type ProviderConnection, - VercelCompletionClient, -} from "./client.js"; - -function makeAnthropic(credentials: ProviderConnection) { - return createAnthropic({ - apiKey: credentials.apiKey, - ...(credentials.baseUrl != null ? { baseURL: credentials.baseUrl } : {}), - }); -} - -/** Anthropic completion provider factory backed by the Vercel AI SDK. */ -export const anthropicCompletion = makeProvider({ - id: "anthropic-completion", - createClient: (credentials) => - new VercelCompletionClient({ - languageModel: makeAnthropic(credentials)(credentials.model), - }), - verify: async (credentials) => { - const provider = makeAnthropic(credentials); - await generateText({ - model: provider(credentials.model), - prompt: "hi", - maxOutputTokens: 1, - }); - }, -}); diff --git a/packages/nvisy-plugin-ai/src/providers/client.ts b/packages/nvisy-plugin-ai/src/providers/client.ts deleted file mode 100644 index 9b4a986..0000000 --- a/packages/nvisy-plugin-ai/src/providers/client.ts +++ /dev/null @@ -1,230 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { - ConnectionError, - type Metadata, - Provider, - type ProviderFactory, -} from "@nvisy/core"; -import type { EmbeddingModel, LanguageModel } from "ai"; -import { embedMany, generateText } from "ai"; -import type { ProviderConnection } from "./schemas.js"; -import { ProviderConnection as ApiKeyCredentialsSchema } from "./schemas.js"; - -export type { ProviderConnection } from "./schemas.js"; - -const logger = getLogger(["nvisy", "ai"]); - -/** A single message in a chat conversation. */ -export interface ChatMessage { - readonly role: "system" | "user" | "assistant"; - readonly content: string; -} - -/** Options for a completion request. */ -export interface CompletionOptions { - /** Messages comprising the conversation. */ - readonly messages: ReadonlyArray<ChatMessage>; - /** Sampling temperature (0-2). */ - readonly temperature?: number; - /** Maximum tokens to generate. */ - readonly maxTokens?: number; -} - -/** Result of a completion request. */ -export interface CompletionResult { - /** Generated text content. */ - readonly content: string; - /** Token usage statistics. */ - readonly usage?: { - readonly promptTokens: number; - readonly completionTokens: number; - }; -} - -/** Options for an embedding request. */ -export interface EmbedOptions { - /** Desired embedding dimensions (if supported by the model). */ - readonly dimensions?: number; -} - -/** - * Abstract AI client with completion capability. - */ -export abstract class AICompletionClient { - abstract complete(options: CompletionOptions): Promise<CompletionResult>; -} - -/** - * Abstract AI client with embedding capability. 
- */ -export abstract class EmbeddingClient { - abstract embed( - input: string[], - options: EmbedOptions, - ): Promise<Float32Array[]>; -} - -/** Build the generateText call options from our CompletionOptions. */ -function buildGenerateTextArgs( - model: LanguageModel, - options: CompletionOptions, -) { - const systemParts = options.messages - .filter((m) => m.role === "system") - .map((m) => m.content); - - const systemText = - systemParts.length > 0 ? systemParts.join("\n\n") : undefined; - - return { - model, - ...(systemText != null ? { system: systemText } : {}), - messages: options.messages - .filter((m) => m.role !== "system") - .map((m) => ({ - role: m.role as "user" | "assistant", - content: m.content, - })), - ...(options.temperature != null - ? { temperature: options.temperature } - : {}), - ...(options.maxTokens != null - ? { maxOutputTokens: options.maxTokens } - : {}), - }; -} - -/** Map AI SDK usage to our CompletionResult format. */ -function mapUsage( - usage: - | { inputTokens: number | undefined; outputTokens: number | undefined } - | undefined, -): CompletionResult["usage"] { - if (!usage || usage.inputTokens == null || usage.outputTokens == null) { - return undefined; - } - return { - promptTokens: usage.inputTokens, - completionTokens: usage.outputTokens, - }; -} - -/** - * Embedding-only AI client backed by the Vercel AI SDK. - * - * Uses {@link embedMany} from the `ai` package, - * delegating model creation to provider-specific factories. - */ -export class VercelEmbeddingClient extends EmbeddingClient { - readonly #model: EmbeddingModel; - - constructor(config: { embeddingModel: EmbeddingModel }) { - super(); - this.#model = config.embeddingModel; - } - - async embed( - input: string[], - _options: EmbedOptions, - ): Promise<Float32Array[]> { - const result = await embedMany({ - model: this.#model, - values: input, - }); - - return result.embeddings.map((e) => new Float32Array(e)); - } -} - -/** - * Completion-only AI client backed by the Vercel AI SDK. - */ -export class VercelCompletionClient extends AICompletionClient { - readonly #model: LanguageModel; - - constructor(config: { languageModel: LanguageModel }) { - super(); - this.#model = config.languageModel; - } - - async complete(options: CompletionOptions): Promise<CompletionResult> { - const result = await generateText( - buildGenerateTextArgs(this.#model, options), - ); - const usage = mapUsage(result.usage); - return { - content: result.text, - ...(usage != null ? { usage } : {}), - }; - } -} - -/** Parse an AI response as JSON, falling back to a keyed wrapper. */ -export function parseJsonResponse( - content: string, - fallbackKey: string, -): Metadata { - try { - const parsed = JSON.parse(content) as Record<string, unknown>; - if ( - typeof parsed === "object" && - parsed !== null && - !Array.isArray(parsed) - ) { - return parsed as Metadata; - } - } catch { - // If JSON parsing fails, store the raw response - } - return { [fallbackKey]: content }; -} - -/** Normalise an unknown throw into a {@link ConnectionError}. */ -function toConnectionError(error: unknown, source: string): ConnectionError { - if (error instanceof ConnectionError) return error; - logger.error("Connection to {provider} failed: {error}", { - provider: source, - error: error instanceof Error ? error.message : String(error), - }); - return ConnectionError.wrap(error, { source }); -} - -/** Configuration for {@link makeProvider}. */ -export interface ProviderConfig<TClient> { - /** Unique provider identifier, e.g. 
`"openai"`, `"anthropic"`, `"gemini"`. */ - readonly id: string; - /** Factory that creates an AI client from validated credentials. */ - readonly createClient: (credentials: ProviderConnection) => TClient; - /** Verify the connection is live (called once during connect). */ - readonly verify: (credentials: ProviderConnection) => Promise<void>; -} - -/** - * Create an AI {@link ProviderFactory} parameterised by a client constructor. - * - * The returned factory validates {@link ProviderConnection} at parse time, then - * creates the client on connect after verifying connectivity. - */ -export const makeProvider = <TClient>( - config: ProviderConfig<TClient>, -): ProviderFactory<ProviderConnection, TClient> => - Provider.withAuthentication(config.id, { - credentials: ApiKeyCredentialsSchema, - verify: config.verify, - connect: async (credentials) => { - try { - const client = config.createClient(credentials); - logger.info("Connected to {provider}", { provider: config.id }); - return { - client, - disconnect: async () => { - logger.debug("Disconnected from {provider}", { - provider: config.id, - }); - }, - }; - } catch (error) { - throw toConnectionError(error, config.id); - } - }, - }); diff --git a/packages/nvisy-plugin-ai/src/providers/gemini.ts b/packages/nvisy-plugin-ai/src/providers/gemini.ts deleted file mode 100644 index 22e1286..0000000 --- a/packages/nvisy-plugin-ai/src/providers/gemini.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { createGoogleGenerativeAI } from "@ai-sdk/google"; -import { embedMany, generateText } from "ai"; -import { - makeProvider, - type ProviderConnection, - VercelCompletionClient, - VercelEmbeddingClient, -} from "./client.js"; - -function makeGemini(credentials: ProviderConnection) { - return createGoogleGenerativeAI({ - apiKey: credentials.apiKey, - ...(credentials.baseUrl != null ? { baseURL: credentials.baseUrl } : {}), - }); -} - -/** Gemini completion provider factory backed by the Vercel AI SDK. */ -export const geminiCompletion = makeProvider({ - id: "gemini-completion", - createClient: (credentials) => - new VercelCompletionClient({ - languageModel: makeGemini(credentials)(credentials.model), - }), - verify: async (credentials) => { - const provider = makeGemini(credentials); - await generateText({ - model: provider(credentials.model), - prompt: "hi", - maxOutputTokens: 1, - }); - }, -}); - -/** Gemini embedding provider factory backed by the Vercel AI SDK. 
*/ -export const geminiEmbedding = makeProvider({ - id: "gemini-embedding", - createClient: (credentials) => - new VercelEmbeddingClient({ - embeddingModel: makeGemini(credentials).embeddingModel(credentials.model), - }), - verify: async (credentials) => { - const provider = makeGemini(credentials); - await embedMany({ - model: provider.embeddingModel(credentials.model), - values: ["test"], - }); - }, -}); diff --git a/packages/nvisy-plugin-ai/src/providers/index.ts b/packages/nvisy-plugin-ai/src/providers/index.ts deleted file mode 100644 index 711b3f1..0000000 --- a/packages/nvisy-plugin-ai/src/providers/index.ts +++ /dev/null @@ -1,18 +0,0 @@ -export { anthropicCompletion } from "./anthropic.js"; -export type { - ChatMessage, - CompletionOptions, - CompletionResult, - EmbedOptions, - ProviderConfig, - ProviderConnection, -} from "./client.js"; -export { - AICompletionClient, - EmbeddingClient, - makeProvider, - VercelCompletionClient, - VercelEmbeddingClient, -} from "./client.js"; -export { geminiCompletion, geminiEmbedding } from "./gemini.js"; -export { openaiCompletion, openaiEmbedding } from "./openai.js"; diff --git a/packages/nvisy-plugin-ai/src/providers/openai.ts b/packages/nvisy-plugin-ai/src/providers/openai.ts deleted file mode 100644 index 9862a17..0000000 --- a/packages/nvisy-plugin-ai/src/providers/openai.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { createOpenAI } from "@ai-sdk/openai"; -import { embedMany, generateText } from "ai"; -import { - makeProvider, - type ProviderConnection, - VercelCompletionClient, - VercelEmbeddingClient, -} from "./client.js"; - -function makeOpenAI(credentials: ProviderConnection) { - return createOpenAI({ - apiKey: credentials.apiKey, - ...(credentials.baseUrl != null ? { baseURL: credentials.baseUrl } : {}), - }); -} - -/** OpenAI completion provider factory backed by the Vercel AI SDK. */ -export const openaiCompletion = makeProvider({ - id: "openai-completion", - createClient: (credentials) => - new VercelCompletionClient({ - languageModel: makeOpenAI(credentials)(credentials.model), - }), - verify: async (credentials) => { - const provider = makeOpenAI(credentials); - await generateText({ - model: provider(credentials.model), - prompt: "hi", - maxOutputTokens: 1, - }); - }, -}); - -/** OpenAI embedding provider factory backed by the Vercel AI SDK. */ -export const openaiEmbedding = makeProvider({ - id: "openai-embedding", - createClient: (credentials) => - new VercelEmbeddingClient({ - embeddingModel: makeOpenAI(credentials).embedding(credentials.model), - }), - verify: async (credentials) => { - const provider = makeOpenAI(credentials); - await embedMany({ - model: provider.embedding(credentials.model), - values: ["test"], - }); - }, -}); diff --git a/packages/nvisy-plugin-ai/src/providers/schemas.ts b/packages/nvisy-plugin-ai/src/providers/schemas.ts deleted file mode 100644 index 10c6f7b..0000000 --- a/packages/nvisy-plugin-ai/src/providers/schemas.ts +++ /dev/null @@ -1,16 +0,0 @@ -import { z } from "zod"; - -/** - * API key credentials shared by all AI providers. - * - * Validated at graph parse time before any connection is attempted. - */ -export const ProviderConnection = z.object({ - /** API key for authentication. */ - apiKey: z.string(), - /** Optional custom base URL for the API. */ - baseUrl: z.string().optional(), - /** Model identifier bound to this connection. 
*/ - model: z.string(), -}); -export type ProviderConnection = z.infer<typeof ProviderConnection>; diff --git a/packages/nvisy-plugin-ai/tsconfig.json b/packages/nvisy-plugin-ai/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-ai/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-ai/tsup.config.ts b/packages/nvisy-plugin-ai/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-ai/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-core/README.md b/packages/nvisy-plugin-core/README.md deleted file mode 100644 index 0522bd0..0000000 --- a/packages/nvisy-plugin-core/README.md +++ /dev/null @@ -1,141 +0,0 @@ -# @nvisy/plugin-core - -Core plugin for the Nvisy runtime with built-in chunking, partitioning, loading, and text splitting. - -## Install - -```bash -npm install @nvisy/plugin-core -``` - -## Plugin Registration - -```ts -import { corePlugin } from "@nvisy/plugin-core"; - -// Register with the engine -registry.load(corePlugin); -``` - -The `corePlugin` registers: - -- **Datatype**: `Document`, `Blob`, `Chunk`, `Embedding` -- **Actions**: `chunkSimple`, `partition` -- **Loaders**: `plaintextLoader`, `csvLoader`, `jsonLoader` - -## Actions - -### `chunkSimple` - -Splits documents into smaller chunks. Accepts a `strategy` discriminator to select the splitting method. - -**Character strategy** — fixed-size windows with optional overlap: - -```ts -{ strategy: "character", maxCharacters: 500, overlap: 50 } -``` - -**Section strategy** — split on markdown headings: - -```ts -{ strategy: "section", level: 2, maxCharacters: 1000, combineUnder: 200 } -``` - -**Page strategy** — split on page boundaries (`\f`, `---`, `***`) or structured page elements: - -```ts -{ strategy: "page", maxCharacters: 2000 } -``` - -### `partition` - -Partitions documents into multiple documents with metadata tracking. - -**Auto strategy** — pass-through, preserves content as-is: - -```ts -{ strategy: "auto" } -``` - -**Rule strategy** — split on a regex pattern: - -```ts -{ strategy: "rule", pattern: "\\n{2,}", includeDelimiter: false, inferTableStructure: false } -``` - -## Loaders - -### `plaintextLoader` - -Converts `.txt` blobs into documents. - -| Parameter | Type | Default | -|-----------|------|---------| -| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | - -### `csvLoader` - -Converts `.csv` / `.tsv` blobs into documents. Rows are formatted as `column: value` when headers are present. - -| Parameter | Type | Default | -|-----------|------|---------| -| `delimiter` | `string` | `","` | -| `hasHeader` | `boolean` | `true` | -| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | - -### `jsonLoader` - -Converts `.json` / `.jsonl` / `.ndjson` blobs into documents. 
Scalar object fields are extracted as document metadata. - -| Parameter | Type | Default | -|-----------|------|---------| -| `encoding` | `"utf-8" \| "ascii" \| "latin1" \| "utf16le"` | `"utf-8"` | - -## Splitters - -Reusable `string → string[]` splitting utilities, usable independently of the action system. - -### `splitByDelimiter` - -Split text on a literal string delimiter. - -```ts -import { splitByDelimiter } from "@nvisy/plugin-core"; - -splitByDelimiter("a---b---c", { delimiter: "---" }); -// → ["a", "b", "c"] - -splitByDelimiter("a---b---c", { delimiter: "---", keepDelimiter: true }); -// → ["a", "---b", "---c"] -``` - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `delimiter` | `string` | — | String to split on | -| `keepDelimiter` | `boolean` | `false` | Prepend delimiter to subsequent segments | -| `trimEmpty` | `boolean` | `true` | Discard empty/whitespace-only segments | - -### `splitByRegex` - -Split text on a regex pattern (compiled with `gm` flags). - -```ts -import { splitByRegex } from "@nvisy/plugin-core"; - -splitByRegex("intro\n## A\ncontent A\n## B\ncontent B", { pattern: "^## .+$", keepSeparator: true }); -// → ["intro\n", "## A\ncontent A\n", "## B\ncontent B"] -``` - -| Option | Type | Default | Description | -|--------|------|---------|-------------| -| `pattern` | `string` | — | Regex pattern to split on | -| `keepSeparator` | `boolean` | `false` | Keep matched separator at start of segments | -| `trimEmpty` | `boolean` | `true` | Discard empty/whitespace-only segments | - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. - -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) diff --git a/packages/nvisy-plugin-core/package.json b/packages/nvisy-plugin-core/package.json deleted file mode 100644 index d69cdb3..0000000 --- a/packages/nvisy-plugin-core/package.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "@nvisy/plugin-core", - "version": "0.1.0", - "description": "Core plugin with built-in chunk and partition actions for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "csv-parse": "^6.1.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts deleted file mode 100644 index 92ae1d1..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-character.test.ts +++ /dev/null @@ -1,85 +0,0 @@ -import { Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { chunkByCharacter } from "./chunk-by-character.js"; - -describe("chunkByCharacter", () => { - it("splits text into chunks of maxCharacters", () => { - const doc = new Document("abcdefghij"); - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 3, - overlap: 0, - }), - ]; - expect(chunks).toHaveLength(4); - expect(chunks[0]!.content).toBe("abc"); - expect(chunks[1]!.content).toBe("def"); - expect(chunks[2]!.content).toBe("ghi"); - expect(chunks[3]!.content).toBe("j"); - }); - - it("applies overlap between chunks", () => { - const doc = new Document("abcdefghij"); - // 
maxCharacters=5, overlap=2 → step=3, starts at 0, 3, 6, 9 - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 5, - overlap: 2, - }), - ]; - expect(chunks[0]!.content).toBe("abcde"); - expect(chunks[1]!.content).toBe("defgh"); - expect(chunks[2]!.content).toBe("ghij"); - }); - - it("yields nothing when step is zero", () => { - const doc = new Document("hello"); - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 3, - overlap: 3, - }), - ]; - expect(chunks).toHaveLength(0); - }); - - it("sets chunkIndex and chunkTotal on each chunk", () => { - const doc = new Document("abcdef"); - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 2, - overlap: 0, - }), - ]; - expect(chunks).toHaveLength(3); - for (let i = 0; i < chunks.length; i++) { - expect(chunks[i]!.chunkIndex).toBe(i); - expect(chunks[i]!.chunkTotal).toBe(3); - } - }); - - it("derives chunks from the source document", () => { - const doc = new Document("abcdef"); - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 3, - overlap: 0, - }), - ]; - for (const chunk of chunks) { - expect(chunk.parentId).toBe(doc.id); - } - }); - - it("returns single chunk when text fits in maxCharacters", () => { - const doc = new Document("abc"); - const chunks = [ - ...chunkByCharacter(doc, { - maxCharacters: 10, - overlap: 0, - }), - ]; - expect(chunks).toHaveLength(1); - expect(chunks[0]!.content).toBe("abc"); - }); -}); diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-character.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-character.ts deleted file mode 100644 index 948a8d4..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-character.ts +++ /dev/null @@ -1,40 +0,0 @@ -/** - * Character-based chunking strategy. - * - * Splits document content into fixed-size character windows - * with configurable overlap. - * - * @module - */ - -import { Chunk, type Document } from "@nvisy/core"; - -/** Character-strategy parameters. */ -export interface CharacterStrategyParams { - /** Maximum chunk size in characters. */ - readonly maxCharacters: number; - /** Number of overlapping characters between chunks. */ - readonly overlap: number; -} - -/** Split a document into fixed-size character chunks with optional overlap. 
*/ -export function* chunkByCharacter( - doc: Document, - params: CharacterStrategyParams, -): Generator<Chunk> { - const text = doc.content; - const step = params.maxCharacters - params.overlap; - if (step <= 0) return; - - const total = Math.ceil(text.length / step); - let index = 0; - let start = 0; - while (start < text.length) { - yield new Chunk(text.slice(start, start + params.maxCharacters), { - chunkIndex: index, - chunkTotal: total, - }).deriveFrom(doc); - index++; - start += step; - } -} diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts deleted file mode 100644 index 0f31acb..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-page.test.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { chunkByPage } from "./chunk-by-page.js"; - -describe("chunkByPage", () => { - it("splits on form feed markers", () => { - const doc = new Document("Page 1\fPage 2\fPage 3"); - const chunks = [...chunkByPage(doc, {})]; - expect(chunks).toHaveLength(3); - expect(chunks[0]!.content).toBe("Page 1"); - expect(chunks[1]!.content).toBe("Page 2"); - expect(chunks[2]!.content).toBe("Page 3"); - }); - - describe("maxCharacters", () => { - it("splits long pages into smaller chunks", () => { - const longPage = "a".repeat(100); - const doc = new Document(`${longPage}\f${"b".repeat(20)}`); - const chunks = [...chunkByPage(doc, { maxCharacters: 30 })]; - for (const chunk of chunks) { - expect(chunk.content.length).toBeLessThanOrEqual(30); - } - // 100 chars / 30 = 4 pieces + 1 short page = 5 - expect(chunks).toHaveLength(5); - }); - - it("leaves short pages intact", () => { - const doc = new Document("Page 1\fPage 2"); - const chunks = [...chunkByPage(doc, { maxCharacters: 1000 })]; - expect(chunks).toHaveLength(2); - expect(chunks[0]!.content).toBe("Page 1"); - expect(chunks[1]!.content).toBe("Page 2"); - }); - - it("updates chunkIndex and chunkTotal after splitting", () => { - const doc = new Document("a".repeat(50)); - const chunks = [...chunkByPage(doc, { maxCharacters: 20 })]; - for (let i = 0; i < chunks.length; i++) { - expect(chunks[i]!.chunkIndex).toBe(i); - expect(chunks[i]!.chunkTotal).toBe(chunks.length); - } - }); - }); - - it("derives all chunks from the source document", () => { - const doc = new Document("Page 1\fPage 2"); - const chunks = [...chunkByPage(doc, {})]; - for (const chunk of chunks) { - expect(chunk.parentId).toBe(doc.id); - } - }); -}); diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-page.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-page.ts deleted file mode 100644 index c317614..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-page.ts +++ /dev/null @@ -1,70 +0,0 @@ -/** - * Page-based chunking strategy. - * - * Splits documents at page boundaries. When structured elements with - * page numbers are available, elements are grouped by page; otherwise - * the raw text is split on common page-break markers (`\f`, `---`, - * `***`). - * - * @module - */ - -import { Chunk, Document } from "@nvisy/core"; - -/** Page-strategy parameters. */ -export interface PageStrategyParams { - /** Optional maximum chunk size in characters. Splits pages that exceed this limit. */ - readonly maxCharacters?: number | undefined; -} - -/** Split a document on page boundaries (elements or text markers). 
*/ -export function* chunkByPage( - doc: Document, - params: PageStrategyParams, -): Generator<Chunk> { - let texts: string[]; - - // Element-based path: group elements by page number - if (doc.elements != null && doc.elements.length > 0) { - texts = [...doc.getElementsByPage().entries()] - .sort(([a], [b]) => a - b) - .map(([, els]) => Document.fromElements(els).content); - } else { - // Fallback: split on common page break markers - const pages = doc.content.split(/\f|\n---\n|\n\*\*\*\n/); - const filtered: string[] = []; - for (const page of pages) { - const trimmed = page.trim(); - if (trimmed.length > 0) { - filtered.push(trimmed); - } - } - texts = filtered.length > 0 ? filtered : [doc.content]; - } - - if (params.maxCharacters != null) { - texts = splitLongTexts(texts, params.maxCharacters); - } - - for (let i = 0; i < texts.length; i++) { - yield new Chunk(texts[i]!, { - chunkIndex: i, - chunkTotal: texts.length, - }).deriveFrom(doc); - } -} - -/** Split texts that exceed maxCharacters into smaller pieces. */ -function splitLongTexts(texts: string[], max: number): string[] { - const result: string[] = []; - for (const text of texts) { - if (text.length <= max) { - result.push(text); - } else { - for (let i = 0; i < text.length; i += max) { - result.push(text.slice(i, i + max)); - } - } - } - return result; -} diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts deleted file mode 100644 index 361b512..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-section.test.ts +++ /dev/null @@ -1,142 +0,0 @@ -import { Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { chunkBySection } from "./chunk-by-section.js"; - -describe("chunkBySection", () => { - const markdown = [ - "Intro text", - "## Section A", - "Content A is here", - "## Section B", - "Content B is here", - "## Section C", - "Content C is short", - ].join("\n"); - - it("splits on heading level", () => { - const doc = new Document(markdown); - const chunks = [...chunkBySection(doc, { level: 2 })]; - expect(chunks).toHaveLength(4); - expect(chunks[0]!.content).toBe("Intro text"); - expect(chunks[1]!.content).toContain("Section A"); - expect(chunks[2]!.content).toContain("Section B"); - expect(chunks[3]!.content).toContain("Section C"); - }); - - describe("maxCharacters", () => { - it("splits long sections into smaller chunks", () => { - const longContent = `## Title\n${"x".repeat(100)}`; - const doc = new Document(longContent); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - maxCharacters: 30, - }), - ]; - for (const chunk of chunks) { - expect(chunk.content.length).toBeLessThanOrEqual(30); - } - expect(chunks.length).toBeGreaterThan(1); - }); - - it("leaves short sections intact", () => { - const doc = new Document("## Short\nHello"); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - maxCharacters: 1000, - }), - ]; - expect(chunks).toHaveLength(1); - expect(chunks[0]!.content).toContain("Hello"); - }); - - it("updates chunkIndex and chunkTotal after splitting", () => { - const longContent = `## Title\n${"a".repeat(50)}`; - const doc = new Document(longContent); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - maxCharacters: 20, - }), - ]; - for (let i = 0; i < chunks.length; i++) { - expect(chunks[i]!.chunkIndex).toBe(i); - expect(chunks[i]!.chunkTotal).toBe(chunks.length); - } - }); - }); - - describe("combineUnder", () => { - it("merges consecutive short 
sections", () => { - const short = "## A\naa\n## B\nbb\n## C\ncc"; - const doc = new Document(short); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - combineUnder: 200, - }), - ]; - expect(chunks).toHaveLength(1); - expect(chunks[0]!.content).toContain("## A"); - expect(chunks[0]!.content).toContain("## C"); - }); - - it("does not merge sections that exceed threshold", () => { - const sections = [ - "## A", - "a".repeat(50), - "## B", - "b".repeat(50), - "## C", - "c".repeat(50), - ].join("\n"); - const doc = new Document(sections); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - combineUnder: 30, - }), - ]; - expect(chunks).toHaveLength(3); - }); - - it("combines then splits with both options", () => { - const sections = "## A\naa\n## B\nbb\n## C\ncc"; - const doc = new Document(sections); - // Combine first (all short), then split result - const chunks = [ - ...chunkBySection(doc, { - level: 2, - combineUnder: 500, - maxCharacters: 10, - }), - ]; - for (const chunk of chunks) { - expect(chunk.content.length).toBeLessThanOrEqual(10); - } - }); - - it("keeps long sections separate", () => { - const sections = ["## Short", "hi", "## Long", "x".repeat(200)].join( - "\n", - ); - const doc = new Document(sections); - const chunks = [ - ...chunkBySection(doc, { - level: 2, - combineUnder: 50, - }), - ]; - expect(chunks.length).toBeGreaterThanOrEqual(2); - }); - }); - - it("derives all chunks from the source document", () => { - const doc = new Document(markdown); - const chunks = [...chunkBySection(doc, { level: 2 })]; - for (const chunk of chunks) { - expect(chunk.parentId).toBe(doc.id); - } - }); -}); diff --git a/packages/nvisy-plugin-core/src/actions/chunk-by-section.ts b/packages/nvisy-plugin-core/src/actions/chunk-by-section.ts deleted file mode 100644 index d22eb74..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk-by-section.ts +++ /dev/null @@ -1,125 +0,0 @@ -/** - * Section-based chunking strategy. - * - * Splits documents at markdown heading boundaries. When structured - * elements are available, headings are matched by {@link Element.level}; - * otherwise the raw text is split on `#`-prefixed lines. - * - * @module - */ - -import { Chunk, Document, type Element } from "@nvisy/core"; - -/** Section-strategy parameters. */ -export interface SectionStrategyParams { - /** Heading level to split on (1-6). */ - readonly level: number; - /** Optional maximum chunk size in characters. Splits sections that exceed this limit. */ - readonly maxCharacters?: number | undefined; - /** Combine consecutive sections whose text is shorter than this threshold. */ - readonly combineUnder?: number | undefined; -} - -/** Split a document into sections at markdown headings of the given level. */ -export function* chunkBySection( - doc: Document, - params: SectionStrategyParams, -): Generator<Chunk> { - let texts: string[]; - - // Element-based path: split structured elements by heading level - if (doc.elements != null && doc.elements.length > 0) { - texts = splitByHeadingLevel(doc.elements, params.level).map( - (els) => Document.fromElements(els).content, - ); - } else { - // Fallback: string-based splitting - const prefix = "#".repeat(params.level); - const pattern = new RegExp(`^${prefix}\\s`, "m"); - const parts = doc.content.split(pattern); - - const sections: string[] = []; - for (let i = 0; i < parts.length; i++) { - const part = parts[i]!.trim(); - if (part.length === 0) continue; - // Re-add the heading prefix for sections after the first - sections.push(i > 0 ? 
`${prefix} ${part}` : part); - } - texts = sections.length > 0 ? sections : [doc.content]; - } - - if (params.combineUnder != null) { - texts = combineShortTexts(texts, params.combineUnder); - } - - if (params.maxCharacters != null) { - texts = splitLongTexts(texts, params.maxCharacters); - } - - for (let i = 0; i < texts.length; i++) { - yield new Chunk(texts[i]!, { - chunkIndex: i, - chunkTotal: texts.length, - }).deriveFrom(doc); - } -} - -/** Combine consecutive texts that are shorter than the threshold. */ -function combineShortTexts(texts: string[], threshold: number): string[] { - const result: string[] = []; - let buffer = ""; - - for (const text of texts) { - if (buffer.length === 0) { - buffer = text; - } else if (buffer.length + text.length < threshold) { - buffer += `\n\n${text}`; - } else { - result.push(buffer); - buffer = text; - } - } - if (buffer.length > 0) { - result.push(buffer); - } - return result; -} - -/** Split texts that exceed maxCharacters into smaller pieces. */ -function splitLongTexts(texts: string[], max: number): string[] { - const result: string[] = []; - for (const text of texts) { - if (text.length <= max) { - result.push(text); - } else { - for (let i = 0; i < text.length; i += max) { - result.push(text.slice(i, i + max)); - } - } - } - return result; -} - -/** Split elements into sections at headings of the given level. */ -function splitByHeadingLevel( - elements: readonly Element[], - level: number, -): Element[][] { - const sections: Element[][] = []; - let current: Element[] = []; - - for (const el of elements) { - if (el.type === "title" && el.level != null && el.level <= level) { - if (current.length > 0) { - sections.push(current); - } - current = [el]; - } else { - current.push(el); - } - } - if (current.length > 0) { - sections.push(current); - } - return sections; -} diff --git a/packages/nvisy-plugin-core/src/actions/chunk.ts b/packages/nvisy-plugin-core/src/actions/chunk.ts deleted file mode 100644 index 8cc8c83..0000000 --- a/packages/nvisy-plugin-core/src/actions/chunk.ts +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Rule-based chunk action that splits documents using character, - * section, or page strategies. - * - * @module - */ - -import { Action, Chunk, Document } from "@nvisy/core"; -import { z } from "zod"; -import { chunkByCharacter } from "./chunk-by-character.js"; -import { chunkByPage } from "./chunk-by-page.js"; -import { chunkBySection } from "./chunk-by-section.js"; - -export type { CharacterStrategyParams } from "./chunk-by-character.js"; -export type { PageStrategyParams } from "./chunk-by-page.js"; -export type { SectionStrategyParams } from "./chunk-by-section.js"; - -const BaseCharacter = z.object({ - maxCharacters: z.number(), - overlap: z.number().default(0), -}); - -const BaseSection = z.object({ - level: z.number().min(1).max(6).default(2), - maxCharacters: z.number().optional(), - combineUnder: z.number().optional(), -}); - -const BasePage = z.object({ - maxCharacters: z.number().optional(), -}); - -const CharacterStrategy = BaseCharacter.extend({ - strategy: z.literal("character"), -}); - -const SectionStrategy = BaseSection.extend({ - strategy: z.literal("section"), -}); - -const PageStrategy = BasePage.extend({ - strategy: z.literal("page"), -}); - -const ChunkParams = z.discriminatedUnion("strategy", [ - CharacterStrategy, - SectionStrategy, - PageStrategy, -]); - -/** - * Split documents into smaller chunks using various strategies. 
- *
- * - `"character"`: fixed-size character splitting with optional overlap
- * - `"section"`: split on markdown headings at a given level
- * - `"page"`: split on page boundary markers in content
- */
-export const chunkSimple = Action.withoutClient("chunk", {
-  types: [Document, Chunk],
-  params: ChunkParams,
-  transform: transformChunk,
-});
-
-async function* transformChunk(
-  stream: AsyncIterable<Document>,
-  params: z.infer<typeof ChunkParams>,
-): AsyncGenerator<Chunk> {
-  for await (const doc of stream) {
-    switch (params.strategy) {
-      case "character": {
-        const { strategy: _, ...rest } = params;
-        yield* chunkByCharacter(doc, rest);
-        break;
-      }
-      case "section": {
-        const { strategy: _, ...rest } = params;
-        yield* chunkBySection(doc, rest);
-        break;
-      }
-      case "page": {
-        const { strategy: _, ...rest } = params;
-        yield* chunkByPage(doc, rest);
-        break;
-      }
-    }
-  }
-}
diff --git a/packages/nvisy-plugin-core/src/actions/index.ts b/packages/nvisy-plugin-core/src/actions/index.ts
deleted file mode 100644
index e1b0d0c..0000000
--- a/packages/nvisy-plugin-core/src/actions/index.ts
+++ /dev/null
@@ -1,8 +0,0 @@
-/**
- * @module actions
- *
- * Built-in rule-based transforms for the core plugin.
- */
-
-export { chunkSimple } from "./chunk.js";
-export { partition } from "./partition.js";
diff --git a/packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts b/packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts
deleted file mode 100644
index 9bd3bdb..0000000
--- a/packages/nvisy-plugin-core/src/actions/partition-by-auto.test.ts
+++ /dev/null
@@ -1,24 +0,0 @@
-import { Document } from "@nvisy/core";
-import { describe, expect, it } from "vitest";
-import { partitionByAuto } from "./partition-by-auto.js";
-
-describe("partitionByAuto", () => {
-  it("returns document content as a single-element array", () => {
-    const doc = new Document("Hello, world!");
-    const parts = partitionByAuto(doc, {});
-    expect(parts).toEqual(["Hello, world!"]);
-  });
-
-  it("preserves full content without modification", () => {
-    const content = "Line 1\nLine 2\n\nParagraph 2";
-    const doc = new Document(content);
-    const parts = partitionByAuto(doc, {});
-    expect(parts).toEqual([content]);
-  });
-
-  it("handles empty content", () => {
-    const doc = new Document("");
-    const parts = partitionByAuto(doc, {});
-    expect(parts).toEqual([""]);
-  });
-});
diff --git a/packages/nvisy-plugin-core/src/actions/partition-by-auto.ts b/packages/nvisy-plugin-core/src/actions/partition-by-auto.ts
deleted file mode 100644
index 734bf1b..0000000
--- a/packages/nvisy-plugin-core/src/actions/partition-by-auto.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-/**
- * Auto partition strategy.
- *
- * Passes document content through as-is, preserving structured
- * elements when present.
- *
- * @module
- */
-
-import type { Document } from "@nvisy/core";
-
-/** Auto-strategy parameters (no additional options). */
-export type AutoStrategyParams = Record<string, never>;
-
-/** Pass document content through unchanged.
*/ -export function partitionByAuto( - doc: Document, - _params: AutoStrategyParams, -): string[] { - return [doc.content]; -} diff --git a/packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts b/packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts deleted file mode 100644 index 15191b7..0000000 --- a/packages/nvisy-plugin-core/src/actions/partition-by-rule.test.ts +++ /dev/null @@ -1,113 +0,0 @@ -import { Document, TableElement } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { partitionByRule } from "./partition-by-rule.js"; - -describe("partitionByRule", () => { - it("splits on a regex pattern", () => { - const doc = new Document("one---two---three"); - const parts = partitionByRule(doc, { - pattern: "---", - includeDelimiter: false, - inferTableStructure: false, - }); - expect(parts).toEqual(["one", "two", "three"]); - }); - - describe("inferTableStructure", () => { - it("replaces table text with HTML when enabled", () => { - const table = new TableElement({ - type: "table", - text: "Name Age\nAlice 30", - cells: [ - { row: 0, column: 0, text: "Name", isHeader: true }, - { row: 0, column: 1, text: "Age", isHeader: true }, - { row: 1, column: 0, text: "Alice" }, - { row: 1, column: 1, text: "30" }, - ], - }); - - const doc = new Document("Before\n---\nName Age\nAlice 30\n---\nAfter", { - elements: [table], - }); - - const parts = partitionByRule(doc, { - pattern: "\n---\n", - includeDelimiter: false, - inferTableStructure: true, - }); - - expect(parts).toHaveLength(3); - expect(parts[0]).toBe("Before"); - expect(parts[1]).toContain("<table>"); - expect(parts[1]).toContain("<th>Name</th>"); - expect(parts[1]).toContain("<th>Age</th>"); - expect(parts[1]).toContain("<td>Alice</td>"); - expect(parts[1]).toContain("<td>30</td>"); - expect(parts[1]).toContain("</table>"); - expect(parts[2]).toBe("After"); - }); - - it("does not modify content when disabled", () => { - const table = new TableElement({ - type: "table", - text: "Name Age", - cells: [ - { row: 0, column: 0, text: "Name", isHeader: true }, - { row: 0, column: 1, text: "Age", isHeader: true }, - ], - }); - - const doc = new Document("Name Age", { elements: [table] }); - - const parts = partitionByRule(doc, { - pattern: "---", - includeDelimiter: false, - inferTableStructure: false, - }); - - expect(parts).toEqual(["Name Age"]); - }); - - it("ignores elements without cells", () => { - const table = new TableElement({ - type: "table", - text: "some table", - }); - - const doc = new Document("some table", { elements: [table] }); - - const parts = partitionByRule(doc, { - pattern: "---", - includeDelimiter: false, - inferTableStructure: true, - }); - - expect(parts).toEqual(["some table"]); - }); - - it("sorts cells by row and column", () => { - const table = new TableElement({ - type: "table", - text: "data", - cells: [ - { row: 1, column: 1, text: "D" }, - { row: 0, column: 1, text: "B", isHeader: true }, - { row: 1, column: 0, text: "C" }, - { row: 0, column: 0, text: "A", isHeader: true }, - ], - }); - - const doc = new Document("data", { elements: [table] }); - - const parts = partitionByRule(doc, { - pattern: "---", - includeDelimiter: false, - inferTableStructure: true, - }); - - expect(parts[0]).toBe( - "<table><tr><th>A</th><th>B</th></tr><tr><td>C</td><td>D</td></tr></table>", - ); - }); - }); -}); diff --git a/packages/nvisy-plugin-core/src/actions/partition-by-rule.ts b/packages/nvisy-plugin-core/src/actions/partition-by-rule.ts deleted file mode 100644 index 
ae14ce8..0000000 --- a/packages/nvisy-plugin-core/src/actions/partition-by-rule.ts +++ /dev/null @@ -1,80 +0,0 @@ -/** - * Rule-based partition strategy. - * - * Splits document content using a user-supplied regex pattern. - * Optionally infers HTML table structure from structured - * {@link TableElement} cells. - * - * @module - */ - -import type { Document } from "@nvisy/core"; -import { type Element, type TableCellData, TableElement } from "@nvisy/core"; - -/** Rule-strategy parameters. */ -export interface RuleStrategyParams { - /** Regex pattern to split content on. */ - readonly pattern: string; - /** Whether to include the delimiter in chunks. */ - readonly includeDelimiter: boolean; - /** Replace table element text with inferred HTML table markup. */ - readonly inferTableStructure: boolean; -} - -/** Split document content using a regex pattern. */ -export function partitionByRule( - doc: Document, - params: RuleStrategyParams, -): string[] { - let content = doc.content; - - if (params.inferTableStructure && doc.elements != null) { - content = applyTableStructure(content, doc.elements); - } - - const regex = new RegExp(params.pattern, "g"); - return content.split(regex).filter((p) => p.length > 0); -} - -/** Replace plain-text table representations with HTML tables built from cell data. */ -function applyTableStructure( - content: string, - elements: readonly Element[], -): string { - for (const el of elements) { - if ( - !(el instanceof TableElement) || - el.cells == null || - el.cells.length === 0 - ) { - continue; - } - - const html = cellsToHtml(el.cells); - content = content.replace(el.text, html); - } - return content; -} - -/** Build an HTML `<table>` string from structured cell data. */ -function cellsToHtml(cells: readonly TableCellData[]): string { - const rows = new Map<number, (typeof cells)[number][]>(); - for (const cell of cells) { - let row = rows.get(cell.row); - if (row == null) { - row = []; - rows.set(cell.row, row); - } - row.push(cell); - } - - const lines: string[] = ["<table>"]; - for (const [, rowCells] of [...rows.entries()].sort(([a], [b]) => a - b)) { - rowCells.sort((a, b) => a.column - b.column); - const tag = rowCells[0]?.isHeader ? "th" : "td"; - const cellHtml = rowCells.map((c) => `<${tag}>${c.text}</${tag}>`).join(""); - lines.push(`<tr>${cellHtml}</tr>`); - } - lines.push("</table>"); - return lines.join(""); -} diff --git a/packages/nvisy-plugin-core/src/actions/partition.ts b/packages/nvisy-plugin-core/src/actions/partition.ts deleted file mode 100644 index 559e336..0000000 --- a/packages/nvisy-plugin-core/src/actions/partition.ts +++ /dev/null @@ -1,87 +0,0 @@ -/** - * Rule-based partition action that splits documents using auto - * pass-through or regex-based splitting. 
- *
- * @module
- */
-
-import type { Metadata } from "@nvisy/core";
-import { Action, Document } from "@nvisy/core";
-import { z } from "zod";
-import { partitionByAuto } from "./partition-by-auto.js";
-import { partitionByRule } from "./partition-by-rule.js";
-
-export type { AutoStrategyParams } from "./partition-by-auto.js";
-export type { RuleStrategyParams } from "./partition-by-rule.js";
-
-const BaseAuto = z.object({});
-
-const BaseRule = z.object({
-  pattern: z.string(),
-  includeDelimiter: z.boolean().default(false),
-  inferTableStructure: z.boolean().default(false),
-});
-
-const AutoStrategy = BaseAuto.extend({
-  strategy: z.literal("auto"),
-});
-
-const RuleStrategy = BaseRule.extend({
-  strategy: z.literal("rule"),
-});
-
-const PartitionParams = z.discriminatedUnion("strategy", [
-  AutoStrategy,
-  RuleStrategy,
-]);
-
-/**
- * Partition documents into structured documents.
- *
- * - `"auto"`: pass through document content as-is
- * - `"rule"`: split content using a regex pattern
- */
-export const partition = Action.withoutClient("partition", {
-  types: [Document, Document],
-  params: PartitionParams,
-  transform: transformPartition,
-});
-
-async function* transformPartition(
-  stream: AsyncIterable<Document>,
-  params: z.infer<typeof PartitionParams>,
-): AsyncGenerator<Document> {
-  for await (const item of stream) {
-    let parts: string[];
-    switch (params.strategy) {
-      case "auto": {
-        const { strategy: _, ...rest } = params;
-        parts = partitionByAuto(item, rest);
-        break;
-      }
-      case "rule": {
-        const { strategy: _, ...rest } = params;
-        parts = partitionByRule(item, rest);
-        break;
-      }
-    }
-
-    const sourceId = item.id;
-    const baseMeta = item.metadata;
-
-    for (let i = 0; i < parts.length; i++) {
-      const metadata: Metadata = {
-        ...(baseMeta ?? {}),
-        partIndex: i,
-        partTotal: parts.length,
-      };
-      yield new Document(parts[i]!, {
-        ...(params.strategy === "auto" && item.elements != null
-          ?
{ elements: item.elements } - : {}), - }) - .withParent(sourceId) - .withMetadata(metadata); - } - } -} diff --git a/packages/nvisy-plugin-core/src/index.ts b/packages/nvisy-plugin-core/src/index.ts deleted file mode 100644 index 5be4e00..0000000 --- a/packages/nvisy-plugin-core/src/index.ts +++ /dev/null @@ -1,42 +0,0 @@ -import { - Blob, - Chunk, - Datatype, - Document, - Embedding, - Plugin, -} from "@nvisy/core"; -import { chunkSimple, partition } from "./actions/index.js"; -import { csvLoader, jsonLoader, plaintextLoader } from "./loaders/index.js"; - -export const corePlugin = Plugin.define("core") - .withDatatypes( - Datatype.define("document", Document), - Datatype.define("blob", Blob), - Datatype.define("chunk", Chunk), - Datatype.define("embedding", Embedding), - ) - .withActions(chunkSimple, partition) - .withLoaders(plaintextLoader, csvLoader, jsonLoader); - -export type { - CharacterStrategyParams, - PageStrategyParams, - SectionStrategyParams, -} from "./actions/chunk.js"; -export { chunkSimple, partition } from "./actions/index.js"; -export type { - AutoStrategyParams, - RuleStrategyParams, -} from "./actions/partition.js"; -export type { CsvParams } from "./loaders/csv.js"; -export { csvLoader, csvParamsSchema } from "./loaders/csv.js"; -export type { JsonParams } from "./loaders/json.js"; -export { jsonLoader, jsonParamsSchema } from "./loaders/json.js"; -export type { PlaintextParams } from "./loaders/plaintext.js"; -export { plaintextLoader, plaintextParamsSchema } from "./loaders/plaintext.js"; -export type { - DelimiterSplitOptions, - RegexSplitOptions, -} from "./splitter/index.js"; -export { splitByDelimiter, splitByRegex } from "./splitter/index.js"; diff --git a/packages/nvisy-plugin-core/src/loaders/csv.test.ts b/packages/nvisy-plugin-core/src/loaders/csv.test.ts deleted file mode 100644 index b42cf2b..0000000 --- a/packages/nvisy-plugin-core/src/loaders/csv.test.ts +++ /dev/null @@ -1,169 +0,0 @@ -import { Blob, type Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { csvLoader } from "./csv.js"; - -async function collectDocs(iter: AsyncIterable<Document>) { - const docs = []; - for await (const doc of iter) { - docs.push(doc); - } - return docs; -} - -describe("csvLoader", () => { - it("has id 'csv'", () => { - expect(csvLoader.id).toBe("csv"); - }); - - it("matches .csv and .tsv extensions", () => { - expect(csvLoader.extensions).toContain(".csv"); - expect(csvLoader.extensions).toContain(".tsv"); - }); - - it("matches text/csv content type", () => { - expect(csvLoader.contentTypes).toContain("text/csv"); - }); - - it("parses CSV with headers into a single document", async () => { - const csv = "name,age\nAlice,30\nBob,25"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("name: Alice\nage: 30\n\nname: Bob\nage: 25"); - }); - - it("parses CSV without headers", async () => { - const csv = "Alice,30\nBob,25"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: false, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("Alice,30\nBob,25"); - }); - - it("supports tab delimiter for TSV", async () => { - const tsv = "name\tage\nAlice\t30"; - const blob = new Blob("data.tsv", Buffer.from(tsv)); - 
const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: "\t", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("name: Alice\nage: 30"); - }); - - it("handles quoted fields with commas", async () => { - const csv = 'name,address\nAlice,"123 Main St, Apt 4"'; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs[0]!.content).toContain("address: 123 Main St, Apt 4"); - }); - - it("handles escaped quotes in fields", async () => { - const csv = 'name,note\nAlice,"She said ""hello"""'; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs[0]!.content).toContain('note: She said "hello"'); - }); - - it("derives document from blob", async () => { - const csv = "a\n1\n2"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs[0]!.parentId).toBe(blob.id); - }); - - it("handles empty file", async () => { - const blob = new Blob("empty.csv", Buffer.alloc(0)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(0); - }); - - it("handles header-only file", async () => { - const csv = "name,age"; - const blob = new Blob("header.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(0); - }); - - it("handles CRLF line endings", async () => { - const csv = "name,age\r\nAlice,30\r\nBob,25"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const docs = await collectDocs( - csvLoader.load(blob, { - delimiter: ",", - hasHeader: true, - encoding: "utf-8", - }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("name: Alice\nage: 30\n\nname: Bob\nage: 25"); - }); - - it("uses defaults for optional params", async () => { - const csv = "a,b\n1,2"; - const blob = new Blob("data.csv", Buffer.from(csv)); - const params = csvLoader.schema.parse({}); - const docs = await collectDocs(csvLoader.load(blob, params)); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("a: 1\nb: 2"); - }); - - it("schema rejects unknown properties", () => { - expect(() => csvLoader.schema.parse({ extra: "field" })).toThrow(); - }); -}); diff --git a/packages/nvisy-plugin-core/src/loaders/csv.ts b/packages/nvisy-plugin-core/src/loaders/csv.ts deleted file mode 100644 index 826ce9d..0000000 --- a/packages/nvisy-plugin-core/src/loaders/csv.ts +++ /dev/null @@ -1,78 +0,0 @@ -/** - * CSV loader. - * - * Converts `.csv` and `.tsv` blobs into a single Document. - * When a header row is present the content is formatted as - * `"column: value"` blocks separated by blank lines; otherwise - * raw delimited rows are used. - * - * @module - */ - -import { type Blob, Document, Loader } from "@nvisy/core"; -import { parse } from "csv-parse/sync"; -import { z } from "zod"; - -/** Schema for CSV loader parameters. */ -export const csvParamsSchema = z - .object({ - /** Column delimiter. Defaults to `","`. 
*/ - delimiter: z.string().optional().default(","), - /** Whether the first row contains column headers. Defaults to `true`. */ - hasHeader: z.boolean().optional().default(true), - /** Character encoding of the blob data. Defaults to `"utf-8"`. */ - encoding: z - .enum(["utf-8", "ascii", "latin1", "utf16le"]) - .optional() - .default("utf-8"), - }) - .strict(); - -export type CsvParams = z.infer<typeof csvParamsSchema>; - -/** - * Loader that converts CSV/TSV blobs into a single Document. - * - * Header columns are stored as metadata on the Document. - */ -export const csvLoader = Loader.define<CsvParams>("csv", { - extensions: [".csv", ".tsv"], - contentTypes: ["text/csv", "text/tab-separated-values"], - params: csvParamsSchema, - load: loadCsv, -}); - -async function* loadCsv( - blob: Blob, - params: CsvParams, -): AsyncGenerator<Document> { - const text = blob.data.toString(params.encoding); - if (text.trim().length === 0) return; - - const records: string[][] = parse(text, { - delimiter: params.delimiter, - relax_column_count: true, - skip_empty_lines: true, - }); - if (records.length === 0) return; - - let headers: string[] | null = null; - let dataRows: string[][] = records; - - if (params.hasHeader) { - headers = records[0]!; - dataRows = records.slice(1); - } - - if (dataRows.length === 0) return; - - const content = headers - ? dataRows - .map((row) => headers.map((h, j) => `${h}: ${row[j] ?? ""}`).join("\n")) - .join("\n\n") - : dataRows.map((row) => row.join(params.delimiter)).join("\n"); - - const doc = new Document(content); - doc.deriveFrom(blob); - yield doc; -} diff --git a/packages/nvisy-plugin-core/src/loaders/index.ts b/packages/nvisy-plugin-core/src/loaders/index.ts deleted file mode 100644 index f317cc3..0000000 --- a/packages/nvisy-plugin-core/src/loaders/index.ts +++ /dev/null @@ -1,12 +0,0 @@ -/** - * @module loaders - * - * Built-in loaders for the core plugin. 
- */ - -export type { CsvParams } from "./csv.js"; -export { csvLoader, csvParamsSchema } from "./csv.js"; -export type { JsonParams } from "./json.js"; -export { jsonLoader, jsonParamsSchema } from "./json.js"; -export type { PlaintextParams } from "./plaintext.js"; -export { plaintextLoader, plaintextParamsSchema } from "./plaintext.js"; diff --git a/packages/nvisy-plugin-core/src/loaders/json.test.ts b/packages/nvisy-plugin-core/src/loaders/json.test.ts deleted file mode 100644 index 5250931..0000000 --- a/packages/nvisy-plugin-core/src/loaders/json.test.ts +++ /dev/null @@ -1,168 +0,0 @@ -import { Blob, type Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { jsonLoader } from "./json.js"; - -async function collectDocs(iter: AsyncIterable<Document>) { - const docs = []; - for await (const doc of iter) { - docs.push(doc); - } - return docs; -} - -describe("jsonLoader", () => { - it("has id 'json'", () => { - expect(jsonLoader.id).toBe("json"); - }); - - it("matches .json, .jsonl, and .ndjson extensions", () => { - expect(jsonLoader.extensions).toContain(".json"); - expect(jsonLoader.extensions).toContain(".jsonl"); - expect(jsonLoader.extensions).toContain(".ndjson"); - }); - - it("matches application/json content type", () => { - expect(jsonLoader.contentTypes).toContain("application/json"); - }); - - describe("JSON files", () => { - it("creates one document from a JSON object", async () => { - const json = JSON.stringify({ name: "Alice", age: 30 }); - const blob = new Blob("data.json", Buffer.from(json)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - }); - - it("promotes scalar fields to metadata", async () => { - const json = JSON.stringify({ name: "Alice", age: 30, active: true }); - const blob = new Blob("data.json", Buffer.from(json)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.metadata).toMatchObject({ - name: "Alice", - age: 30, - active: true, - }); - }); - - it("creates one document from a JSON array", async () => { - const json = JSON.stringify([ - { id: 1, text: "first" }, - { id: 2, text: "second" }, - ]); - const blob = new Blob("data.json", Buffer.from(json)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe( - JSON.stringify( - [ - { id: 1, text: "first" }, - { id: 2, text: "second" }, - ], - null, - 2, - ), - ); - }); - - it("handles string JSON values", async () => { - const json = JSON.stringify("just a string"); - const blob = new Blob("data.json", Buffer.from(json)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("just a string"); - }); - - it("pretty-prints object content", async () => { - const obj = { key: "value" }; - const blob = new Blob("data.json", Buffer.from(JSON.stringify(obj))); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.content).toBe(JSON.stringify(obj, null, 2)); - }); - - it("derives document from blob", async () => { - const json = JSON.stringify({ a: 1 }); - const blob = new Blob("data.json", Buffer.from(json)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.parentId).toBe(blob.id); - }); - }); - - describe("JSONL files", () => { - it("creates one 
document from JSONL", async () => { - const jsonl = '{"id":1}\n{"id":2}\n{"id":3}'; - const blob = new Blob("data.jsonl", Buffer.from(jsonl)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe( - JSON.stringify([{ id: 1 }, { id: 2 }, { id: 3 }], null, 2), - ); - }); - - it("skips empty lines", async () => { - const jsonl = '{"a":1}\n\n{"b":2}\n'; - const blob = new Blob("data.jsonl", Buffer.from(jsonl)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe( - JSON.stringify([{ a: 1 }, { b: 2 }], null, 2), - ); - }); - - it("handles .ndjson extension", async () => { - const ndjson = '{"x":1}\n{"x":2}'; - const blob = new Blob("data.ndjson", Buffer.from(ndjson)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - }); - - it("derives document from blob", async () => { - const jsonl = '{"a":1}\n{"b":2}'; - const blob = new Blob("data.jsonl", Buffer.from(jsonl)); - const docs = await collectDocs( - jsonLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.parentId).toBe(blob.id); - }); - }); - - it("uses defaults for optional params", async () => { - const json = JSON.stringify({ hello: "world" }); - const blob = new Blob("data.json", Buffer.from(json)); - const params = jsonLoader.schema.parse({}); - const docs = await collectDocs(jsonLoader.load(blob, params)); - - expect(docs).toHaveLength(1); - }); - - it("schema rejects unknown properties", () => { - expect(() => jsonLoader.schema.parse({ extra: "field" })).toThrow(); - }); -}); diff --git a/packages/nvisy-plugin-core/src/loaders/json.ts b/packages/nvisy-plugin-core/src/loaders/json.ts deleted file mode 100644 index 43c33c8..0000000 --- a/packages/nvisy-plugin-core/src/loaders/json.ts +++ /dev/null @@ -1,82 +0,0 @@ -/** - * JSON / JSON Lines loader. - * - * Converts `.json`, `.jsonl`, and `.ndjson` blobs into a single - * Document whose content is the pretty-printed JSON text. - * For JSONL/NDJSON files the lines are collected into an array first. - * - * @module - */ - -import { type Blob, Document, Loader } from "@nvisy/core"; -import { z } from "zod"; - -/** Schema for JSON loader parameters. */ -export const jsonParamsSchema = z - .object({ - /** Character encoding of the blob data. Defaults to `"utf-8"`. */ - encoding: z - .enum(["utf-8", "ascii", "latin1", "utf16le"]) - .optional() - .default("utf-8"), - }) - .strict(); - -export type JsonParams = z.infer<typeof jsonParamsSchema>; - -/** - * Loader that converts JSON / JSONL blobs into a single Document. - * - * Scalar object fields are promoted to metadata. - */ -export const jsonLoader = Loader.define<JsonParams>("json", { - extensions: [".json", ".jsonl", ".ndjson"], - contentTypes: ["application/json", "application/x-ndjson"], - params: jsonParamsSchema, - load: loadJson, -}); - -async function* loadJson( - blob: Blob, - params: JsonParams, -): AsyncGenerator<Document> { - const text = blob.data.toString(params.encoding); - const isJsonLines = - blob.path.endsWith(".jsonl") || blob.path.endsWith(".ndjson"); - - const parsed: unknown = isJsonLines ? parseJsonLines(text) : JSON.parse(text); - const content = - typeof parsed === "string" ? 
parsed : JSON.stringify(parsed, null, 2); - - const doc = new Document(content); - doc.deriveFrom(blob); - - if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) { - const metadata: Record<string, string | number | boolean> = {}; - for (const [k, v] of Object.entries(parsed)) { - if ( - typeof v === "string" || - typeof v === "number" || - typeof v === "boolean" - ) { - metadata[k] = v; - } - } - if (Object.keys(metadata).length > 0) { - doc.withMetadata(metadata); - } - } - - yield doc; -} - -/** Parse newline-delimited JSON into an array of values. */ -function parseJsonLines(text: string): unknown[] { - const results: unknown[] = []; - for (const line of text.split(/\r?\n/)) { - const trimmed = line.trim(); - if (trimmed.length === 0) continue; - results.push(JSON.parse(trimmed)); - } - return results; -} diff --git a/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts deleted file mode 100644 index 03507ee..0000000 --- a/packages/nvisy-plugin-core/src/loaders/plaintext.test.ts +++ /dev/null @@ -1,103 +0,0 @@ -import { Blob, type Document } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { plaintextLoader } from "./plaintext.js"; - -async function collectDocs(iter: AsyncIterable<Document>) { - const docs = []; - for await (const doc of iter) { - docs.push(doc); - } - return docs; -} - -describe("plaintextLoader", () => { - it("has id 'plaintext'", () => { - expect(plaintextLoader.id).toBe("plaintext"); - }); - - it("matches .txt extension", () => { - expect(plaintextLoader.extensions).toContain(".txt"); - }); - - it("matches text/plain content type", () => { - expect(plaintextLoader.contentTypes).toContain("text/plain"); - }); - - it("converts utf-8 text blob to document", async () => { - const blob = new Blob("readme.txt", Buffer.from("Hello, world!")); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe("Hello, world!"); - }); - - it("derives document from blob (sets parentId)", async () => { - const blob = new Blob("file.txt", Buffer.from("content")); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.parentId).toBe(blob.id); - expect(docs[0]!.isDerived).toBe(true); - }); - - it("handles empty file", async () => { - const blob = new Blob("empty.txt", Buffer.alloc(0)); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs).toHaveLength(1); - expect(docs[0]!.content).toBe(""); - }); - - it("handles multiline content", async () => { - const content = "Line 1\nLine 2\nLine 3"; - const blob = new Blob("multi.txt", Buffer.from(content)); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "utf-8" }), - ); - - expect(docs[0]!.content).toBe(content); - }); - - it("supports ascii encoding", async () => { - const blob = new Blob("ascii.txt", Buffer.from("ASCII text", "ascii")); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "ascii" }), - ); - - expect(docs[0]!.content).toBe("ASCII text"); - }); - - it("supports latin1 encoding", async () => { - const blob = new Blob("latin.txt", Buffer.from("café", "latin1")); - const docs = await collectDocs( - plaintextLoader.load(blob, { encoding: "latin1" }), - ); - - expect(docs[0]!.content).toBe("café"); - }); - - it("defaults to utf-8 when encoding not specified", async () 
=> { - const blob = new Blob("utf8.txt", Buffer.from("Unicode: 你好")); - const params = plaintextLoader.schema.parse({}); - const docs = await collectDocs(plaintextLoader.load(blob, params)); - - expect(docs[0]!.content).toBe("Unicode: 你好"); - }); - - it("schema validates encoding enum", () => { - expect(() => - plaintextLoader.schema.parse({ encoding: "invalid" }), - ).toThrow(); - }); - - it("schema rejects unknown properties", () => { - expect(() => - plaintextLoader.schema.parse({ encoding: "utf-8", extra: "field" }), - ).toThrow(); - }); -}); diff --git a/packages/nvisy-plugin-core/src/loaders/plaintext.ts b/packages/nvisy-plugin-core/src/loaders/plaintext.ts deleted file mode 100644 index 5fe2202..0000000 --- a/packages/nvisy-plugin-core/src/loaders/plaintext.ts +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Plaintext loader. - * - * Converts `.txt` blobs into Documents by decoding the raw bytes - * with a configurable character encoding. - * - * @module - */ - -import { type Blob, Document, Loader } from "@nvisy/core"; -import { z } from "zod"; - -/** Schema for plaintext loader parameters. */ -export const plaintextParamsSchema = z - .object({ - /** Character encoding of the blob data. Defaults to "utf-8". */ - encoding: z - .enum(["utf-8", "ascii", "latin1", "utf16le"]) - .optional() - .default("utf-8"), - }) - .strict(); - -export type PlaintextParams = z.infer<typeof plaintextParamsSchema>; - -/** - * Loader that converts plaintext blobs (.txt files) into Documents. - * - * Reads the blob data as text using the specified encoding and - * creates a Document with the text content. - */ -export const plaintextLoader = Loader.define<PlaintextParams>("plaintext", { - extensions: [".txt"], - contentTypes: ["text/plain"], - params: plaintextParamsSchema, - load: loadPlaintext, -}); - -async function* loadPlaintext( - blob: Blob, - params: PlaintextParams, -): AsyncGenerator<Document> { - const content = blob.data.toString(params.encoding); - const doc = new Document(content); - doc.deriveFrom(blob); - yield doc; -} diff --git a/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts b/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts deleted file mode 100644 index 91c76d2..0000000 --- a/packages/nvisy-plugin-core/src/splitter/delimiter.test.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { splitByDelimiter } from "./delimiter.js"; - -describe("splitByDelimiter", () => { - it("splits on a simple delimiter", () => { - const result = splitByDelimiter("a,b,c", { delimiter: "," }); - expect(result).toEqual(["a", "b", "c"]); - }); - - it("splits on a multi-character delimiter", () => { - const result = splitByDelimiter("a---b---c", { delimiter: "---" }); - expect(result).toEqual(["a", "b", "c"]); - }); - - it("splits on newline delimiter", () => { - const result = splitByDelimiter("line1\nline2\nline3", { - delimiter: "\n", - }); - expect(result).toEqual(["line1", "line2", "line3"]); - }); - - it("returns [text] when delimiter is not found", () => { - const result = splitByDelimiter("no match here", { delimiter: "," }); - expect(result).toEqual(["no match here"]); - }); - - it("returns empty array for empty input (trimEmpty=true)", () => { - const result = splitByDelimiter("", { delimiter: "," }); - expect(result).toEqual([]); - }); - - it("returns [''] for empty input when trimEmpty=false", () => { - const result = splitByDelimiter("", { - delimiter: ",", - trimEmpty: false, - }); - expect(result).toEqual([""]); - }); - - describe("keepDelimiter", 
() => { - it("prepends delimiter to subsequent segments", () => { - const result = splitByDelimiter("a,b,c", { - delimiter: ",", - keepDelimiter: true, - }); - expect(result).toEqual(["a", ",b", ",c"]); - }); - - it("prepends multi-character delimiter", () => { - const result = splitByDelimiter("a---b---c", { - delimiter: "---", - keepDelimiter: true, - }); - expect(result).toEqual(["a", "---b", "---c"]); - }); - }); - - describe("trimEmpty", () => { - it("filters whitespace-only segments by default", () => { - const result = splitByDelimiter("a,, ,b", { delimiter: "," }); - expect(result).toEqual(["a", "b"]); - }); - - it("keeps whitespace-only segments when trimEmpty=false", () => { - const result = splitByDelimiter("a,, ,b", { - delimiter: ",", - trimEmpty: false, - }); - expect(result).toEqual(["a", "", " ", "b"]); - }); - }); - - it("handles consecutive delimiters", () => { - const result = splitByDelimiter("a,,b", { delimiter: "," }); - expect(result).toEqual(["a", "b"]); - }); - - it("handles delimiter at start and end", () => { - const result = splitByDelimiter(",a,b,", { delimiter: "," }); - expect(result).toEqual(["a", "b"]); - }); - - it("handles delimiter at start and end with trimEmpty=false", () => { - const result = splitByDelimiter(",a,b,", { - delimiter: ",", - trimEmpty: false, - }); - expect(result).toEqual(["", "a", "b", ""]); - }); -}); diff --git a/packages/nvisy-plugin-core/src/splitter/delimiter.ts b/packages/nvisy-plugin-core/src/splitter/delimiter.ts deleted file mode 100644 index 9251fdf..0000000 --- a/packages/nvisy-plugin-core/src/splitter/delimiter.ts +++ /dev/null @@ -1,31 +0,0 @@ -export interface DelimiterSplitOptions { - /** String to split on (e.g. `"\n"`, `"---"`). */ - readonly delimiter: string; - /** If true, keep the delimiter at the start of each subsequent segment. Default: false. */ - readonly keepDelimiter?: boolean; - /** Discard segments that are empty or whitespace-only after splitting. Default: true. */ - readonly trimEmpty?: boolean; -} - -/** Split `text` on a literal delimiter string. */ -export function splitByDelimiter( - text: string, - options: DelimiterSplitOptions, -): string[] { - const { delimiter, keepDelimiter = false, trimEmpty = true } = options; - - const raw = text.split(delimiter); - - let segments: string[]; - if (keepDelimiter) { - segments = raw.map((seg, i) => (i === 0 ? 
seg : `${delimiter}${seg}`)); - } else { - segments = raw; - } - - if (trimEmpty) { - segments = segments.filter((s) => s.trim().length > 0); - } - - return segments; -} diff --git a/packages/nvisy-plugin-core/src/splitter/index.ts b/packages/nvisy-plugin-core/src/splitter/index.ts deleted file mode 100644 index a9345a4..0000000 --- a/packages/nvisy-plugin-core/src/splitter/index.ts +++ /dev/null @@ -1,4 +0,0 @@ -export type { DelimiterSplitOptions } from "./delimiter.js"; -export { splitByDelimiter } from "./delimiter.js"; -export type { RegexSplitOptions } from "./regex.js"; -export { splitByRegex } from "./regex.js"; diff --git a/packages/nvisy-plugin-core/src/splitter/regex.test.ts b/packages/nvisy-plugin-core/src/splitter/regex.test.ts deleted file mode 100644 index 54c53c1..0000000 --- a/packages/nvisy-plugin-core/src/splitter/regex.test.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { splitByRegex } from "./regex.js"; - -describe("splitByRegex", () => { - it("splits on a simple pattern", () => { - const result = splitByRegex("a1b2c", { pattern: "\\d" }); - expect(result).toEqual(["a", "b", "c"]); - }); - - it("splits on a multi-character pattern", () => { - const result = splitByRegex("hello---world---end", { pattern: "-+" }); - expect(result).toEqual(["hello", "world", "end"]); - }); - - it("splits on newline patterns", () => { - const result = splitByRegex("line1\n\nline2\n\nline3", { - pattern: "\\n{2,}", - }); - expect(result).toEqual(["line1", "line2", "line3"]); - }); - - it("returns [text] when pattern does not match", () => { - const result = splitByRegex("no match here", { pattern: "\\d+" }); - expect(result).toEqual(["no match here"]); - }); - - it("returns empty array for empty input (trimEmpty=true)", () => { - const result = splitByRegex("", { pattern: "\\d" }); - expect(result).toEqual([]); - }); - - it("returns [''] for empty input when trimEmpty=false", () => { - const result = splitByRegex("", { pattern: "\\d", trimEmpty: false }); - expect(result).toEqual([""]); - }); - - describe("keepSeparator", () => { - it("prepends matched separator to subsequent segments", () => { - const result = splitByRegex("intro\n## A\ncontent A\n## B\ncontent B", { - pattern: "^## .+$", - keepSeparator: true, - }); - expect(result).toEqual([ - "intro\n", - "## A\ncontent A\n", - "## B\ncontent B", - ]); - }); - - it("keeps separator with simple pattern", () => { - const result = splitByRegex("a1b2c", { - pattern: "\\d", - keepSeparator: true, - }); - expect(result).toEqual(["a", "1b", "2c"]); - }); - }); - - describe("trimEmpty", () => { - it("filters whitespace-only segments by default", () => { - const result = splitByRegex("a,,b", { pattern: "," }); - expect(result).toEqual(["a", "b"]); - }); - - it("keeps whitespace-only segments when trimEmpty=false", () => { - const result = splitByRegex("a,,b", { - pattern: ",", - trimEmpty: false, - }); - expect(result).toEqual(["a", "", "b"]); - }); - }); - - it("handles consecutive separators", () => { - const result = splitByRegex("a--b--c", { pattern: "-" }); - expect(result).toEqual(["a", "b", "c"]); - }); - - it("handles pattern at start and end", () => { - const result = splitByRegex("1a1b1", { pattern: "\\d" }); - expect(result).toEqual(["a", "b"]); - }); - - it("uses multiline flag so ^ matches line starts", () => { - const result = splitByRegex("line1\nline2\nline3", { - pattern: "^line2$", - }); - expect(result).toEqual(["line1\n", "\nline3"]); - }); -}); diff --git 
a/packages/nvisy-plugin-core/src/splitter/regex.ts b/packages/nvisy-plugin-core/src/splitter/regex.ts
deleted file mode 100644
index c39e71a..0000000
--- a/packages/nvisy-plugin-core/src/splitter/regex.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-export interface RegexSplitOptions {
-  /** Pattern to split on. Compiled to a RegExp with the `gm` flags. */
-  readonly pattern: string;
-  /** If true, keep the matched separator at the start of each subsequent segment. Default: false. */
-  readonly keepSeparator?: boolean;
-  /** Discard segments that are empty or whitespace-only after splitting. Default: true. */
-  readonly trimEmpty?: boolean;
-}
-
-/** Split `text` on a regex pattern. */
-export function splitByRegex(
-  text: string,
-  options: RegexSplitOptions,
-): string[] {
-  const { pattern, keepSeparator = false, trimEmpty = true } = options;
-
-  const re = new RegExp(pattern, "gm");
-
-  // Collect all match boundaries
-  const boundaries: { start: number; end: number }[] = [];
-  for (let match = re.exec(text); match !== null; match = re.exec(text)) {
-    if (match[0].length === 0) {
-      re.lastIndex++;
-      continue;
-    }
-    boundaries.push({ start: match.index, end: match.index + match[0].length });
-  }
-
-  if (boundaries.length === 0) {
-    const result = trimEmpty && text.trim().length === 0 ? [] : [text];
-    return result;
-  }
-
-  const segments: string[] = [];
-
-  if (keepSeparator) {
-    // First segment: everything before the first match
-    segments.push(text.slice(0, boundaries[0]!.start));
-    // Subsequent segments start at each match start, end at next match start (or end of text)
-    for (let i = 0; i < boundaries.length; i++) {
-      const segStart = boundaries[i]!.start;
-      const segEnd =
-        i + 1 < boundaries.length ? boundaries[i + 1]!.start : text.length;
-      segments.push(text.slice(segStart, segEnd));
-    }
-  } else {
-    let cursor = 0;
-    for (const b of boundaries) {
-      segments.push(text.slice(cursor, b.start));
-      cursor = b.end;
-    }
-    segments.push(text.slice(cursor));
-  }
-
-  if (trimEmpty) {
-    return segments.filter((s) => s.trim().length > 0);
-  }
-  return segments;
-}
diff --git a/packages/nvisy-plugin-core/tsconfig.json b/packages/nvisy-plugin-core/tsconfig.json
deleted file mode 100644
index 67241bf..0000000
--- a/packages/nvisy-plugin-core/tsconfig.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "extends": "../../tsconfig.json",
-  "compilerOptions": {
-    /* Emit */
-    "outDir": "./dist",
-    "rootDir": "./src",
-    "composite": true
-  },
-  /* Scope */
-  "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist", "src/**/*.test.ts", "src/**/*.spec.ts"],
-  "references": [{ "path": "../nvisy-core" }]
-}
diff --git a/packages/nvisy-plugin-core/tsup.config.ts b/packages/nvisy-plugin-core/tsup.config.ts
deleted file mode 100644
index d68a5db..0000000
--- a/packages/nvisy-plugin-core/tsup.config.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import { defineConfig } from "tsup";
-
-export default defineConfig({
-  /* Entry */
-  entry: ["src/index.ts"],
-  format: ["esm"],
-
-  /* Output */
-  outDir: "dist",
-  dts: { compilerOptions: { composite: false } },
-  sourcemap: true,
-  clean: true,
-
-  /* Optimization */
-  splitting: false,
-  treeshake: true,
-  skipNodeModulesBundle: true,
-
-  /* Environment */
-  platform: "node",
-  target: "es2024",
-});
diff --git a/packages/nvisy-plugin-nosql/package.json b/packages/nvisy-plugin-nosql/package.json
deleted file mode 100644
index f2cbaa7..0000000
--- a/packages/nvisy-plugin-nosql/package.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "name": "@nvisy/plugin-nosql",
-  "version": "0.1.0",
-
"description": "NoSQL database integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-nosql/src/index.ts b/packages/nvisy-plugin-nosql/src/index.ts deleted file mode 100644 index ec4ee10..0000000 --- a/packages/nvisy-plugin-nosql/src/index.ts +++ /dev/null @@ -1,13 +0,0 @@ -/** - * @module @nvisy/plugin-nosql - * - * NoSQL database plugin for the Nvisy runtime. - * - * Provides source and target streams for document databases - * (MongoDB, DynamoDB, Firestore). - */ - -import { Plugin } from "@nvisy/core"; - -/** NoSQL database plugin instance. */ -export const nosqlPlugin = Plugin.define("nosql"); diff --git a/packages/nvisy-plugin-nosql/tsconfig.json b/packages/nvisy-plugin-nosql/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-nosql/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-nosql/tsup.config.ts b/packages/nvisy-plugin-nosql/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-nosql/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-object/package.json b/packages/nvisy-plugin-object/package.json deleted file mode 100644 index cdb6e8c..0000000 --- a/packages/nvisy-plugin-object/package.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "@nvisy/plugin-object", - "version": "0.1.0", - "description": "Object store integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@aws-sdk/client-s3": "^3.750.0", - "@azure/storage-blob": "^12.26.0", - "@google-cloud/storage": "^7.15.0", - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-object/src/index.ts b/packages/nvisy-plugin-object/src/index.ts deleted file mode 100644 index 60f8f54..0000000 --- a/packages/nvisy-plugin-object/src/index.ts +++ /dev/null @@ -1,28 +0,0 @@ -/** - * @module @nvisy/plugin-object - * - * Object store plugin for the Nvisy runtime. - * - * Exposes S3, GCS, and Azure Blob providers, plus read/write streams - * that list, get, and put objects as {@link Blob}s. 
- * - * @example - * ```ts - * import { objectPlugin } from "@nvisy/plugin-object"; - * - * // Register with the runtime - * runtime.register(objectPlugin); - * ``` - */ - -import { Plugin } from "@nvisy/core"; -import { azure, gcs, s3 } from "./providers/index.js"; -import { read, write } from "./streams/index.js"; - -/** The Object plugin: register this with the runtime to enable object store providers and streams. */ -export const objectPlugin = Plugin.define("object") - .withProviders(s3, gcs, azure) - .withStreams(read, write); - -export type { ListResult } from "./providers/index.js"; -export { ObjectStoreClient } from "./providers/index.js"; diff --git a/packages/nvisy-plugin-object/src/providers/azure.ts b/packages/nvisy-plugin-object/src/providers/azure.ts deleted file mode 100644 index 0ceef9e..0000000 --- a/packages/nvisy-plugin-object/src/providers/azure.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { - type BlobHTTPHeaders, - BlobServiceClient, - type BlockBlobUploadOptions, - type ContainerClient, - StorageSharedKeyCredential, -} from "@azure/storage-blob"; -import { getLogger } from "@logtape/logtape"; -import { z } from "zod"; -import { - type ListResult, - makeObjectProvider, - ObjectStoreClient, - ObjectStoreProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Credentials for connecting to Azure Blob Storage. - */ -export const AzureCredentials = z.object({ - /** Azure storage account name. */ - accountName: z.string(), - /** Azure Blob container name. */ - containerName: z.string(), - /** Storage account key (provide this or `connectionString`). */ - accountKey: z.string().optional(), - /** Full connection string (provide this or `accountKey`). */ - connectionString: z.string().optional(), -}); -export type AzureCredentials = z.infer<typeof AzureCredentials>; - -class AzureObjectStoreClient extends ObjectStoreClient { - readonly #container: ContainerClient; - - constructor(container: ContainerClient) { - super(); - this.#container = container; - } - - async list(prefix: string, cursor?: string): Promise<ListResult> { - const keys: string[] = []; - const iter = cursor - ? this.#container - .listBlobsFlat({ prefix }) - .byPage({ continuationToken: cursor }) - : this.#container.listBlobsFlat({ prefix }).byPage(); - - const page = await iter.next(); - if (!page.done) { - for (const blob of page.value.segment.blobItems) { - keys.push(blob.name); - } - const token = page.value.continuationToken; - if (token) { - return { keys, nextCursor: token }; - } - } - return { keys }; - } - - async get(key: string): Promise<{ data: Buffer; contentType?: string }> { - const blobClient = this.#container.getBlobClient(key); - const response = await blobClient.download(); - const body = response.readableStreamBody; - if (!body) throw new Error(`Empty response body for blob "${key}"`); - - const chunks: Buffer[] = []; - for await (const chunk of body) { - chunks.push(Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk)); - } - const contentType = response.contentType; - if (contentType) { - return { data: Buffer.concat(chunks), contentType }; - } - return { data: Buffer.concat(chunks) }; - } - - async put(key: string, data: Buffer, contentType?: string): Promise<void> { - const blockClient = this.#container.getBlockBlobClient(key); - const opts: BlockBlobUploadOptions = {}; - if (contentType) { - const headers: BlobHTTPHeaders = { blobContentType: contentType }; - opts.blobHTTPHeaders = headers; - } - await blockClient.upload(data, data.byteLength, opts); - } -} - -function createContainerClient(creds: AzureCredentials): ContainerClient { - if (creds.connectionString) { - return BlobServiceClient.fromConnectionString( - creds.connectionString, - ).getContainerClient(creds.containerName); - } - if (creds.accountKey) { - const sharedKey = new StorageSharedKeyCredential( - creds.accountName, - creds.accountKey, - ); - const service = new BlobServiceClient( - `https://${creds.accountName}.blob.core.windows.net`, - sharedKey, - ); - return service.getContainerClient(creds.containerName); - } - throw new Error( - "Azure credentials must include either accountKey or connectionString", - ); -} - -/** Azure Blob Storage provider. */ -export const azure = makeObjectProvider( - "azure", - AzureCredentials, - async (creds) => { - logger.debug( - "Connecting to Azure container {containerName} in account {accountName}", - { containerName: creds.containerName, accountName: creds.accountName }, - ); - - const container = createContainerClient(creds); - - return new ObjectStoreProvider( - new AzureObjectStoreClient(container), - "azure", - ); - }, -); diff --git a/packages/nvisy-plugin-object/src/providers/client.ts b/packages/nvisy-plugin-object/src/providers/client.ts deleted file mode 100644 index 30d23c7..0000000 --- a/packages/nvisy-plugin-object/src/providers/client.ts +++ /dev/null @@ -1,83 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { - Provider, - type ProviderFactory, - type ProviderInstance, -} from "@nvisy/core"; -import type { z } from "zod"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Result of listing objects under a prefix. - */ -export interface ListResult { - /** Object keys returned in this page. */ - readonly keys: string[]; - /** Opaque cursor for fetching the next page, or `undefined` if exhausted. */ - readonly nextCursor?: string; -} - -/** - * Abstract client that object-store streams use for I/O. - * - * Each provider (S3, GCS, Azure) supplies a concrete subclass. - * The class reference is required by {@link Stream.createSource} and - * {@link Stream.createTarget} for runtime client-type matching. - */ -export abstract class ObjectStoreClient { - /** List object keys under `prefix`, optionally resuming from `cursor`. */ - abstract list(prefix: string, cursor?: string): Promise<ListResult>; - - /** Retrieve a single object by key. */ - abstract get(key: string): Promise<{ data: Buffer; contentType?: string }>; - - /** Write a single object by key. */ - abstract put(key: string, data: Buffer, contentType?: string): Promise<void>; -} - -/** - * Connected object-store provider instance. - * - * Holds an {@link ObjectStoreClient} and manages teardown on - * {@link disconnect}. 
- */ -export class ObjectStoreProvider - implements ProviderInstance<ObjectStoreClient> -{ - readonly client: ObjectStoreClient; - readonly #id: string; - readonly #disconnect: (() => Promise<void>) | undefined; - - constructor( - client: ObjectStoreClient, - id: string, - disconnect?: () => Promise<void>, - ) { - this.client = client; - this.#id = id; - this.#disconnect = disconnect; - } - - async disconnect(): Promise<void> { - await this.#disconnect?.(); - logger.debug("Disconnected from {provider}", { provider: this.#id }); - } -} - -/** - * Create an object-store {@link ProviderFactory} from a credential schema - * and a connect function. - * - * This mirrors {@link makeSqlProvider} but is generic over credentials - * so that S3, GCS, and Azure can each supply their own schema. - */ -export const makeObjectProvider = <TCred>( - id: string, - credentials: z.ZodType<TCred>, - connect: (creds: TCred) => Promise<ProviderInstance<ObjectStoreClient>>, -): ProviderFactory<TCred, ObjectStoreClient> => - Provider.withAuthentication(id, { - credentials, - connect, - }); diff --git a/packages/nvisy-plugin-object/src/providers/gcs.ts b/packages/nvisy-plugin-object/src/providers/gcs.ts deleted file mode 100644 index 20c6c63..0000000 --- a/packages/nvisy-plugin-object/src/providers/gcs.ts +++ /dev/null @@ -1,86 +0,0 @@ -import { Storage, type StorageOptions } from "@google-cloud/storage"; -import { getLogger } from "@logtape/logtape"; -import { z } from "zod"; -import { - type ListResult, - makeObjectProvider, - ObjectStoreClient, - ObjectStoreProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Credentials for connecting to Google Cloud Storage. - */ -export const GcsCredentials = z.object({ - /** GCP project ID. */ - projectId: z.string(), - /** GCS bucket name. */ - bucket: z.string(), - /** Path to a service-account key file (optional if running on GCE). */ - keyFilename: z.string().optional(), -}); -export type GcsCredentials = z.infer<typeof GcsCredentials>; - -class GcsObjectStoreClient extends ObjectStoreClient { - readonly #storage: Storage; - readonly #bucket: string; - - constructor(storage: Storage, bucket: string) { - super(); - this.#storage = storage; - this.#bucket = bucket; - } - - async list(prefix: string, cursor?: string): Promise<ListResult> { - const options: { prefix: string; startOffset?: string } = { prefix }; - if (cursor) { - options.startOffset = cursor; - } - - const [files] = await this.#storage.bucket(this.#bucket).getFiles(options); - // When resuming, GCS startOffset is inclusive — skip the cursor key itself - const keys = files.map((f) => f.name).filter((name) => name !== cursor); - return { keys }; - } - - async get(key: string): Promise<{ data: Buffer; contentType?: string }> { - const file = this.#storage.bucket(this.#bucket).file(key); - const [contents] = await file.download(); - const [metadata] = await file.getMetadata(); - const contentType = metadata.contentType as string | undefined; - if (contentType) { - return { data: contents, contentType }; - } - return { data: contents }; - } - - async put(key: string, data: Buffer, contentType?: string): Promise<void> { - const file = this.#storage.bucket(this.#bucket).file(key); - if (contentType) { - await file.save(data, { contentType }); - } else { - await file.save(data); - } - } -} - -/** Google Cloud Storage provider. 
*/ -export const gcs = makeObjectProvider("gcs", GcsCredentials, async (creds) => { - logger.debug("Connecting to GCS bucket {bucket} in project {projectId}", { - bucket: creds.bucket, - projectId: creds.projectId, - }); - - const opts: StorageOptions = { projectId: creds.projectId }; - if (creds.keyFilename) { - opts.keyFilename = creds.keyFilename; - } - const storage = new Storage(opts); - - return new ObjectStoreProvider( - new GcsObjectStoreClient(storage, creds.bucket), - "gcs", - ); -}); diff --git a/packages/nvisy-plugin-object/src/providers/index.ts b/packages/nvisy-plugin-object/src/providers/index.ts deleted file mode 100644 index 795aa8b..0000000 --- a/packages/nvisy-plugin-object/src/providers/index.ts +++ /dev/null @@ -1,9 +0,0 @@ -export { type AzureCredentials, azure } from "./azure.js"; -export { - type ListResult, - makeObjectProvider, - ObjectStoreClient, - ObjectStoreProvider, -} from "./client.js"; -export { type GcsCredentials, gcs } from "./gcs.js"; -export { type S3Credentials, s3 } from "./s3.js"; diff --git a/packages/nvisy-plugin-object/src/providers/s3.ts b/packages/nvisy-plugin-object/src/providers/s3.ts deleted file mode 100644 index 5f315d9..0000000 --- a/packages/nvisy-plugin-object/src/providers/s3.ts +++ /dev/null @@ -1,115 +0,0 @@ -import { - GetObjectCommand, - ListObjectsV2Command, - PutObjectCommand, - S3Client, - type S3ClientConfig, -} from "@aws-sdk/client-s3"; -import { getLogger } from "@logtape/logtape"; -import { z } from "zod"; -import { - type ListResult, - makeObjectProvider, - ObjectStoreClient, - ObjectStoreProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Credentials for connecting to Amazon S3. - */ -export const S3Credentials = z.object({ - /** AWS region (e.g. `"us-east-1"`). */ - region: z.string(), - /** S3 bucket name. */ - bucket: z.string(), - /** AWS access key ID. */ - accessKeyId: z.string(), - /** AWS secret access key. */ - secretAccessKey: z.string(), - /** Optional custom endpoint for S3-compatible stores (e.g. MinIO). */ - endpoint: z.string().optional(), -}); -export type S3Credentials = z.infer<typeof S3Credentials>; - -class S3ObjectStoreClient extends ObjectStoreClient { - readonly #client: S3Client; - readonly #bucket: string; - - constructor(client: S3Client, bucket: string) { - super(); - this.#client = client; - this.#bucket = bucket; - } - - async list(prefix: string, cursor?: string): Promise<ListResult> { - const response = await this.#client.send( - new ListObjectsV2Command({ - Bucket: this.#bucket, - Prefix: prefix, - StartAfter: cursor, - }), - ); - const keys = (response.Contents ?? []) - .map((o) => o.Key) - .filter((k): k is string => k != null); - const lastKey = response.IsTruncated - ? 
response.Contents?.at(-1)?.Key - : undefined; - if (lastKey) { - return { keys, nextCursor: lastKey }; - } - return { keys }; - } - - async get(key: string): Promise<{ data: Buffer; contentType?: string }> { - const response = await this.#client.send( - new GetObjectCommand({ Bucket: this.#bucket, Key: key }), - ); - const bytes = await response.Body!.transformToByteArray(); - const contentType = response.ContentType; - if (contentType) { - return { data: Buffer.from(bytes), contentType }; - } - return { data: Buffer.from(bytes) }; - } - - async put(key: string, data: Buffer, contentType?: string): Promise<void> { - await this.#client.send( - new PutObjectCommand({ - Bucket: this.#bucket, - Key: key, - Body: data, - ContentType: contentType, - }), - ); - } -} - -/** Amazon S3 provider. */ -export const s3 = makeObjectProvider("s3", S3Credentials, async (creds) => { - logger.debug("Connecting to S3 bucket {bucket} in {region}", { - bucket: creds.bucket, - region: creds.region, - }); - - const config: S3ClientConfig = { - region: creds.region, - credentials: { - accessKeyId: creds.accessKeyId, - secretAccessKey: creds.secretAccessKey, - }, - }; - if (creds.endpoint) { - config.endpoint = creds.endpoint; - } - - const client = new S3Client(config); - - return new ObjectStoreProvider( - new S3ObjectStoreClient(client, creds.bucket), - "s3", - async () => client.destroy(), - ); -}); diff --git a/packages/nvisy-plugin-object/src/streams/index.ts b/packages/nvisy-plugin-object/src/streams/index.ts deleted file mode 100644 index a37fd24..0000000 --- a/packages/nvisy-plugin-object/src/streams/index.ts +++ /dev/null @@ -1,2 +0,0 @@ -export { read } from "./read.js"; -export { write } from "./write.js"; diff --git a/packages/nvisy-plugin-object/src/streams/read.ts b/packages/nvisy-plugin-object/src/streams/read.ts deleted file mode 100644 index 73cdd60..0000000 --- a/packages/nvisy-plugin-object/src/streams/read.ts +++ /dev/null @@ -1,95 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { Blob, type Resumable, RuntimeError, Stream } from "@nvisy/core"; -import { z } from "zod"; -import { ObjectStoreClient } from "../providers/client.js"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Per-node parameters for the object-store read stream. - */ -export const ObjectParams = z.object({ - /** Key prefix to list objects under (e.g. `"uploads/2024/"`). */ - prefix: z.string().default(""), - /** Maximum keys to fetch per list page. */ - batchSize: z.number().default(100), -}); -export type ObjectParams = z.infer<typeof ObjectParams>; - -/** - * Keyset pagination cursor for resumable object reads. - * - * `lastKey` is `null` on the very first page. - */ -export const ObjectCursor = z.object({ - /** The last key successfully yielded, or `null` before the first page. */ - lastKey: z.string().nullable().default(null), -}); -export type ObjectCursor = z.infer<typeof ObjectCursor>; - -/** - * Source stream that lists objects under a prefix and yields each as - * a {@link Blob}. Pagination uses the last-key cursor from the store's - * list API. 
- */ -export const read = Stream.createSource("read", ObjectStoreClient, { - type: Blob, - context: ObjectCursor, - params: ObjectParams, - reader: (client, cursor, params) => readStream(client, cursor, params), -}); - -async function* readStream( - client: ObjectStoreClient, - cursor: ObjectCursor, - params: ObjectParams, -): AsyncIterable<Resumable<Blob, ObjectCursor>> { - const { prefix, batchSize } = params; - - logger.debug("Read stream opened on prefix {prefix}", { prefix, batchSize }); - - let nextCursor: string | undefined = cursor.lastKey ?? undefined; - let totalObjects = 0; - - while (true) { - let keys: readonly string[]; - let pageCursor: string | undefined; - - try { - const result = await client.list(prefix, nextCursor); - keys = result.keys; - pageCursor = result.nextCursor; - logger.debug("List returned {count} keys", { count: keys.length }); - } catch (error) { - logger.error("List failed for prefix {prefix}: {error}", { - prefix, - error: error instanceof Error ? error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "object/read" }); - } - - for (const key of keys) { - try { - const { data, contentType } = await client.get(key); - totalObjects++; - yield { - data: new Blob(key, data, { contentType }), - context: { lastKey: key } as ObjectCursor, - }; - } catch (error) { - logger.error("Get failed for key {key}: {error}", { - key, - error: error instanceof Error ? error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "object/read" }); - } - } - - if (keys.length < batchSize || !pageCursor) break; - nextCursor = pageCursor; - } - - logger.debug("Read stream closed, {totalObjects} objects yielded", { - totalObjects, - }); -} diff --git a/packages/nvisy-plugin-object/src/streams/write.ts b/packages/nvisy-plugin-object/src/streams/write.ts deleted file mode 100644 index 0fb199f..0000000 --- a/packages/nvisy-plugin-object/src/streams/write.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { Blob, RuntimeError, Stream } from "@nvisy/core"; -import { z } from "zod"; -import { ObjectStoreClient } from "../providers/client.js"; - -const logger = getLogger(["nvisy", "object"]); - -/** - * Per-node parameters for the object-store write stream. - */ -export const WriteParams = z.object({ - /** Key prefix to prepend to each blob path on write. */ - prefix: z.string().default(""), -}); -export type WriteParams = z.infer<typeof WriteParams>; - -/** - * Target stream that writes each {@link Blob} to the object store - * via the provider client's `put` method. - */ -export const write = Stream.createTarget("write", ObjectStoreClient, { - type: Blob, - params: WriteParams, - writer: (client, params) => async (item: Blob) => { - const key = params.prefix ? `${params.prefix}${item.path}` : item.path; - try { - await client.put(key, item.data, item.provided.mime); - logger.debug("Put object {key} ({size} bytes)", { - key, - size: item.size, - }); - } catch (error) { - logger.error("Put failed for {key}: {error}", { - key, - error: error instanceof Error ? 
error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "object/write" }); - } - }, -}); diff --git a/packages/nvisy-plugin-object/tsconfig.json b/packages/nvisy-plugin-object/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-object/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-object/tsup.config.ts b/packages/nvisy-plugin-object/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-object/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-pandoc/package.json b/packages/nvisy-plugin-pandoc/package.json deleted file mode 100644 index 3dfdc72..0000000 --- a/packages/nvisy-plugin-pandoc/package.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "@nvisy/plugin-pandoc", - "version": "0.1.0", - "description": "Pandoc document conversion plugin for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-pandoc/src/index.ts b/packages/nvisy-plugin-pandoc/src/index.ts deleted file mode 100644 index 201a3d8..0000000 --- a/packages/nvisy-plugin-pandoc/src/index.ts +++ /dev/null @@ -1,12 +0,0 @@ -/** - * @module @nvisy/plugin-pandoc - * - * Pandoc document conversion plugin for the Nvisy runtime. - * - * Provides actions for converting documents between formats using Pandoc. - */ - -import { Plugin } from "@nvisy/core"; - -/** Pandoc plugin instance. 
*/ -export const pandocPlugin = Plugin.define("pandoc"); diff --git a/packages/nvisy-plugin-pandoc/tsconfig.json b/packages/nvisy-plugin-pandoc/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-pandoc/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-pandoc/tsup.config.ts b/packages/nvisy-plugin-pandoc/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-pandoc/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-queue/package.json b/packages/nvisy-plugin-queue/package.json deleted file mode 100644 index a481298..0000000 --- a/packages/nvisy-plugin-queue/package.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "@nvisy/plugin-queue", - "version": "0.1.0", - "description": "Message queue integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-queue/src/index.ts b/packages/nvisy-plugin-queue/src/index.ts deleted file mode 100644 index d66e2d9..0000000 --- a/packages/nvisy-plugin-queue/src/index.ts +++ /dev/null @@ -1,13 +0,0 @@ -/** - * @module @nvisy/plugin-queue - * - * Message queue plugin for the Nvisy runtime. - * - * Provides source and target streams for message queue systems - * (Kafka, RabbitMQ, SQS, Redis Streams). - */ - -import { Plugin } from "@nvisy/core"; - -/** Message queue plugin instance. 
*/ -export const queuePlugin = Plugin.define("queue"); diff --git a/packages/nvisy-plugin-queue/tsconfig.json b/packages/nvisy-plugin-queue/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-queue/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-queue/tsup.config.ts b/packages/nvisy-plugin-queue/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-queue/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-sql/README.md b/packages/nvisy-plugin-sql/README.md deleted file mode 100644 index b994591..0000000 --- a/packages/nvisy-plugin-sql/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# @nvisy/plugin-sql - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -SQL provider plugin for the Nvisy runtime. - -## Features - -- **Postgres, MySQL, and MSSQL** providers with credential validation and connection lifecycle management -- **Keyset-paginated reads** for efficient, resumable streaming over large tables -- **Per-item writes** via Kysely INSERT for batch pipeline sinks -- **Row-level transforms**: filter, project, rename, and coerce columns in the pipeline - -## Overview - -Provides Postgres, MySQL, and MSSQL integrations through a unified Kysely-based client. The plugin exposes: - -- **Providers** (`sql/postgres`, `sql/mysql`, `sql/mssql`): connection lifecycle management with credential validation. -- **Streams** (`sql/read`, `sql/write`): keyset-paginated source and per-item insert sink. -- **Actions** (`sql/filter`, `sql/project`, `sql/rename`, `sql/coerce`): row-level transforms applied in the pipeline. - -## Usage - -```ts -import { sqlPlugin } from "@nvisy/plugin-sql"; - -registry.load(sqlPlugin); -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. 
- -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-plugin-sql/package.json b/packages/nvisy-plugin-sql/package.json deleted file mode 100644 index 6e65044..0000000 --- a/packages/nvisy-plugin-sql/package.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "name": "@nvisy/plugin-sql", - "version": "0.1.0", - "description": "SQL provider integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "kysely": "^0.28.11", - "mysql2": "^3.16.3", - "pg": "^8.18.0", - "tarn": "^3.0.2", - "tedious": "^19.2.0", - "zod": "^4.3.6" - }, - "devDependencies": { - "@types/pg": "^8.16.0" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-sql/src/actions/coerce.ts b/packages/nvisy-plugin-sql/src/actions/coerce.ts deleted file mode 100644 index 81f30e5..0000000 --- a/packages/nvisy-plugin-sql/src/actions/coerce.ts +++ /dev/null @@ -1,63 +0,0 @@ -import type { JsonValue } from "@nvisy/core"; -import { Action } from "@nvisy/core"; -import { z } from "zod"; -import { Row } from "../datatypes/index.js"; - -/** Allowed target types for column coercion. */ -const CoerceTarget = z.enum(["string", "number", "boolean"]); - -/** - * Parameters for the `sql/coerce` action. - * - * `columns` maps column names to a target type. Columns not listed - * are passed through unchanged. - */ -const CoerceParams = z.object({ - columns: z.record(z.string(), CoerceTarget), -}); - -/** - * Cast a single value to the requested type. - * - * - `null` / `undefined` -> `null` regardless of target. - * - `"number"` on a non-numeric string -> `null`. - */ -function coerceValue( - value: JsonValue | undefined, - target: "string" | "number" | "boolean", -): JsonValue { - if (value === null || value === undefined) return null; - - switch (target) { - case "string": - return String(value); - case "number": { - const n = Number(value); - return Number.isNaN(n) ? null : n; - } - case "boolean": - return Boolean(value); - } -} - -/** - * Coerce column values to a target type (`string`, `number`, or `boolean`). - * - * Null values remain null regardless of the target. Non-numeric strings - * coerced to `number` become `null`. Row identity and metadata are preserved. 
- */ -export const coerce = Action.withoutClient("coerce", { - types: [Row], - params: CoerceParams, - transform: async function* (stream, params) { - for await (const row of stream) { - const result: Record<string, JsonValue> = { ...row.columns }; - - for (const [column, target] of Object.entries(params.columns)) { - result[column] = coerceValue(result[column], target); - } - - yield new Row(result).deriveFrom(row); - } - }, -}); diff --git a/packages/nvisy-plugin-sql/src/actions/filter.ts b/packages/nvisy-plugin-sql/src/actions/filter.ts deleted file mode 100644 index a972ea2..0000000 --- a/packages/nvisy-plugin-sql/src/actions/filter.ts +++ /dev/null @@ -1,111 +0,0 @@ -import type { JsonValue } from "@nvisy/core"; -import { Action } from "@nvisy/core"; -import { z } from "zod"; -import { Row } from "../datatypes/index.js"; - -/** Supported comparison operators for a single filter condition. */ -const Operator = z.enum([ - "eq", - "neq", - "gt", - "gte", - "lt", - "lte", - "in", - "notIn", - "isNull", - "isNotNull", -]); -type Operator = z.infer<typeof Operator>; - -/** A single predicate: `column <op> value`. */ -const FilterCondition = z.object({ - column: z.string(), - op: Operator, - value: z.unknown().optional(), -}); - -/** - * Parameters for the `sql/filter` action. - * - * @param conditions Array of predicates applied to each row. - * @param mode Combine with `"and"` (default) or `"or"`. - */ -const FilterParams = z.object({ - conditions: z.array(FilterCondition), - mode: z.enum(["and", "or"]).optional(), -}); - -/** Evaluate a single {@link FilterCondition} against a row. */ -function matchCondition( - row: Row, - condition: { column: string; op: Operator; value?: unknown }, -): boolean { - const val = row.get(condition.column); - - switch (condition.op) { - case "eq": - return val === condition.value; - case "neq": - return val !== condition.value; - case "gt": - return ( - typeof val === "number" && - typeof condition.value === "number" && - val > condition.value - ); - case "gte": - return ( - typeof val === "number" && - typeof condition.value === "number" && - val >= condition.value - ); - case "lt": - return ( - typeof val === "number" && - typeof condition.value === "number" && - val < condition.value - ); - case "lte": - return ( - typeof val === "number" && - typeof condition.value === "number" && - val <= condition.value - ); - case "in": - return ( - Array.isArray(condition.value) && - (condition.value as JsonValue[]).includes(val as JsonValue) - ); - case "notIn": - return ( - Array.isArray(condition.value) && - !(condition.value as JsonValue[]).includes(val as JsonValue) - ); - case "isNull": - return val === null || val === undefined; - case "isNotNull": - return val !== null && val !== undefined; - } -} - -/** - * Filter rows by a set of column-level predicates. - * - * Conditions are combined with AND (default) or OR. Supports equality, - * comparison, set membership, and null checks. - */ -export const filter = Action.withoutClient("filter", { - types: [Row], - params: FilterParams, - transform: async function* (stream, params) { - const mode = params.mode ?? "and"; - for await (const row of stream) { - const match = - mode === "and" - ? 
params.conditions.every((c) => matchCondition(row, c)) - : params.conditions.some((c) => matchCondition(row, c)); - if (match) yield row; - } - }, -}); diff --git a/packages/nvisy-plugin-sql/src/actions/index.ts b/packages/nvisy-plugin-sql/src/actions/index.ts deleted file mode 100644 index 8aa2d94..0000000 --- a/packages/nvisy-plugin-sql/src/actions/index.ts +++ /dev/null @@ -1,4 +0,0 @@ -export { coerce } from "./coerce.js"; -export { filter } from "./filter.js"; -export { project } from "./project.js"; -export { rename } from "./rename.js"; diff --git a/packages/nvisy-plugin-sql/src/actions/project.ts b/packages/nvisy-plugin-sql/src/actions/project.ts deleted file mode 100644 index a676632..0000000 --- a/packages/nvisy-plugin-sql/src/actions/project.ts +++ /dev/null @@ -1,52 +0,0 @@ -import type { JsonValue } from "@nvisy/core"; -import { Action } from "@nvisy/core"; -import { z } from "zod"; -import { Row } from "../datatypes/index.js"; - -/** - * Parameters for the `sql/project` action. - * - * Provide **either** `keep` (include only these columns) or `drop` - * (exclude these columns). Columns not present in the row are ignored. - */ -const ProjectParams = z.union([ - z.object({ keep: z.array(z.string()) }), - z.object({ drop: z.array(z.string()) }), -]); - -/** - * Project (select / exclude) columns from each row. - * - * Use `{ keep: [...] }` to retain only named columns, or - * `{ drop: [...] }` to remove named columns. Row identity and - * metadata are preserved. - */ -export const project = Action.withoutClient("project", { - types: [Row], - params: ProjectParams, - transform: async function* (stream, params) { - for await (const row of stream) { - const cols = row.columns; - let projected: Record<string, JsonValue>; - - if ("keep" in params) { - projected = {}; - for (const key of params.keep) { - if (key in cols) { - projected[key] = cols[key]!; - } - } - } else { - const dropSet = new Set(params.drop); - projected = {}; - for (const [key, val] of Object.entries(cols)) { - if (!dropSet.has(key)) { - projected[key] = val; - } - } - } - - yield new Row(projected).deriveFrom(row); - } - }, -}); diff --git a/packages/nvisy-plugin-sql/src/actions/rename.ts b/packages/nvisy-plugin-sql/src/actions/rename.ts deleted file mode 100644 index 9a9f40d..0000000 --- a/packages/nvisy-plugin-sql/src/actions/rename.ts +++ /dev/null @@ -1,37 +0,0 @@ -import type { JsonValue } from "@nvisy/core"; -import { Action } from "@nvisy/core"; -import { z } from "zod"; -import { Row } from "../datatypes/index.js"; - -/** - * Parameters for the `sql/rename` action. - * - * `mapping` is a `{ oldName: newName }` record. Columns not present - * in the mapping are passed through unchanged. - */ -const RenameParams = z.object({ - mapping: z.record(z.string(), z.string()), -}); - -/** - * Rename columns according to a key mapping. - * - * Each entry in `mapping` renames `oldKey -> newKey`. Columns not in - * the mapping are preserved as-is. Row identity and metadata are kept. - */ -export const rename = Action.withoutClient("rename", { - types: [Row], - params: RenameParams, - transform: async function* (stream, params) { - for await (const row of stream) { - const result: Record<string, JsonValue> = {}; - - for (const [key, val] of Object.entries(row.columns)) { - const newKey = params.mapping[key] ?? 
key; - result[newKey] = val; - } - - yield new Row(result).deriveFrom(row); - } - }, -}); diff --git a/packages/nvisy-plugin-sql/src/datatypes/index.ts b/packages/nvisy-plugin-sql/src/datatypes/index.ts deleted file mode 100644 index 6dce926..0000000 --- a/packages/nvisy-plugin-sql/src/datatypes/index.ts +++ /dev/null @@ -1 +0,0 @@ -export { Row } from "./row.js"; diff --git a/packages/nvisy-plugin-sql/src/datatypes/row.ts b/packages/nvisy-plugin-sql/src/datatypes/row.ts deleted file mode 100644 index 9d1f48c..0000000 --- a/packages/nvisy-plugin-sql/src/datatypes/row.ts +++ /dev/null @@ -1,35 +0,0 @@ -import type { JsonValue } from "@nvisy/core"; -import { Data } from "@nvisy/core"; - -/** - * A row from a relational database. - * - * Maps column names to JSON-compatible values. Use the {@link get} helper - * for safe column access that returns `undefined` on missing keys rather - * than throwing. - * - * @example - * ```ts - * const row = new Row({ name: "Alice", age: 30, active: true }); - * row.get("name"); // "Alice" - * row.get("missing"); // undefined - * ``` - */ -export class Row extends Data { - readonly #columns: Readonly<Record<string, JsonValue>>; - - constructor(columns: Record<string, JsonValue>) { - super(); - this.#columns = columns; - } - - /** Column name -> value mapping. */ - get columns(): Readonly<Record<string, JsonValue>> { - return this.#columns; - } - - /** Get a column value by name, or `undefined` if missing. */ - get(column: string): JsonValue | undefined { - return this.#columns[column]; - } -} diff --git a/packages/nvisy-plugin-sql/src/index.ts b/packages/nvisy-plugin-sql/src/index.ts deleted file mode 100644 index 7eff668..0000000 --- a/packages/nvisy-plugin-sql/src/index.ts +++ /dev/null @@ -1,32 +0,0 @@ -/** - * @module @nvisy/plugin-sql - * - * SQL provider plugin for the Nvisy runtime. - * - * Exposes Postgres, MySQL, and MSSQL providers (client lifecycle only), - * read/write streams (keyset-paginated source + batch-insert sink), and - * row-level transform actions (filter, project, rename, coerce). - * - * @example - * ```ts - * import { sqlPlugin } from "@nvisy/plugin-sql"; - * - * // Register with the runtime - * runtime.register(sqlPlugin); - * ``` - */ - -import { Datatype, Plugin } from "@nvisy/core"; -import { coerce, filter, project, rename } from "./actions/index.js"; -import { Row } from "./datatypes/index.js"; -import { mssql, mysql, postgres } from "./providers/index.js"; -import { read, write } from "./streams/index.js"; - -/** The SQL plugin: register this with the runtime to enable all SQL providers, streams, and actions. 
*/ -export const sqlPlugin = Plugin.define("sql") - .withProviders(postgres, mysql, mssql) - .withStreams(read, write) - .withActions(filter, project, rename, coerce) - .withDatatypes(Datatype.define("row", Row)); - -export { Row } from "./datatypes/index.js"; diff --git a/packages/nvisy-plugin-sql/src/providers/client.ts b/packages/nvisy-plugin-sql/src/providers/client.ts deleted file mode 100644 index db52503..0000000 --- a/packages/nvisy-plugin-sql/src/providers/client.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { - ConnectionError, - Provider, - type ProviderFactory, - type ProviderInstance, -} from "@nvisy/core"; -import { type Dialect, Kysely, sql } from "kysely"; -import { SqlCredentials } from "./schemas.js"; - -export type { SqlCredentials } from "./schemas.js"; - -const logger = getLogger(["nvisy", "sql"]); - -/** A database with an unknown schema: any table, any column, unknown values. */ -type DynamicDatabase = Record<string, Record<string, unknown>>; - -/** - * Wrapper around a {@link Kysely} instance that serves as the concrete - * class reference required by {@link Stream.createSource} and - * {@link Stream.createTarget} for runtime client-type matching. - * - * The underlying instance is schema-agnostic ({@link DynamicDatabase}) - * because table structures are not known at compile time. - */ -export class KyselyClient { - #db: Kysely<DynamicDatabase>; - - constructor(db: Kysely<DynamicDatabase>) { - this.#db = db; - } - - /** The underlying Kysely instance used for query building and execution. */ - get db(): Kysely<DynamicDatabase> { - return this.#db; - } -} - -/** Configuration for {@link makeSqlProvider}. */ -export interface SqlProviderConfig { - /** Unique provider identifier, e.g. `"postgres"`, `"mysql"`, `"mssql"`. */ - readonly id: string; - /** Build a Kysely {@link Dialect} from validated connection credentials. */ - readonly createDialect: (creds: SqlCredentials) => Dialect; -} - -/** - * Connected SQL provider instance returned by {@link makeSqlProvider}. - * - * Holds a {@link KyselyClient} and manages teardown of the underlying - * Kysely connection pool on {@link disconnect}. - */ -export class SqlProvider implements ProviderInstance<KyselyClient> { - readonly client: KyselyClient; - #id: string; - - constructor(client: KyselyClient, id: string) { - this.client = client; - this.#id = id; - } - - async disconnect(): Promise<void> { - await this.client.db.destroy(); - logger.debug("Disconnected from {provider}", { provider: this.#id }); - } -} - -/** Instantiate a Kysely dialect and wrap it in a {@link KyselyClient}. */ -function createClient( - config: SqlProviderConfig, - credentials: SqlCredentials, -): KyselyClient { - logger.debug("Connecting to {provider} at {host}:{port}/{database}", { - provider: config.id, - host: credentials.host, - port: credentials.port, - database: credentials.database, - }); - return new KyselyClient( - new Kysely<DynamicDatabase>({ dialect: config.createDialect(credentials) }), - ); -} - -/** Run `SELECT 1` to verify the connection is live. 
*/ -async function verifyConnection( - config: SqlProviderConfig, - credentials: SqlCredentials, -): Promise<void> { - const client = createClient(config, credentials); - try { - await sql`SELECT 1`.execute(client.db); - logger.info("Verified {provider} at {host}:{port}/{database}", { - provider: config.id, - host: credentials.host, - port: credentials.port, - database: credentials.database, - }); - } finally { - await client.db.destroy(); - } -} - -/** Normalise an unknown throw into a {@link ConnectionError}, re-throwing as-is if already one. */ -function toConnectionError(error: unknown, source: string): ConnectionError { - if (error instanceof ConnectionError) return error; - logger.error("Connection to {provider} failed: {error}", { - provider: source, - error: error instanceof Error ? error.message : String(error), - }); - return ConnectionError.wrap(error, { source }); -} - -/** - * Create a SQL {@link ProviderFactory} parameterised by a dialect constructor. - * - * The returned factory validates {@link SqlCredentials} at parse time, then - * opens a {@link KyselyClient} on connect and tears it down on disconnect. - * Actual data I/O is handled by the stream layer, not the provider. - */ -export const makeSqlProvider = ( - config: SqlProviderConfig, -): ProviderFactory<SqlCredentials, KyselyClient> => - Provider.withAuthentication(config.id, { - credentials: SqlCredentials, - verify: async (credentials) => { - try { - await verifyConnection(config, credentials); - } catch (error) { - throw toConnectionError(error, config.id); - } - }, - connect: async (credentials) => { - try { - const client = createClient(config, credentials); - return new SqlProvider(client, config.id); - } catch (error) { - throw toConnectionError(error, config.id); - } - }, - }); diff --git a/packages/nvisy-plugin-sql/src/providers/index.ts b/packages/nvisy-plugin-sql/src/providers/index.ts deleted file mode 100644 index 049883b..0000000 --- a/packages/nvisy-plugin-sql/src/providers/index.ts +++ /dev/null @@ -1,6 +0,0 @@ -export type { SqlProviderConfig } from "./client.js"; -export { KyselyClient, makeSqlProvider, SqlProvider } from "./client.js"; -export { mssql } from "./mssql.js"; -export { mysql } from "./mysql.js"; -export { postgres } from "./postgres.js"; -export { SqlCredentials } from "./schemas.js"; diff --git a/packages/nvisy-plugin-sql/src/providers/mssql.ts b/packages/nvisy-plugin-sql/src/providers/mssql.ts deleted file mode 100644 index ab4ddc4..0000000 --- a/packages/nvisy-plugin-sql/src/providers/mssql.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { type Dialect, MssqlDialect } from "kysely"; -import * as Tarn from "tarn"; -import * as Tedious from "tedious"; -import { makeSqlProvider } from "./client.js"; -import type { SqlCredentials } from "./schemas.js"; - -/** Create a `tedious` {@link Tedious.Connection} from credentials. */ -function createConnection(creds: SqlCredentials): Tedious.Connection { - return new Tedious.Connection({ - server: creds.host, - authentication: { - options: { - userName: creds.username, - password: creds.password, - }, - type: "default", - }, - options: { - database: creds.database, - port: creds.port, - trustServerCertificate: true, - }, - }); -} - -/** Create an MSSQL dialect backed by `tedious` with a `tarn` connection pool. 
*/ -function createDialect(creds: SqlCredentials): Dialect { - return new MssqlDialect({ - tarn: { - ...Tarn, - options: { min: 0, max: 10 }, - }, - tedious: { - ...Tedious, - connectionFactory: () => createConnection(creds), - }, - }); -} - -/** Microsoft SQL Server provider. Keyset-paginated source and batch-insert sink via kysely + `tedious`. */ -export const mssql = makeSqlProvider({ id: "mssql", createDialect }); diff --git a/packages/nvisy-plugin-sql/src/providers/mysql.ts b/packages/nvisy-plugin-sql/src/providers/mysql.ts deleted file mode 100644 index 0c0de8d..0000000 --- a/packages/nvisy-plugin-sql/src/providers/mysql.ts +++ /dev/null @@ -1,20 +0,0 @@ -import { type Dialect, MysqlDialect } from "kysely"; -import { createPool } from "mysql2/promise"; -import { makeSqlProvider } from "./client.js"; -import type { SqlCredentials } from "./schemas.js"; - -/** Create a MySQL dialect backed by a `mysql2` connection pool. */ -function createDialect(creds: SqlCredentials): Dialect { - return new MysqlDialect({ - pool: createPool({ - host: creds.host, - port: creds.port, - database: creds.database, - user: creds.username, - password: creds.password, - }), - }); -} - -/** MySQL provider. Keyset-paginated source and batch-insert sink via kysely + `mysql2`. */ -export const mysql = makeSqlProvider({ id: "mysql", createDialect }); diff --git a/packages/nvisy-plugin-sql/src/providers/postgres.ts b/packages/nvisy-plugin-sql/src/providers/postgres.ts deleted file mode 100644 index 7c7fdb3..0000000 --- a/packages/nvisy-plugin-sql/src/providers/postgres.ts +++ /dev/null @@ -1,20 +0,0 @@ -import { type Dialect, PostgresDialect } from "kysely"; -import pg from "pg"; -import { makeSqlProvider } from "./client.js"; -import type { SqlCredentials } from "./schemas.js"; - -/** Create a PostgreSQL dialect backed by a `pg.Pool`. */ -function createDialect(creds: SqlCredentials): Dialect { - return new PostgresDialect({ - pool: new pg.Pool({ - host: creds.host, - port: creds.port, - database: creds.database, - user: creds.username, - password: creds.password, - }), - }); -} - -/** PostgreSQL provider. Keyset-paginated source and batch-insert sink via kysely + `pg`. */ -export const postgres = makeSqlProvider({ id: "postgres", createDialect }); diff --git a/packages/nvisy-plugin-sql/src/providers/schemas.ts b/packages/nvisy-plugin-sql/src/providers/schemas.ts deleted file mode 100644 index df1b2bd..0000000 --- a/packages/nvisy-plugin-sql/src/providers/schemas.ts +++ /dev/null @@ -1,20 +0,0 @@ -import { z } from "zod"; - -/** - * Connection credentials shared by all SQL providers. - * - * Validated at graph parse time before any connection is attempted. - */ -export const SqlCredentials = z.object({ - /** Database server hostname or IP address. */ - host: z.string(), - /** Database server port. */ - port: z.number(), - /** Target database name. */ - database: z.string(), - /** Authentication username. */ - username: z.string(), - /** Authentication password. 
*/ - password: z.string(), -}); -export type SqlCredentials = z.infer<typeof SqlCredentials>; diff --git a/packages/nvisy-plugin-sql/src/streams/index.ts b/packages/nvisy-plugin-sql/src/streams/index.ts deleted file mode 100644 index b319f8d..0000000 --- a/packages/nvisy-plugin-sql/src/streams/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -export { read } from "./read.js"; -export { SqlCursor, SqlParams } from "./schemas.js"; -export { write } from "./write.js"; diff --git a/packages/nvisy-plugin-sql/src/streams/read.ts b/packages/nvisy-plugin-sql/src/streams/read.ts deleted file mode 100644 index 9d886e8..0000000 --- a/packages/nvisy-plugin-sql/src/streams/read.ts +++ /dev/null @@ -1,100 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import type { JsonValue, Resumable } from "@nvisy/core"; -import { RuntimeError, Stream } from "@nvisy/core"; -import { type SqlBool, sql } from "kysely"; -import { Row } from "../datatypes/index.js"; -import { KyselyClient } from "../providers/client.js"; -import { SqlCursor, SqlParams } from "./schemas.js"; - -const logger = getLogger(["nvisy", "sql"]); - -/** - * Keyset-paginated source stream that yields one {@link Row} at a time. - * - * Pages are fetched using a composite `(idColumn, tiebreaker)` cursor - * for stable ordering across batches. The stream terminates when a - * batch returns fewer rows than `batchSize`, or when `limit` rows - * have been yielded. - */ -export const read = Stream.createSource("read", KyselyClient, { - type: Row, - context: SqlCursor, - params: SqlParams, - reader: (client, cursor, params) => readStream(client, cursor, params), -}); - -async function* readStream( - client: KyselyClient, - cursor: SqlCursor, - params: SqlParams, -): AsyncIterable<Resumable<Row, SqlCursor>> { - const { table, columns, idColumn, tiebreaker, batchSize, limit } = params; - const { ref } = client.db.dynamic; - - logger.debug("Read stream opened on {table}", { - table, - idColumn, - tiebreaker, - batchSize, - ...(limit != null ? { limit } : {}), - }); - - let lastId = cursor.lastId; - let lastTiebreaker = cursor.lastTiebreaker; - let totalRows = 0; - - while (true) { - let rows: ReadonlyArray<Record<string, unknown>>; - - try { - let query = client.db - .selectFrom(table) - .orderBy(ref(idColumn), "asc") - .orderBy(ref(tiebreaker), "asc") - .limit(batchSize); - - if (columns.length > 0) { - query = query.select(columns.map((c) => ref(c))); - } else { - query = query.selectAll(); - } - - if (lastId !== null && lastTiebreaker !== null) { - query = query.where( - sql<SqlBool>`(${sql.ref(idColumn)}, ${sql.ref(tiebreaker)}) > (${lastId}, ${lastTiebreaker})`, - ); - } - - rows = await query.execute(); - logger.debug("Read batch returned {count} rows from {table}", { - count: rows.length, - table, - }); - } catch (error) { - logger.error("Read failed on {table}: {error}", { - table, - error: error instanceof Error ? error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "sql/read" }); - } - - for (const row of rows) { - totalRows++; - lastId = (row[idColumn] as string | number) ?? null; - lastTiebreaker = (row[tiebreaker] as string | number) ?? 
null; - yield { - data: new Row(row as Record<string, JsonValue>), - context: { lastId, lastTiebreaker } as SqlCursor, - }; - if (limit != null && totalRows >= limit) break; - } - - if (limit != null && totalRows >= limit) break; - if (rows.length < batchSize) break; - } - - logger.debug("Read stream closed on {table}, {totalRows} rows yielded", { - table, - totalRows, - }); -} diff --git a/packages/nvisy-plugin-sql/src/streams/schemas.ts b/packages/nvisy-plugin-sql/src/streams/schemas.ts deleted file mode 100644 index c19f68c..0000000 --- a/packages/nvisy-plugin-sql/src/streams/schemas.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { z } from "zod"; - -/** - * Per-node parameters that describe what to read from or write to. - * - * Attached to each provider node in the workflow graph. - */ -export const SqlParams = z.object({ - /** Target table name. */ - table: z.string(), - /** Columns to select (empty array = `SELECT *`). */ - columns: z.array(z.string()), - /** Primary sort column for keyset pagination (must be sequential / monotonic). */ - idColumn: z.string(), - /** Secondary sort column for stable ordering when `idColumn` values collide. */ - tiebreaker: z.string(), - /** Maximum rows per page during keyset pagination. */ - batchSize: z.number(), - /** Maximum total rows to yield. When omitted, all rows are read. */ - limit: z.number().int().positive().optional(), -}); -export type SqlParams = z.infer<typeof SqlParams>; - -/** - * Keyset pagination cursor for resumable reads. - * - * Both fields are `null` on the very first page and are updated after - * each yielded row. - */ -export const SqlCursor = z.object({ - /** Last seen value of the `idColumn`, or `null` for the first page. */ - lastId: z.union([z.number(), z.string(), z.null()]), - /** Last seen value of the `tiebreaker` column, or `null` for the first page. */ - lastTiebreaker: z.union([z.number(), z.string(), z.null()]), -}); -export type SqlCursor = z.infer<typeof SqlCursor>; diff --git a/packages/nvisy-plugin-sql/src/streams/write.ts b/packages/nvisy-plugin-sql/src/streams/write.ts deleted file mode 100644 index 9d63f46..0000000 --- a/packages/nvisy-plugin-sql/src/streams/write.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { RuntimeError, Stream } from "@nvisy/core"; -import { Row } from "../datatypes/index.js"; -import { KyselyClient } from "../providers/client.js"; -import { SqlParams } from "./schemas.js"; - -const logger = getLogger(["nvisy", "sql"]); - -/** - * Per-item insert target stream. - * - * Extracts the column map from each {@link Row} and writes it via - * a Kysely INSERT. Each element piped through the writer triggers - * an individual INSERT statement. - */ -export const write = Stream.createTarget("write", KyselyClient, { - type: Row, - params: SqlParams, - writer: (client, params) => async (item: Row) => { - const record = item.columns as Record<string, unknown>; - if (Object.keys(record).length === 0) return; - - try { - await client.db.insertInto(params.table).values(record).execute(); - logger.debug("Inserted row into {table}", { table: params.table }); - } catch (error) { - logger.error("Write failed on {table}: {error}", { - table: params.table, - error: error instanceof Error ? 
error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "sql/write" }); - } - }, -}); diff --git a/packages/nvisy-plugin-sql/tsconfig.json b/packages/nvisy-plugin-sql/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-sql/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-sql/tsup.config.ts b/packages/nvisy-plugin-sql/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-sql/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-tesseract/package.json b/packages/nvisy-plugin-tesseract/package.json deleted file mode 100644 index aabba40..0000000 --- a/packages/nvisy-plugin-tesseract/package.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "@nvisy/plugin-tesseract", - "version": "0.1.0", - "description": "Optical character recognition for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-tesseract/src/index.ts b/packages/nvisy-plugin-tesseract/src/index.ts deleted file mode 100644 index 8660d15..0000000 --- a/packages/nvisy-plugin-tesseract/src/index.ts +++ /dev/null @@ -1,13 +0,0 @@ -/** - * @module @nvisy/plugin-tesseract - * - * Optical character recognition plugin for the Nvisy runtime. - * - * Provides actions for extracting text from images and scanned - * documents using Tesseract. - */ - -import { Plugin } from "@nvisy/core"; - -/** Tesseract OCR plugin instance. 
*/ -export const tesseractPlugin = Plugin.define("tesseract"); diff --git a/packages/nvisy-plugin-tesseract/tsconfig.json b/packages/nvisy-plugin-tesseract/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-tesseract/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-tesseract/tsup.config.ts b/packages/nvisy-plugin-tesseract/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-tesseract/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-plugin-vector/README.md b/packages/nvisy-plugin-vector/README.md deleted file mode 100644 index e4247b8..0000000 --- a/packages/nvisy-plugin-vector/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# @nvisy/plugin-vector - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -Vector database plugin for the Nvisy runtime. - -## Features - -- **Pinecone, Qdrant, Milvus, Weaviate, and pgvector** providers with credential validation and connection lifecycle management -- **Vector upsert streams** for writing embeddings to vector databases -- **Vector search streams** for similarity queries -- **Metadata filtering** actions for vector search results - -## Overview - -Provides vector database integrations for embedding storage and similarity search. The plugin exposes: - -- **Providers** (`vector/pinecone`, `vector/qdrant`, `vector/milvus`, `vector/weaviate`, `vector/pgvector`): connection lifecycle management with credential validation. -- **Streams** (`vector/upsert`, `vector/search`): vector write and similarity search streams. -- **Actions** (`vector/filter`, `vector/rerank`): post-processing transforms for search results. - -## Usage - -```ts -import { vectorPlugin } from "@nvisy/plugin-vector"; - -registry.load(vectorPlugin); -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. 
- -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-plugin-vector/package.json b/packages/nvisy-plugin-vector/package.json deleted file mode 100644 index 1787197..0000000 --- a/packages/nvisy-plugin-vector/package.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "name": "@nvisy/plugin-vector", - "version": "0.1.0", - "description": "Vector database integrations for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "@pinecone-database/pinecone": "^7.0.0", - "@qdrant/js-client-rest": "^1.13.0", - "@zilliz/milvus2-sdk-node": "^2.5.0", - "pg": "^8.13.0", - "pgvector": "^0.2.0", - "weaviate-client": "^3.5.0", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-plugin-vector/src/index.ts b/packages/nvisy-plugin-vector/src/index.ts deleted file mode 100644 index 2b67dee..0000000 --- a/packages/nvisy-plugin-vector/src/index.ts +++ /dev/null @@ -1,33 +0,0 @@ -/** - * @module @nvisy/plugin-vector - * - * Vector database plugin for the Nvisy runtime. - * - * Exposes vector database providers (Pinecone, Qdrant, Milvus, Weaviate, pgvector) - * and an upsert target stream for writing embeddings to vector stores. - * - * @example - * ```ts - * import { vectorPlugin } from "@nvisy/plugin-vector"; - * - * engine.register(vectorPlugin); - * ``` - */ - -import { Plugin } from "@nvisy/core"; -import { - milvus, - pgvectorProvider, - pinecone, - qdrant, - weaviateProvider, -} from "./providers/index.js"; -import { upsert } from "./streams/index.js"; - -/** The Vector plugin: register this with the runtime to enable vector store providers and streams. */ -export const vectorPlugin = Plugin.define("vector") - .withProviders(pinecone, qdrant, milvus, weaviateProvider, pgvectorProvider) - .withStreams(upsert); - -export type { UpsertVector } from "./providers/index.js"; -export { VectorClient } from "./providers/index.js"; diff --git a/packages/nvisy-plugin-vector/src/providers/client.ts b/packages/nvisy-plugin-vector/src/providers/client.ts deleted file mode 100644 index 47efaa4..0000000 --- a/packages/nvisy-plugin-vector/src/providers/client.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { - Provider, - type ProviderFactory, - type ProviderInstance, -} from "@nvisy/core"; -import type { z } from "zod"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * A single vector to upsert into the vector store. - */ -export interface UpsertVector { - /** Unique identifier for this vector. */ - readonly id: string; - /** The dense embedding vector. */ - readonly vector: Float32Array | number[]; - /** Optional metadata to store alongside the vector. */ - readonly metadata?: Record<string, unknown> | undefined; -} - -/** - * Abstract client that vector-store streams use for I/O. - * - * Each provider (Pinecone, Qdrant, Milvus, Weaviate, pgvector) supplies a - * concrete subclass. 
The class reference is required by - * {@link Stream.createTarget} for runtime client-type matching. - */ -export abstract class VectorClient { - /** Upsert one or more vectors into the store. */ - abstract upsert(vectors: UpsertVector[]): Promise<void>; -} - -/** - * Connected vector-store provider instance. - * - * Holds a {@link VectorClient} and manages teardown on {@link disconnect}. - */ -export class VectorProvider implements ProviderInstance<VectorClient> { - readonly client: VectorClient; - readonly #id: string; - readonly #disconnect: (() => Promise<void>) | undefined; - - constructor( - client: VectorClient, - id: string, - disconnect?: () => Promise<void>, - ) { - this.client = client; - this.#id = id; - this.#disconnect = disconnect; - } - - async disconnect(): Promise<void> { - await this.#disconnect?.(); - logger.debug("Disconnected from {provider}", { provider: this.#id }); - } -} - -/** - * Create a vector-store {@link ProviderFactory} from a credential schema - * and a connect function. - */ -export const makeVectorProvider = <TCred>( - id: string, - credentials: z.ZodType<TCred>, - connect: (creds: TCred) => Promise<ProviderInstance<VectorClient>>, -): ProviderFactory<TCred, VectorClient> => - Provider.withAuthentication(id, { - credentials, - connect, - }); diff --git a/packages/nvisy-plugin-vector/src/providers/index.ts b/packages/nvisy-plugin-vector/src/providers/index.ts deleted file mode 100644 index 32d6383..0000000 --- a/packages/nvisy-plugin-vector/src/providers/index.ts +++ /dev/null @@ -1,11 +0,0 @@ -export { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; -export { type MilvusCredentials, milvus } from "./milvus.js"; -export { type PgvectorCredentials, pgvectorProvider } from "./pgvector.js"; -export { type PineconeCredentials, pinecone } from "./pinecone.js"; -export { type QdrantCredentials, qdrant } from "./qdrant.js"; -export { type WeaviateCredentials, weaviateProvider } from "./weaviate.js"; diff --git a/packages/nvisy-plugin-vector/src/providers/milvus.ts b/packages/nvisy-plugin-vector/src/providers/milvus.ts deleted file mode 100644 index 1a3db18..0000000 --- a/packages/nvisy-plugin-vector/src/providers/milvus.ts +++ /dev/null @@ -1,75 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { MilvusClient } from "@zilliz/milvus2-sdk-node"; -import { z } from "zod"; -import { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Credentials for connecting to Milvus / Zilliz. - */ -export const MilvusCredentials = z.object({ - /** Milvus server address (e.g. `"localhost:19530"`). */ - address: z.string(), - /** Optional authentication token. */ - token: z.string().optional(), - /** Name of the Milvus collection. */ - collectionName: z.string(), -}); -export type MilvusCredentials = z.infer<typeof MilvusCredentials>; - -class MilvusVectorClient extends VectorClient { - readonly #client: MilvusClient; - readonly #collectionName: string; - - constructor(client: MilvusClient, collectionName: string) { - super(); - this.#client = client; - this.#collectionName = collectionName; - } - - async upsert(vectors: UpsertVector[]): Promise<void> { - await this.#client.upsert({ - collection_name: this.#collectionName, - data: vectors.map((v) => ({ - id: v.id, - vector: [...v.vector], - ...v.metadata, - })), - }); - } -} - -/** Milvus / Zilliz vector database provider. 
*/ -export const milvus = makeVectorProvider( - "milvus", - MilvusCredentials, - async (creds) => { - logger.debug( - "Connecting to Milvus at {address} collection {collectionName}", - { - address: creds.address, - collectionName: creds.collectionName, - }, - ); - - const config: ConstructorParameters<typeof MilvusClient>[0] = { - address: creds.address, - }; - if (creds.token) { - config.token = creds.token; - } - - const client = new MilvusClient(config); - - return new VectorProvider( - new MilvusVectorClient(client, creds.collectionName), - "milvus", - ); - }, -); diff --git a/packages/nvisy-plugin-vector/src/providers/pgvector.ts b/packages/nvisy-plugin-vector/src/providers/pgvector.ts deleted file mode 100644 index f8e2499..0000000 --- a/packages/nvisy-plugin-vector/src/providers/pgvector.ts +++ /dev/null @@ -1,73 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import pg from "pg"; -import pgvector from "pgvector"; -import { z } from "zod"; -import { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Credentials for connecting to PostgreSQL with pgvector. - */ -export const PgvectorCredentials = z.object({ - /** PostgreSQL connection string (e.g. `"postgresql://user:pass@host/db"`). */ - connectionString: z.string(), - /** Table name to store vectors in. */ - tableName: z.string(), -}); -export type PgvectorCredentials = z.infer<typeof PgvectorCredentials>; - -class PgVectorClient extends VectorClient { - readonly #pool: pg.Pool; - readonly #tableName: string; - - constructor(pool: pg.Pool, tableName: string) { - super(); - this.#pool = pool; - this.#tableName = tableName; - } - - async upsert(vectors: UpsertVector[]): Promise<void> { - const client = await this.#pool.connect(); - try { - for (const v of vectors) { - const embedding = pgvector.toSql([...v.vector]); - await client.query( - `INSERT INTO ${this.#tableName} (id, embedding, metadata) - VALUES ($1, $2, $3) - ON CONFLICT (id) DO UPDATE SET embedding = $2, metadata = $3`, - [v.id, embedding, JSON.stringify(v.metadata ?? {})], - ); - } - } finally { - client.release(); - } - } -} - -/** PostgreSQL + pgvector provider. */ -export const pgvectorProvider = makeVectorProvider( - "pgvector", - PgvectorCredentials, - async (creds) => { - logger.debug("Connecting to pgvector table {tableName}", { - tableName: creds.tableName, - }); - - const pool = new pg.Pool({ connectionString: creds.connectionString }); - await pool.query("CREATE EXTENSION IF NOT EXISTS vector"); - - return new VectorProvider( - new PgVectorClient(pool, creds.tableName), - "pgvector", - async () => { - await pool.end(); - }, - ); - }, -); diff --git a/packages/nvisy-plugin-vector/src/providers/pinecone.ts b/packages/nvisy-plugin-vector/src/providers/pinecone.ts deleted file mode 100644 index baba0a1..0000000 --- a/packages/nvisy-plugin-vector/src/providers/pinecone.ts +++ /dev/null @@ -1,57 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { Pinecone } from "@pinecone-database/pinecone"; -import { z } from "zod"; -import { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Credentials for connecting to Pinecone. - */ -export const PineconeCredentials = z.object({ - /** Pinecone API key. */ - apiKey: z.string(), - /** Name of the Pinecone index. 
*/ - indexName: z.string(), -}); -export type PineconeCredentials = z.infer<typeof PineconeCredentials>; - -class PineconeVectorClient extends VectorClient { - readonly #index: ReturnType<Pinecone["index"]>; - - constructor(index: ReturnType<Pinecone["index"]>) { - super(); - this.#index = index; - } - - async upsert(vectors: UpsertVector[]): Promise<void> { - await this.#index.upsert({ - records: vectors.map((v) => ({ - id: v.id, - values: [...v.vector], - metadata: v.metadata as Record<string, string>, - })), - }); - } -} - -/** Pinecone vector database provider. */ -export const pinecone = makeVectorProvider( - "pinecone", - PineconeCredentials, - async (creds) => { - logger.debug("Connecting to Pinecone index {indexName}", { - indexName: creds.indexName, - }); - - const client = new Pinecone({ apiKey: creds.apiKey }); - const index = client.index(creds.indexName); - - return new VectorProvider(new PineconeVectorClient(index), "pinecone"); - }, -); diff --git a/packages/nvisy-plugin-vector/src/providers/qdrant.ts b/packages/nvisy-plugin-vector/src/providers/qdrant.ts deleted file mode 100644 index 16fc149..0000000 --- a/packages/nvisy-plugin-vector/src/providers/qdrant.ts +++ /dev/null @@ -1,71 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { QdrantClient } from "@qdrant/js-client-rest"; -import { z } from "zod"; -import { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Credentials for connecting to Qdrant. - */ -export const QdrantCredentials = z.object({ - /** Qdrant server URL. */ - url: z.string(), - /** Optional API key. */ - apiKey: z.string().optional(), - /** Name of the Qdrant collection. */ - collectionName: z.string(), -}); -export type QdrantCredentials = z.infer<typeof QdrantCredentials>; - -class QdrantVectorClient extends VectorClient { - readonly #client: QdrantClient; - readonly #collectionName: string; - - constructor(client: QdrantClient, collectionName: string) { - super(); - this.#client = client; - this.#collectionName = collectionName; - } - - async upsert(vectors: UpsertVector[]): Promise<void> { - await this.#client.upsert(this.#collectionName, { - points: vectors.map((v) => ({ - id: v.id, - vector: [...v.vector], - payload: v.metadata ?? {}, - })), - }); - } -} - -/** Qdrant vector database provider. 
*/ -export const qdrant = makeVectorProvider( - "qdrant", - QdrantCredentials, - async (creds) => { - logger.debug("Connecting to Qdrant at {url} collection {collectionName}", { - url: creds.url, - collectionName: creds.collectionName, - }); - - const config: ConstructorParameters<typeof QdrantClient>[0] = { - url: creds.url, - }; - if (creds.apiKey) { - config.apiKey = creds.apiKey; - } - - const client = new QdrantClient(config); - - return new VectorProvider( - new QdrantVectorClient(client, creds.collectionName), - "qdrant", - ); - }, -); diff --git a/packages/nvisy-plugin-vector/src/providers/weaviate.ts b/packages/nvisy-plugin-vector/src/providers/weaviate.ts deleted file mode 100644 index 4755c8f..0000000 --- a/packages/nvisy-plugin-vector/src/providers/weaviate.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import weaviate, { type WeaviateClient } from "weaviate-client"; -import { z } from "zod"; -import { - makeVectorProvider, - type UpsertVector, - VectorClient, - VectorProvider, -} from "./client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Credentials for connecting to Weaviate. - */ -export const WeaviateCredentials = z.object({ - /** Weaviate host (e.g. `"localhost:8080"`). */ - host: z.string(), - /** Weaviate gRPC port (defaults to 50051). */ - grpcPort: z.number().default(50051), - /** Optional API key. */ - apiKey: z.string().optional(), - /** Name of the Weaviate collection (class). */ - collectionName: z.string(), -}); -export type WeaviateCredentials = z.infer<typeof WeaviateCredentials>; - -class WeaviateVectorClient extends VectorClient { - readonly #client: WeaviateClient; - readonly #collectionName: string; - - constructor(client: WeaviateClient, collectionName: string) { - super(); - this.#client = client; - this.#collectionName = collectionName; - } - - async upsert(vectors: UpsertVector[]): Promise<void> { - const collection = this.#client.collections.get(this.#collectionName); - await collection.data.insertMany( - vectors.map((v) => ({ - properties: (v.metadata ?? {}) as Record<string, never>, - vectors: [...v.vector], - })), - ); - } -} - -/** Weaviate vector database provider. 
*/ -export const weaviateProvider = makeVectorProvider( - "weaviate", - WeaviateCredentials, - async (creds) => { - logger.debug( - "Connecting to Weaviate at {host} collection {collectionName}", - { - host: creds.host, - collectionName: creds.collectionName, - }, - ); - - const connectOpts: Parameters<typeof weaviate.connectToLocal>[0] = { - host: creds.host, - grpcPort: creds.grpcPort, - }; - if (creds.apiKey) { - connectOpts.authCredentials = new weaviate.ApiKey(creds.apiKey); - } - const client = await weaviate.connectToLocal(connectOpts); - - return new VectorProvider( - new WeaviateVectorClient(client, creds.collectionName), - "weaviate", - async () => client.close(), - ); - }, -); diff --git a/packages/nvisy-plugin-vector/src/streams/index.ts b/packages/nvisy-plugin-vector/src/streams/index.ts deleted file mode 100644 index 8f7b108..0000000 --- a/packages/nvisy-plugin-vector/src/streams/index.ts +++ /dev/null @@ -1 +0,0 @@ -export { upsert } from "./upsert.js"; diff --git a/packages/nvisy-plugin-vector/src/streams/upsert.ts b/packages/nvisy-plugin-vector/src/streams/upsert.ts deleted file mode 100644 index 4cb4f7e..0000000 --- a/packages/nvisy-plugin-vector/src/streams/upsert.ts +++ /dev/null @@ -1,44 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { Embedding, RuntimeError, Stream } from "@nvisy/core"; -import { z } from "zod"; -import { VectorClient } from "../providers/client.js"; - -const logger = getLogger(["nvisy", "vector"]); - -/** - * Per-node parameters for the vector upsert stream. - */ -export const UpsertParams = z.object({}); -export type UpsertParams = z.infer<typeof UpsertParams>; - -/** - * Target stream that upserts each {@link Embedding} into the vector store - * via the provider client's `upsert` method. - */ -export const upsert = Stream.createTarget("upsert", VectorClient, { - type: Embedding, - params: UpsertParams, - writer: - (client: VectorClient, _params: UpsertParams) => - async (item: Embedding) => { - try { - await client.upsert([ - { - id: item.id, - vector: item.vector, - metadata: item.metadata ?? undefined, - }, - ]); - logger.debug("Upserted vector {id} ({dims} dims)", { - id: item.id, - dims: item.dimensions, - }); - } catch (error) { - logger.error("Upsert failed for {id}: {error}", { - id: item.id, - error: error instanceof Error ? 
error.message : String(error), - }); - throw RuntimeError.wrap(error, { source: "vector/upsert" }); - } - }, -}); diff --git a/packages/nvisy-plugin-vector/tsconfig.json b/packages/nvisy-plugin-vector/tsconfig.json deleted file mode 100644 index c91a2dd..0000000 --- a/packages/nvisy-plugin-vector/tsconfig.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [{ "path": "../nvisy-core" }] -} diff --git a/packages/nvisy-plugin-vector/tsup.config.ts b/packages/nvisy-plugin-vector/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-plugin-vector/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-runtime/README.md b/packages/nvisy-runtime/README.md deleted file mode 100644 index 2c842fa..0000000 --- a/packages/nvisy-runtime/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# @nvisy/runtime - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -Graph definition, DAG compiler, and execution engine for the Nvisy platform. - -## Features - -- **Graph schema**: JSON-based pipeline definitions with source, action, and target nodes -- **DAG compiler**: validates graph structure, detects cycles, and produces execution plans -- **Execution engine**: runs pipelines with Effection-based structured concurrency -- **Retry policies**: configurable backoff strategies for transient failures -- **Timeout policies**: per-node execution time limits - -## Overview - -Parses JSON graph definitions into an immutable execution plan, then runs it — walking the DAG in topological order with Effection-based concurrency, retry policies, and timeout handling. - -- **Schema** (`Graph`, `SourceNode`, `ActionNode`, `TargetNode`): Zod schemas for validating pipeline definitions. -- **Compiler** (`compile`): transforms a graph into an execution plan with resolved registry entries. -- **Engine** (`Engine`): validates, compiles, and executes graphs with connection management. - -## Usage - -### Registering Plugins - -```ts -import { Engine } from "@nvisy/runtime"; -import { sqlPlugin } from "@nvisy/plugin-sql"; - -const engine = new Engine().register(sqlPlugin); -``` - -### Validating a Graph - -```ts -const result = engine.validate(graphDefinition, connections); - -if (!result.valid) { - console.error(result.errors); -} -``` - -### Executing a Graph - -```ts -const result = await engine.execute(graphDefinition, connections, { - signal: abortController.signal, - onContextUpdate: (nodeId, credId, ctx) => { - // Persist resumption context - }, -}); - -console.log(result.status); // "success" | "partial_failure" | "failure" -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. 
- -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-runtime/package.json b/packages/nvisy-runtime/package.json deleted file mode 100644 index b74d8cd..0000000 --- a/packages/nvisy-runtime/package.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "name": "@nvisy/runtime", - "version": "0.1.0", - "description": "Graph definition, DAG compilation, and execution engine for the Nvisy platform", - "type": "module", - "exports": { - ".": { - "source": "./src/index.ts", - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "typecheck": "tsc -b" - }, - "dependencies": { - "@logtape/logtape": "^2.0.2", - "@nvisy/core": "*", - "@nvisy/plugin-core": "*", - "effection": "^4.0.2", - "graphology": "^0.26.0", - "graphology-dag": "^0.4.1", - "graphology-types": "^0.24.8", - - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-runtime/src/compiler/index.ts b/packages/nvisy-runtime/src/compiler/index.ts deleted file mode 100644 index 25d4ebb..0000000 --- a/packages/nvisy-runtime/src/compiler/index.ts +++ /dev/null @@ -1,26 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import type { Registry } from "../registry.js"; -import { parseGraph } from "./parse.js"; -import type { ExecutionPlan } from "./plan.js"; -import { buildPlan } from "./plan.js"; - -const logger = getLogger(["nvisy", "compiler"]); - -export type { - ExecutionPlan, - ResolvedActionNode, - ResolvedNode, - ResolvedSourceNode, - ResolvedTargetNode, -} from "./plan.js"; - -/** Compile a graph definition into an execution plan. */ -export function compile(input: unknown, registry: Registry): ExecutionPlan { - logger.info("Compiling graph"); - const parsed = parseGraph(input); - const plan = buildPlan(parsed, registry); - logger.info("Graph {graphId} compiled successfully", { - graphId: plan.definition.id, - }); - return plan; -} diff --git a/packages/nvisy-runtime/src/compiler/parse.ts b/packages/nvisy-runtime/src/compiler/parse.ts deleted file mode 100644 index 5dec74c..0000000 --- a/packages/nvisy-runtime/src/compiler/parse.ts +++ /dev/null @@ -1,61 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { ValidationError } from "@nvisy/core"; -import { DirectedGraph } from "graphology"; -import { Graph, type GraphNode } from "../schema.js"; - -const logger = getLogger(["nvisy", "compiler"]); - -/** Node attributes stored in the runtime graph. */ -export interface RuntimeNodeAttrs { - readonly schema: GraphNode; -} - -/** Graphology directed graph with typed node and edge attributes. */ -export type RuntimeGraph = DirectedGraph<RuntimeNodeAttrs>; - -/** Result of parsing a graph definition. */ -export interface ParsedGraph { - readonly definition: Graph; - readonly graph: RuntimeGraph; -} - -/** Convert a parsed Graph into a graphology DirectedGraph. 
*/ -function buildRuntimeGraph(def: Graph): RuntimeGraph { - const graph: RuntimeGraph = new DirectedGraph(); - - for (const node of def.nodes) { - graph.addNode(node.id, { schema: node }); - } - - for (const edge of def.edges) { - graph.addEdgeWithKey(`${edge.from}->${edge.to}`, edge.from, edge.to); - } - - return graph; -} - -/** Parse and validate a graph definition from unknown input. */ -export function parseGraph(input: unknown): ParsedGraph { - const result = Graph.safeParse(input); - if (!result.success) { - logger.warn("Graph parse failed: {error}", { error: result.error.message }); - throw new ValidationError(`Graph parse error: ${result.error.message}`, { - source: "compiler", - retryable: false, - }); - } - - const definition = result.data; - logger.debug( - "Graph parsed: {graphId} ({nodeCount} nodes, {edgeCount} edges)", - { - graphId: definition.id, - nodeCount: definition.nodes.length, - edgeCount: definition.edges.length, - }, - ); - return { - definition, - graph: buildRuntimeGraph(definition), - }; -} diff --git a/packages/nvisy-runtime/src/compiler/plan.ts b/packages/nvisy-runtime/src/compiler/plan.ts deleted file mode 100644 index ecd1381..0000000 --- a/packages/nvisy-runtime/src/compiler/plan.ts +++ /dev/null @@ -1,196 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import type { - AnyActionInstance, - AnyProviderFactory, - AnyStreamSource, - AnyStreamTarget, -} from "@nvisy/core"; -import { ValidationError } from "@nvisy/core"; -import { hasCycle, topologicalSort } from "graphology-dag"; -import type { Registry } from "../registry.js"; -import type { Graph, GraphNode } from "../schema.js"; -import type { ParsedGraph, RuntimeGraph } from "./parse.js"; - -const logger = getLogger(["nvisy", "compiler"]); - -/** Resolved source node with provider and stream references. */ -export interface ResolvedSourceNode { - readonly type: "source"; - readonly provider: AnyProviderFactory; - readonly stream: AnyStreamSource; - readonly connection: string; - readonly params: Readonly<Record<string, unknown>>; -} - -/** Resolved action node with action reference. */ -export interface ResolvedActionNode { - readonly type: "action"; - readonly action: AnyActionInstance; - readonly provider?: AnyProviderFactory; - readonly connection?: string; - readonly params: Readonly<Record<string, unknown>>; -} - -/** Resolved target node with provider and stream references. */ -export interface ResolvedTargetNode { - readonly type: "target"; - readonly provider: AnyProviderFactory; - readonly stream: AnyStreamTarget; - readonly connection: string; - readonly params: Readonly<Record<string, unknown>>; -} - -/** A resolved registry entry carried with the node for execution. */ -export type ResolvedNode = - | ResolvedSourceNode - | ResolvedActionNode - | ResolvedTargetNode; - -/** Compiled graph ready for execution. */ -export interface ExecutionPlan { - readonly graph: RuntimeGraph; - readonly definition: Graph; - readonly order: ReadonlyArray<string>; - readonly resolved: ReadonlyMap<string, ResolvedNode>; -} - -/** Build an execution plan from a parsed graph. 
*/ -export function buildPlan( - parsed: ParsedGraph, - registry: Registry, -): ExecutionPlan { - const { definition, graph } = parsed; - - if (hasCycle(graph)) { - logger.warn("Graph contains a cycle", { graphId: definition.id }); - throw new ValidationError("Graph contains a cycle", { - source: "compiler", - retryable: false, - }); - } - - const resolved = resolveAllNodes(definition.nodes, registry, definition.id); - const order = topologicalSort(graph); - - logger.debug("Execution plan built", { - graphId: definition.id, - order: order.join(" → "), - }); - - return { graph, definition, order, resolved }; -} - -function resolveAllNodes( - nodes: ReadonlyArray<GraphNode>, - registry: Registry, - graphId: string, -): Map<string, ResolvedNode> { - const resolved = new Map<string, ResolvedNode>(); - const unresolved: string[] = []; - - for (const node of nodes) { - const entry = resolveNode(node, registry, unresolved); - if (entry) { - resolved.set(node.id, entry); - } - } - - if (unresolved.length > 0) { - logger.warn("Unresolved names: {names}", { - graphId, - names: unresolved.join(", "), - }); - throw new ValidationError(`Unresolved names: ${unresolved.join(", ")}`, { - source: "compiler", - retryable: false, - }); - } - - return resolved; -} - -function resolveNode( - node: GraphNode, - registry: Registry, - unresolved: string[], -): ResolvedNode | undefined { - switch (node.type) { - case "source": { - const provider = registry.findProvider(node.provider); - const stream = registry.findStream(node.stream); - if (!provider) { - unresolved.push(`provider "${node.provider}" (node ${node.id})`); - } - if (!stream) { - unresolved.push(`stream "${node.stream}" (node ${node.id})`); - } else if (stream.kind !== "source") { - unresolved.push( - `stream "${node.stream}" is not a source (node ${node.id})`, - ); - } - if (provider && stream?.kind === "source") { - return { - type: "source", - provider, - stream, - connection: node.connection, - params: node.params as Readonly<Record<string, unknown>>, - }; - } - return undefined; - } - case "target": { - const provider = registry.findProvider(node.provider); - const stream = registry.findStream(node.stream); - if (!provider) { - unresolved.push(`provider "${node.provider}" (node ${node.id})`); - } - if (!stream) { - unresolved.push(`stream "${node.stream}" (node ${node.id})`); - } else if (stream.kind !== "target") { - unresolved.push( - `stream "${node.stream}" is not a target (node ${node.id})`, - ); - } - if (provider && stream?.kind === "target") { - return { - type: "target", - provider, - stream, - connection: node.connection, - params: node.params as Readonly<Record<string, unknown>>, - }; - } - return undefined; - } - case "action": { - const action = registry.findAction(node.action); - if (!action) { - unresolved.push(`action "${node.action}" (node ${node.id})`); - return undefined; - } - - if (node.provider) { - const provider = registry.findProvider(node.provider); - if (!provider) { - unresolved.push(`provider "${node.provider}" (node ${node.id})`); - return undefined; - } - - return { - type: "action", - action, - provider, - ...(node.connection != null ? 
{ connection: node.connection } : {}), - params: node.params as Readonly<Record<string, unknown>>, - }; - } - - return { - type: "action", - action, - params: node.params as Readonly<Record<string, unknown>>, - }; - } - } -} diff --git a/packages/nvisy-runtime/src/engine/bridge.ts b/packages/nvisy-runtime/src/engine/bridge.ts deleted file mode 100644 index cc195f4..0000000 --- a/packages/nvisy-runtime/src/engine/bridge.ts +++ /dev/null @@ -1,98 +0,0 @@ -/** - * Loader bridge for automatic Blob → Document conversion. - * - * When a source node produces {@link Blob}s but downstream action or - * target nodes expect {@link Document}s, the bridge transparently - * selects a matching loader from the registry (by file extension and - * magic-byte content type), converts each blob, and yields the - * resulting documents. Converted documents are cached by blob ID in a - * per-run {@link LoaderCache} so the same blob is never loaded twice - * even when consumed by multiple downstream branches. - * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import { Blob, type Data, type Document, RuntimeError } from "@nvisy/core"; -import type { Registry } from "../registry.js"; - -const logger = getLogger(["nvisy", "bridge"]); - -/** Cache for converted documents, shared across an execution run. */ -export type LoaderCache = Map<string, Document[]>; - -/** Creates a new empty loader cache for an execution run. */ -export function createLoaderCache(): LoaderCache { - return new Map(); -} - -/** Options for the loader bridge. */ -export interface BridgeOptions { - /** When true, skip blobs with no matching loader instead of throwing. */ - readonly ignoreUnsupported?: boolean; -} - -/** - * Wrap an async iterable to automatically convert Blobs to Documents. - * - * Non-Blob items pass through unchanged. For each Blob the registry - * is queried for a loader that matches the file's extension / content - * type. If no loader is found, behaviour depends on - * {@link BridgeOptions.ignoreUnsupported}: when true the blob is - * silently dropped; otherwise a {@link RuntimeError} is thrown. - * - * @param stream - Upstream data items (may contain a mix of Blobs and other types). - * @param registry - Used to look up loaders by extension / magic bytes. - * @param cache - Per-run cache; blobs already converted are yielded from cache. - * @param options - Optional bridge configuration. - */ -export async function* applyLoaderBridge( - stream: AsyncIterable<Data>, - registry: Registry, - cache: LoaderCache, - options?: BridgeOptions, -): AsyncIterable<Data> { - for await (const item of stream) { - if (!(item instanceof Blob)) { - yield item; - continue; - } - - const cached = cache.get(item.id); - if (cached) { - logger.debug("Using cached documents for blob {id}", { id: item.id }); - for (const doc of cached) { - yield doc; - } - continue; - } - - const loader = registry.findLoaderForBlob(item); - - if (!loader) { - if (options?.ignoreUnsupported) { - logger.warn("No loader found for blob {path}, skipping", { - path: item.path, - }); - continue; - } - throw new RuntimeError( - `No loader found for blob: ${item.path} (mime: ${item.provided.mime ?? 
"unknown"})`, - { source: "bridge", retryable: false }, - ); - } - - logger.debug("Converting blob {path} using loader {loader}", { - path: item.path, - loader: loader.id, - }); - - const docs: Document[] = []; - const params = loader.schema.parse({}); - for await (const doc of loader.load(item, params)) { - docs.push(doc); - yield doc; - } - cache.set(item.id, docs); - } -} diff --git a/packages/nvisy-runtime/src/engine/connections.ts b/packages/nvisy-runtime/src/engine/connections.ts deleted file mode 100644 index 45d3baf..0000000 --- a/packages/nvisy-runtime/src/engine/connections.ts +++ /dev/null @@ -1,136 +0,0 @@ -/** - * Connection validation and types. - * - * A "connection" pairs a provider type with its credentials (and an - * optional resumption context). Before graph execution, every - * connection referenced by the plan is validated upfront against its - * provider's Zod credential schema via {@link validateConnections}, - * ensuring misconfigured credentials surface early rather than - * mid-pipeline. - * - * @module - */ - -import type { AnyProviderFactory } from "@nvisy/core"; -import { ValidationError } from "@nvisy/core"; -import { z } from "zod"; -import type { ExecutionPlan } from "../compiler/index.js"; -import type { - ResolvedActionNode, - ResolvedNode, - ResolvedSourceNode, - ResolvedTargetNode, -} from "../compiler/plan.js"; - -/** Schema for a single connection entry. */ -export const ConnectionSchema = z.object({ - /** Provider type identifier (e.g., "postgres", "s3"). */ - type: z.string(), - /** Provider-specific credentials (validated against provider schema at runtime). */ - credentials: z.unknown(), - /** Optional resumption context for crash recovery. */ - context: z.unknown(), -}); - -/** Schema for the connections map (UUID keys). */ -export const ConnectionsSchema = z.record(z.uuid(), ConnectionSchema); - -/** A connection entry with credentials for a specific provider. */ -export type Connection = z.infer<typeof ConnectionSchema>; - -/** - * Map of connection ID (UUID) to connection configuration. - * - * Connections are referenced by nodes in the graph definition. - * Each connection specifies credentials that are validated against - * the provider's credential schema before execution. - */ -export type Connections = z.infer<typeof ConnectionsSchema>; - -/** - * A connection whose credentials have passed provider-schema validation. - * - * Created by {@link validateConnections} before execution starts. - * `credentials` is the Zod-parsed output (defaults applied, types - * narrowed), ready to be passed directly to `provider.connect()`. - */ -export interface ValidatedConnection { - /** The provider factory that owns this connection's credential schema. */ - readonly provider: AnyProviderFactory; - /** Parsed credentials (output of `provider.credentialSchema.parse`). */ - readonly credentials: unknown; - /** Optional resumption context carried from a previous run. */ - readonly context: unknown; -} - -interface ResolvedWithConnection { - readonly provider: AnyProviderFactory; - readonly connection: string; -} - -function hasConnection( - resolved: ResolvedNode, -): resolved is (ResolvedSourceNode | ResolvedTargetNode | ResolvedActionNode) & - ResolvedWithConnection { - return "connection" in resolved && resolved.connection !== undefined; -} - -/** - * Validate every connection referenced by the execution plan. 
- * - * Iterates through each plan node that has an associated connection, - * resolves the connection entry from the `connections` map, and parses - * its credentials against the provider's Zod schema. Missing or - * invalid entries are collected and thrown as a single - * {@link ValidationError} so callers see all problems at once. - * - * @returns Map of connection ID → validated connection, ready for execution. - * @throws {ValidationError} If any connection is missing or has invalid credentials. - */ -export function validateConnections( - plan: ExecutionPlan, - connections: Connections, -): Map<string, ValidatedConnection> { - const validated = new Map<string, ValidatedConnection>(); - const errors: string[] = []; - - for (const nodeId of plan.order) { - const resolved = plan.resolved.get(nodeId); - if (!resolved || !hasConnection(resolved)) continue; - - const connId = resolved.connection; - if (validated.has(connId)) continue; - - const conn = connections[connId]; - if (!conn) { - errors.push(`Missing connection "${connId}" for node ${nodeId}`); - continue; - } - - const result = resolved.provider.credentialSchema.safeParse( - conn.credentials, - ); - if (!result.success) { - errors.push( - `Invalid credentials for connection "${connId}": ${result.error.message}`, - ); - continue; - } - - validated.set(connId, { - provider: resolved.provider, - credentials: result.data, - context: conn.context, - }); - } - - if (errors.length > 0) { - throw new ValidationError(errors.join("; "), { - source: "engine", - retryable: false, - details: { errors }, - }); - } - - return validated; -} diff --git a/packages/nvisy-runtime/src/engine/context.ts b/packages/nvisy-runtime/src/engine/context.ts deleted file mode 100644 index 361960a..0000000 --- a/packages/nvisy-runtime/src/engine/context.ts +++ /dev/null @@ -1,169 +0,0 @@ -/** - * Execution context and edge graph construction. - * - * The {@link ExecutionContext} is the single object threaded through - * every node executor during a run. It carries the compiled plan, - * validated connections, Effection edge queues, and convenience - * accessors for looking up nodes and connections by ID. - * - * {@link buildEdges} converts the plan's DAG edges into pairs of - * Effection {@link Queue}s (one per edge direction) that enable - * backpressure-aware streaming between producer and consumer nodes. - * - * @module - */ - -import type { Data } from "@nvisy/core"; -import { ValidationError } from "@nvisy/core"; -import { createQueue, type Queue } from "effection"; -import type { ExecutionPlan } from "../compiler/index.js"; -import type { ResolvedNode } from "../compiler/plan.js"; -import type { Registry } from "../registry.js"; -import type { GraphNode } from "../schema.js"; -import type { LoaderCache } from "./bridge.js"; -import type { ValidatedConnection } from "./connections.js"; -import type { ExecuteOptions } from "./executor.js"; - -/** - * An edge in the execution graph. - * - * Edges connect nodes and carry data via an Effection queue. - * The queue enables backpressure-aware streaming between nodes. - */ -export interface Edge { - readonly from: string; - readonly to: string; - readonly queue: Queue<Data, void>; -} - -/** - * Context passed through the execution of a graph. - * - * Provides access to the execution plan, validated connections, - * edge queues, and helper methods for retrieving node information. 
- */ -export interface ExecutionContext { - readonly runId: string; - readonly plan: ExecutionPlan; - readonly connections: ReadonlyMap<string, ValidatedConnection>; - readonly inEdges: ReadonlyMap<string, Edge[]>; - readonly outEdges: ReadonlyMap<string, Edge[]>; - readonly options: ExecuteOptions | undefined; - readonly registry: Registry; - readonly loaderCache: LoaderCache; - - /** Look up a node's raw graph schema. Throws {@link ValidationError} if missing. */ - getNode(nodeId: string): GraphNode; - /** Look up a node's compiler-resolved metadata. Throws {@link ValidationError} if missing. */ - getResolved(nodeId: string): ResolvedNode; - /** Look up the validated connection for a provider-backed node. Throws {@link ValidationError} if the node has no connection or the connection is missing. */ - getConnection(nodeId: string): ValidatedConnection; -} - -/** Configuration for creating an execution context. */ -export interface ContextConfig { - readonly runId: string; - readonly plan: ExecutionPlan; - readonly connections: ReadonlyMap<string, ValidatedConnection>; - readonly inEdges: ReadonlyMap<string, Edge[]>; - readonly outEdges: ReadonlyMap<string, Edge[]>; - readonly registry: Registry; - readonly loaderCache: LoaderCache; - readonly options?: ExecuteOptions; -} - -/** - * Build edge maps from the execution plan. - * - * Creates Effection queues for each edge in the graph. - * These queues enable backpressure-aware streaming between nodes. - */ -export function buildEdges(plan: ExecutionPlan): { - inEdges: Map<string, Edge[]>; - outEdges: Map<string, Edge[]>; -} { - const inEdges = new Map<string, Edge[]>(); - const outEdges = new Map<string, Edge[]>(); - - for (const id of plan.order) { - inEdges.set(id, []); - outEdges.set(id, []); - } - - for (const entry of plan.graph.edgeEntries()) { - const edge: Edge = { - from: entry.source, - to: entry.target, - queue: createQueue<Data, void>(), - }; - outEdges.get(entry.source)!.push(edge); - inEdges.get(entry.target)!.push(edge); - } - - return { inEdges, outEdges }; -} - -/** Create an execution context for a graph run. 
*/ -export function createContext(config: ContextConfig): ExecutionContext { - const { - runId, - plan, - connections, - inEdges, - outEdges, - registry, - loaderCache, - options, - } = config; - - return { - runId, - plan, - connections, - inEdges, - outEdges, - options, - registry, - loaderCache, - - getNode(nodeId: string): GraphNode { - const node = plan.graph.getNodeAttributes(nodeId).schema; - if (!node) { - throw new ValidationError(`Node not found: ${nodeId}`, { - source: "engine", - retryable: false, - }); - } - return node; - }, - - getResolved(nodeId: string): ResolvedNode { - const resolved = plan.resolved.get(nodeId); - if (!resolved) { - throw new ValidationError(`Resolved node not found: ${nodeId}`, { - source: "engine", - retryable: false, - }); - } - return resolved; - }, - - getConnection(nodeId: string): ValidatedConnection { - const resolved = plan.resolved.get(nodeId); - if (!resolved || !("connection" in resolved) || !resolved.connection) { - throw new ValidationError(`Node ${nodeId} has no connection`, { - source: "engine", - retryable: false, - }); - } - const conn = connections.get(resolved.connection); - if (!conn) { - throw new ValidationError( - `Connection not found: ${resolved.connection}`, - { source: "engine", retryable: false }, - ); - } - return conn; - }, - }; -} diff --git a/packages/nvisy-runtime/src/engine/engine.ts b/packages/nvisy-runtime/src/engine/engine.ts deleted file mode 100644 index 3f0b678..0000000 --- a/packages/nvisy-runtime/src/engine/engine.ts +++ /dev/null @@ -1,243 +0,0 @@ -/** - * Primary runtime entry point. - * - * Coordinates plugin registration, graph validation, and execution. - * The Engine auto-loads {@link corePlugin} (Document, Blob, Chunk, - * Embedding datatypes plus chunk/partition actions) at construction. - * Additional plugins are registered via {@link Engine.register}. - * - * Delegates graph execution to the {@link execute executor} and run - * tracking to the {@link RunManager}. - * - * @example - * ```ts - * const engine = new Engine() - * .register(sqlPlugin) - * .register(aiPlugin); - * - * // Background execution with run tracking - * const runId = engine.execute(graph, connections); - * const state = engine.getRun(runId); - * - * // Synchronous execution (blocks until completion) - * const result = await engine.executeSync(graph, connections); - * ``` - * - * @module - */ - -import type { PluginInstance } from "@nvisy/core"; -import { ValidationError } from "@nvisy/core"; -import { corePlugin } from "@nvisy/plugin-core"; -import { compile, type ExecutionPlan } from "../compiler/index.js"; -import { Registry, type RegistrySchema } from "../registry.js"; -import { - type Connections, - ConnectionsSchema, - validateConnections, -} from "./connections.js"; -import { type ExecuteOptions, execute, type RunResult } from "./executor.js"; -import { - RunManager, - type RunState, - type RunStatus, - type RunSummary, -} from "./runs.js"; - -/** - * Result of graph validation. - * - * Returned by {@link Engine.validate}. When `valid` is false, `errors` - * contains human-readable descriptions of every issue found (graph - * structure problems, missing connections, credential schema mismatches). - */ -export interface ValidationResult { - /** Whether the graph and its connections passed all checks. */ - readonly valid: boolean; - /** Validation error messages (empty when valid). */ - readonly errors: ReadonlyArray<string>; -} - -/** - * Central orchestrator for pipeline registration, validation, and execution. 
- * - * The constructor pre-loads {@link corePlugin} so the built-in datatypes - * (Document, Blob, Chunk, Embedding) and actions (chunk, partition) are - * always available. Call {@link register} to add provider and action - * plugins before executing graphs. - * - * Execution modes: - * - {@link execute} — fire-and-forget; returns a `runId` for polling via - * {@link getRun}, {@link listRuns}, and {@link cancelRun}. - * - {@link executeSync} — awaitable; resolves with the full - * {@link RunResult} when the graph finishes. - */ -export class Engine { - readonly #registry = new Registry(); - readonly #runs = new RunManager(); - - /** Pre-loads {@link corePlugin} so built-in datatypes and actions are always available. */ - constructor() { - this.#registry.load(corePlugin); - } - - /** Snapshot of every registered action, provider, stream, loader, and datatype. */ - get schema(): RegistrySchema { - return this.#registry.schema; - } - - /** - * Register a plugin's providers, actions, streams, loaders, and datatypes. - * - * Plugins are registered under their `id`; duplicate IDs throw a - * {@link ValidationError}. Returns `this` to allow fluent chaining. - * - * @param plugin - Plugin instance produced by `Plugin.define(…)`. - */ - register(plugin: PluginInstance): this { - this.#registry.load(plugin); - return this; - } - - /** - * Validate a graph definition and connections without executing. - * - * Performs three layers of validation: - * 1. **Connection shape** — each entry matches {@link ConnectionSchema}. - * 2. **Graph structure** — JSON parsing, cycle detection, dangling - * edges, and name resolution against the registry. - * 3. **Credential validation** — each connection's credentials are - * checked against the provider's Zod schema. - * - * All errors are collected; the method never throws. - */ - validate(graph: unknown, connections: Connections): ValidationResult { - const errors: string[] = []; - - const shapeResult = ConnectionsSchema.safeParse(connections); - if (!shapeResult.success) { - errors.push(...shapeResult.error.issues.map((i) => i.message)); - } - - let plan: ExecutionPlan | null = null; - try { - plan = compile(graph, this.#registry); - } catch (e) { - errors.push(e instanceof Error ? e.message : String(e)); - } - - if (plan) { - try { - validateConnections(plan, connections); - } catch (e) { - // biome-ignore lint/complexity/useLiteralKeys: index signature requires bracket access - if (e instanceof ValidationError && e.details?.["errors"]) { - // biome-ignore lint/complexity/useLiteralKeys: index signature requires bracket access - errors.push(...(e.details["errors"] as string[])); - } else { - errors.push(e instanceof Error ? e.message : String(e)); - } - } - } - - return { valid: errors.length === 0, errors }; - } - - /** - * Execute a graph in the background. - * - * Compiles and validates the graph, then hands it to the - * {@link RunManager} for asynchronous execution. Returns - * immediately with a `runId` for polling progress via - * {@link getRun} or cancelling via {@link cancelRun}. - * - * @param graph - Raw graph definition (validated and compiled internally). - * @param connections - Connection credentials keyed by UUID. - * @param options - Optional abort signal and context-update callback. - * @returns Unique run ID (UUID). - * @throws {ValidationError} If graph or connections fail validation. 
- */ - execute( - graph: unknown, - connections: Connections, - options?: ExecuteOptions, - ): string { - const plan = this.#compile(graph, connections); - return this.#runs.submit({ - runId: crypto.randomUUID(), - plan, - connections, - registry: this.#registry, - executor: execute, - ...(options && { options }), - }); - } - - /** - * Execute a graph and await the result. - * - * Unlike {@link execute}, this method resolves only when the entire - * graph has finished (or an abort signal fires). Use this for - * scripting, tests, or any context where you need the result inline. - * - * @param graph - Raw graph definition (validated and compiled internally). - * @param connections - Connection credentials keyed by UUID. - * @param options - Optional abort signal and context-update callback. - * @throws {ValidationError} If graph or connections fail validation. - * @throws {CancellationError} If execution is aborted. - */ - async executeSync( - graph: unknown, - connections: Connections, - options?: ExecuteOptions, - ): Promise<RunResult> { - const plan = this.#compile(graph, connections); - return execute(plan, connections, this.#registry, options); - } - - /** - * Get the current state of a run. - * - * Returns per-node progress, overall status, and (once finished) the - * final {@link RunResult}. Returns `undefined` if the run ID is unknown - * or has already been cleaned up (see {@link RunManager} TTL). - */ - getRun(runId: string): RunState | undefined { - return this.#runs.get(runId); - } - - /** - * List all tracked runs, optionally filtered by status. - * - * @param status - If provided, only runs in this lifecycle phase are returned. - */ - listRuns(status?: RunStatus): RunSummary[] { - return this.#runs.list(status); - } - - /** - * Request cancellation of a running or pending execution. - * - * Signals the run's internal {@link AbortController}; nodes that - * have already completed are unaffected. Returns `false` if the - * run was not found or already finished. - */ - cancelRun(runId: string): boolean { - return this.#runs.cancel(runId); - } - - #compile(graph: unknown, connections: Connections): ExecutionPlan { - const validation = this.validate(graph, connections); - if (!validation.valid) { - throw new ValidationError( - `Graph validation failed: ${validation.errors.join("; ")}`, - { - source: "engine", - retryable: false, - details: { errors: validation.errors }, - }, - ); - } - return compile(graph, this.#registry); - } -} diff --git a/packages/nvisy-runtime/src/engine/executor.ts b/packages/nvisy-runtime/src/engine/executor.ts deleted file mode 100644 index 3d9332c..0000000 --- a/packages/nvisy-runtime/src/engine/executor.ts +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Graph execution orchestration. - * - * Executes compiled graphs using Effection for structured concurrency. - * Nodes are spawned as concurrent tasks that communicate via edge queues. - * - * The execution model: - * 1. All nodes are spawned concurrently in topological order - * 2. Each node waits for its dependencies (incoming edges) to complete - * 3. Data flows through edges via backpressure-aware queues - * 4. 
Node failures are isolated - other branches can continue - */ - -import { getLogger } from "@logtape/logtape"; -import { CancellationError } from "@nvisy/core"; -import { - run as effectionRun, - type Operation, - spawn, - type WithResolvers, - withResolvers, -} from "effection"; -import type { ExecutionPlan } from "../compiler/index.js"; -import type { Registry } from "../registry.js"; -import { createLoaderCache } from "./bridge.js"; -import type { Connections, ValidatedConnection } from "./connections.js"; -import { validateConnections } from "./connections.js"; -import { buildEdges, createContext, type ExecutionContext } from "./context.js"; -import { executeNode, type NodeResult } from "./nodes.js"; - -const logger = getLogger(["nvisy", "executor"]); - -/** Options for controlling graph execution behaviour. */ -export interface ExecuteOptions { - /** External abort signal; when fired, Effection halts all spawned tasks. */ - readonly signal?: AbortSignal; - /** - * Callback invoked after each source item is read, providing the - * resumption context for crash-recovery persistence. - */ - readonly onContextUpdate?: ( - nodeId: string, - connectionId: string, - context: unknown, - ) => void; -} - -/** - * Result of executing a complete graph. - * - * `status` is derived from per-node outcomes: - * - `"success"` — every node succeeded. - * - `"partial_failure"` — at least one node failed but others succeeded. - * - `"failure"` — every node failed. - */ -export interface RunResult { - readonly runId: string; - readonly status: "success" | "partial_failure" | "failure"; - readonly nodes: ReadonlyArray<NodeResult>; -} - -/** - * Spawn a node as a concurrent task. - * - * Waits for all dependencies to complete before executing. - * Closes outgoing edge queues when done (success or failure). - */ -function* spawnNode( - ctx: ExecutionContext, - nodeId: string, - completions: ReadonlyMap<string, WithResolvers<NodeResult>>, -): Operation<void> { - const deps = ctx.plan.graph.inNeighbors(nodeId); - const completion = completions.get(nodeId); - if (!completion) return; - - const outEdges = ctx.outEdges.get(nodeId) ?? []; - - yield* spawn(function* () { - for (const dep of deps) { - const depCompletion = completions.get(dep); - if (depCompletion) yield* depCompletion.operation; - } - - try { - const result = yield* executeNode(ctx, nodeId); - completion.resolve(result); - } catch (error) { - logger.error("Node {nodeId} failed: {error}", { - nodeId, - error: error instanceof Error ? error.message : String(error), - }); - completion.resolve({ - nodeId, - status: "failure", - error: error instanceof Error ? error : new Error(String(error)), - itemsProcessed: 0, - }); - } finally { - for (const edge of outEdges) { - edge.queue.close(); - } - } - }); -} - -/** - * Execute a graph within Effection structured concurrency. - * - * Spawns all nodes concurrently and collects results. - * Determines overall status based on individual node results. 
- */ -function* runGraph( - plan: ExecutionPlan, - validatedConnections: Map<string, ValidatedConnection>, - registry: Registry, - options?: ExecuteOptions, -): Operation<RunResult> { - const runId = crypto.randomUUID(); - logger.info("Run {runId} started ({count} nodes)", { - runId, - count: plan.order.length, - }); - - const { inEdges, outEdges } = buildEdges(plan); - const completions = new Map<string, WithResolvers<NodeResult>>(); - for (const id of plan.order) { - completions.set(id, withResolvers<NodeResult>()); - } - - const ctx = createContext({ - runId, - plan, - connections: validatedConnections, - inEdges, - outEdges, - registry, - loaderCache: createLoaderCache(), - ...(options && { options }), - }); - - for (const id of plan.order) { - yield* spawnNode(ctx, id, completions); - } - - const results: NodeResult[] = []; - for (const id of plan.order) { - const completion = completions.get(id); - if (completion) results.push(yield* completion.operation); - } - - const hasFailure = results.some((r) => r.status === "failure"); - const allFailure = results.every((r) => r.status === "failure"); - const status = allFailure - ? "failure" - : hasFailure - ? "partial_failure" - : "success"; - - logger.info("Run {runId} completed ({status})", { runId, status }); - return { runId, status, nodes: results }; -} - -/** - * Execute a compiled execution plan. - * - * This is the main entry point for graph execution. It: - * 1. Validates all connections upfront - * 2. Runs the graph using Effection structured concurrency - * 3. Handles cancellation via AbortSignal - */ -export async function execute( - plan: ExecutionPlan, - connections: Connections, - registry: Registry, - options?: ExecuteOptions, -): Promise<RunResult> { - const signal = options?.signal; - - if (signal?.aborted) { - throw new CancellationError("Execution cancelled"); - } - - const validatedConnections = validateConnections(plan, connections); - - const task = effectionRun(() => - runGraph(plan, validatedConnections, registry, options), - ); - - if (!signal) { - return task; - } - - const onAbort = () => void task.halt(); - signal.addEventListener("abort", onAbort, { once: true }); - - try { - return await task; - } finally { - signal.removeEventListener("abort", onAbort); - } -} diff --git a/packages/nvisy-runtime/src/engine/index.ts b/packages/nvisy-runtime/src/engine/index.ts deleted file mode 100644 index c044c87..0000000 --- a/packages/nvisy-runtime/src/engine/index.ts +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Engine module public API. - * - * Re-exports the {@link Engine} class and all supporting types that - * consumers need for graph registration, validation, execution, - * and run monitoring. - * - * @module - */ - -export type { - ActionDescriptor, - ProviderDescriptor, - RegistrySchema, -} from "../registry.js"; -export type { Connection, Connections } from "./connections.js"; -export { Engine, type ValidationResult } from "./engine.js"; -export type { ExecuteOptions, RunResult } from "./executor.js"; -export type { NodeResult } from "./nodes.js"; -export type { NodeProgress, RunState, RunStatus, RunSummary } from "./runs.js"; diff --git a/packages/nvisy-runtime/src/engine/nodes.ts b/packages/nvisy-runtime/src/engine/nodes.ts deleted file mode 100644 index 41eb358..0000000 --- a/packages/nvisy-runtime/src/engine/nodes.ts +++ /dev/null @@ -1,325 +0,0 @@ -/** - * Node execution logic for source, action, and target nodes. 
- * - * Each of the three node types has a dedicated executor: - * - * - **Source** — connects to a provider, reads items via a stream - * source, and pushes each item to all outgoing edges. Emits - * resumption-context callbacks after every item for crash recovery. - * - **Action** — drains incoming edge queues (with automatic - * Blob → Document bridging), pipes them through the action's - * transform, and writes results to outgoing edges. Optionally - * connects to a provider when the action requires a client. - * - **Target** — connects to a provider and writes each incoming - * item via the stream's writer function. - * - * All executors are wrapped by {@link withRetry} and - * {@link withTimeout} policies before being spawned by the executor. - * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import type { Data } from "@nvisy/core"; -import { TimeoutError, ValidationError } from "@nvisy/core"; -import { call, type Operation, spawn } from "effection"; -import type { - ResolvedActionNode, - ResolvedNode, - ResolvedSourceNode, - ResolvedTargetNode, -} from "../compiler/plan.js"; -import type { GraphNode } from "../schema.js"; -import { applyLoaderBridge } from "./bridge.js"; -import type { Edge, ExecutionContext } from "./context.js"; -import { withRetry, withTimeout } from "./policies.js"; - -const logger = getLogger(["nvisy", "nodes"]); - -/** - * Result of executing a single node. - * - * Collected by the executor after each node completes (or fails) - * and aggregated into the overall {@link RunResult}. - */ -export interface NodeResult { - readonly nodeId: string; - readonly status: "success" | "failure" | "skipped"; - /** Present only when `status` is `"failure"`. */ - readonly error?: Error; - /** Number of data items that flowed through this node. */ - readonly itemsProcessed: number; -} - -/** Validate parameters against a Zod-like schema. */ -function validateParams<T>( - schema: { - safeParse: ( - v: unknown, - ) => - | { success: true; data: T } - | { success: false; error: { message: string } }; - }, - params: unknown, - nodeId: string, -): T { - const result = schema.safeParse(params); - if (!result.success) { - throw new ValidationError( - `Invalid params for node ${nodeId}: ${result.error.message}`, - { source: "engine", retryable: false }, - ); - } - return result.data; -} - -/** - * Convert edge queues to a ReadableStream. - * - * Spawns a background task that drains each edge queue sequentially - * and writes items to a TransformStream. The readable side is returned - * for consumption by action nodes. - */ -function* edgesToStream(edges: Edge[]): Operation<ReadableStream<Data>> { - const { readable, writable } = new TransformStream<Data>(); - - yield* spawn(function* () { - const writer = writable.getWriter(); - try { - for (const edge of edges) { - for ( - let next = yield* edge.queue.next(); - !next.done; - next = yield* edge.queue.next() - ) { - yield* call(() => writer.write(next.value)); - } - } - } finally { - yield* call(() => writer.close()); - } - }); - - return readable; -} - -/** - * Execute a source node: read data from an external system. - * - * Connects to the provider, reads items via the stream, and pushes - * each item to all outgoing edges. Emits context updates for - * crash recovery. 
- */ -function* executeSource( - ctx: ExecutionContext, - node: GraphNode, - resolved: ResolvedSourceNode, - outEdges: Edge[], -): Operation<number> { - const conn = ctx.getConnection(node.id); - const params = validateParams( - resolved.stream.paramSchema, - resolved.params, - node.id, - ); - - const instance = yield* call(() => conn.provider.connect(conn.credentials)); - let count = 0; - - try { - const initialCtx = resolved.stream.contextSchema.parse(conn.context ?? {}); - const readable = resolved.stream.read(instance.client, initialCtx, params); - - yield* call(async () => { - for await (const resumable of readable) { - for (const edge of outEdges) { - edge.queue.add(resumable.data); - } - count++; - ctx.options?.onContextUpdate?.( - node.id, - resolved.connection, - resumable.context, - ); - } - }); - } finally { - yield* call(() => instance.disconnect()); - } - - return count; -} - -/** - * Execute an action node: transform data through a processing function. - * - * Optionally connects to a provider if the action requires a client. - * Reads from incoming edges, applies the transformation, and writes - * to outgoing edges. - */ -function* executeAction( - ctx: ExecutionContext, - node: GraphNode, - resolved: ResolvedActionNode, - inEdges: Edge[], - outEdges: Edge[], -): Operation<number> { - const params = validateParams( - resolved.action.schema, - resolved.params, - node.id, - ); - - let client: unknown; - let disconnect: (() => Promise<void>) | undefined; - - if (resolved.provider && resolved.connection) { - const conn = ctx.connections.get(resolved.connection); - if (!conn) { - throw new ValidationError( - `Connection not found: ${resolved.connection}`, - { source: "engine", retryable: false }, - ); - } - const instance = yield* call(() => conn.provider.connect(conn.credentials)); - client = instance.client; - disconnect = () => instance.disconnect(); - - if ( - resolved.action.clientClass && - !(client instanceof resolved.action.clientClass) - ) { - throw new ValidationError( - `Provider "${resolved.provider.id}" client is not compatible with action "${resolved.action.id}"`, - { source: "engine", retryable: false }, - ); - } - } - - try { - const rawInputStream = yield* edgesToStream(inEdges); - const inputStream = applyLoaderBridge( - rawInputStream, - ctx.registry, - ctx.loaderCache, - ); - const outputStream = resolved.action.pipe(inputStream, params, client); - let count = 0; - - yield* call(async () => { - for await (const item of outputStream) { - for (const edge of outEdges) { - edge.queue.add(item); - } - count++; - } - }); - - return count; - } finally { - if (disconnect) yield* call(disconnect); - } -} - -/** - * Execute a target node: write data to an external system. - * - * Connects to the provider and writes each item from incoming edges - * using the stream's writer function. 
- */ -function* executeTarget( - ctx: ExecutionContext, - node: GraphNode, - resolved: ResolvedTargetNode, - inEdges: Edge[], -): Operation<number> { - const conn = ctx.getConnection(node.id); - const params = validateParams( - resolved.stream.paramSchema, - resolved.params, - node.id, - ); - - const instance = yield* call(() => conn.provider.connect(conn.credentials)); - let count = 0; - - try { - const writeFn = resolved.stream.write(instance.client, params); - for (const edge of inEdges) { - for ( - let next = yield* edge.queue.next(); - !next.done; - next = yield* edge.queue.next() - ) { - yield* call(() => writeFn(next.value)); - count++; - } - } - } finally { - yield* call(() => instance.disconnect()); - } - - return count; -} - -/** Dispatch to the appropriate node executor based on type. */ -function* dispatchNode( - ctx: ExecutionContext, - node: GraphNode, - resolved: ResolvedNode, - inEdges: Edge[], - outEdges: Edge[], -): Operation<number> { - switch (resolved.type) { - case "source": - return yield* executeSource(ctx, node, resolved, outEdges); - case "action": - return yield* executeAction(ctx, node, resolved, inEdges, outEdges); - case "target": - return yield* executeTarget(ctx, node, resolved, inEdges); - } -} - -/** - * Execute a single node with retry and timeout policies. - * - * Wraps the node execution with configurable retry logic and timeout. - * Returns a NodeResult indicating success or failure. - */ -export function* executeNode( - ctx: ExecutionContext, - nodeId: string, -): Operation<NodeResult> { - const node = ctx.getNode(nodeId); - const resolved = ctx.getResolved(nodeId); - const inEdges = ctx.inEdges.get(nodeId) ?? []; - const outEdges = ctx.outEdges.get(nodeId) ?? []; - - logger.debug("Executing node {nodeId} ({type})", { - nodeId, - type: resolved.type, - }); - - function* base(): Operation<NodeResult> { - const count = yield* dispatchNode(ctx, node, resolved, inEdges, outEdges); - - logger.debug("Node {nodeId} completed ({count} items)", { nodeId, count }); - return { nodeId, status: "success", itemsProcessed: count }; - } - - const timeoutMs = node.timeout?.nodeTimeoutMs; - const timeoutFallback: NodeResult = { - nodeId, - status: "failure", - error: new TimeoutError(`Node ${nodeId} timed out after ${timeoutMs}ms`, { - source: "engine", - }), - itemsProcessed: 0, - }; - - return yield* withTimeout( - () => withRetry(base, node.retry, nodeId), - timeoutMs, - timeoutFallback, - ); -} diff --git a/packages/nvisy-runtime/src/engine/policies.ts b/packages/nvisy-runtime/src/engine/policies.ts deleted file mode 100644 index fe4a0ed..0000000 --- a/packages/nvisy-runtime/src/engine/policies.ts +++ /dev/null @@ -1,150 +0,0 @@ -/** - * Execution policies for retry and timeout handling. - * - * Both policies wrap Effection {@link Operation}s and compose freely: - * - * - {@link withRetry} — retries an operation up to `maxRetries` times - * using fixed, exponential, or jittered backoff. Non-retryable - * {@link RuntimeError}s bypass retry and propagate immediately. - * - {@link withTimeout} — races an operation against an Effection - * `sleep` timer; if the timer wins, the operation is cancelled and - * a caller-supplied fallback value is returned. - * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import { RuntimeError } from "@nvisy/core"; -import { type Operation, race, sleep } from "effection"; -import type { RetryPolicy } from "../schema.js"; - -const logger = getLogger(["nvisy", "engine"]); - -/** - * Wrap an operation with retry logic. 
- * - * Retries are only attempted for errors that are marked as retryable. - * Non-retryable errors (RuntimeError with `retryable: false`) are - * thrown immediately without retry. - * - * @param fn - Operation factory to retry. - * @param policy - Retry configuration (maxRetries, backoff strategy, delays). - * @param nodeId - Node ID for logging. - * @returns The operation result if successful. - * @throws The last error if all retries are exhausted. - * - * @example - * ```ts - * const result = yield* withRetry( - * () => fetchData(), - * { maxRetries: 3, backoff: "exponential", initialDelayMs: 100, maxDelayMs: 5000 }, - * "node-123" - * ); - * ``` - */ -export function* withRetry<T>( - fn: () => Operation<T>, - policy: RetryPolicy | undefined, - nodeId: string, -): Operation<T> { - if (!policy) return yield* fn(); - - const { maxRetries, backoff, initialDelayMs, maxDelayMs } = policy; - let lastError: unknown; - - for (let attempt = 0; attempt <= maxRetries; attempt++) { - try { - return yield* fn(); - } catch (error) { - lastError = error; - - // Non-retryable errors fail immediately - if (error instanceof RuntimeError && error.retryable === false) { - throw error; - } - - logger.warn("Node {nodeId} attempt {attempt} failed: {error}", { - nodeId, - attempt: attempt + 1, - maxRetries, - error: error instanceof Error ? error.message : String(error), - }); - - if (attempt < maxRetries) { - const delay = computeDelay( - backoff, - initialDelayMs, - maxDelayMs, - attempt, - ); - yield* sleep(delay); - } - } - } - - throw lastError; -} - -/** - * Compute delay for a retry attempt based on backoff strategy. - * - * Strategies: - * - `fixed`: Always use initialDelayMs - * - `exponential`: Double the delay each attempt (capped at maxDelayMs) - * - `jitter`: Random delay between 0 and exponential delay (good for avoiding thundering herd) - */ -function computeDelay( - backoff: RetryPolicy["backoff"], - initialDelayMs: number, - maxDelayMs: number, - attempt: number, -): number { - const exponentialDelay = Math.min(initialDelayMs * 2 ** attempt, maxDelayMs); - switch (backoff) { - case "fixed": - return initialDelayMs; - case "exponential": - return exponentialDelay; - case "jitter": - // Full jitter: random value between 0 and the exponential delay - // This provides good collision avoidance while maintaining backoff growth - return Math.floor(Math.random() * exponentialDelay); - } -} - -/** - * Wrap an operation with a timeout. - * - * Uses Effection's `race` to run the operation against a timer. - * If the timeout expires first, returns the fallback value. - * The original operation is automatically cancelled by Effection. - * - * @param fn - Operation factory to execute. - * @param timeoutMs - Maximum time to wait (undefined = no timeout). - * @param fallback - Value to return if timeout expires. - * @returns The operation result or fallback. 
- * - * @example - * ```ts - * const result = yield* withTimeout( - * () => slowOperation(), - * 5000, - * { status: "timeout" } - * ); - * ``` - */ -export function* withTimeout<T>( - fn: () => Operation<T>, - timeoutMs: number | undefined, - fallback: T, -): Operation<T> { - if (!timeoutMs) return yield* fn(); - - return yield* race([ - fn(), - (function* (): Operation<T> { - yield* sleep(timeoutMs); - return fallback; - })(), - ]); -} diff --git a/packages/nvisy-runtime/src/engine/runs.ts b/packages/nvisy-runtime/src/engine/runs.ts deleted file mode 100644 index 05201ac..0000000 --- a/packages/nvisy-runtime/src/engine/runs.ts +++ /dev/null @@ -1,305 +0,0 @@ -/** - * Run management for background graph executions. - * - * A "run" is a single execution of a compiled graph. The - * {@link RunManager} tracks every run through its lifecycle - * (`pending → running → completed | failed | cancelled`), - * exposes per-node progress for monitoring, supports mid-flight - * cancellation via {@link AbortController}, and automatically - * evicts finished runs after a configurable TTL. - * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import type { ExecutionPlan } from "../compiler/index.js"; -import type { Registry } from "../registry.js"; -import type { Connections } from "./connections.js"; -import type { ExecuteOptions, RunResult } from "./executor.js"; -import type { NodeResult } from "./nodes.js"; - -const logger = getLogger(["nvisy", "runs"]); - -/** - * Lifecycle status of a background execution run. - * - * Transitions: pending → running → completed | failed | cancelled - */ -export type RunStatus = - | "pending" - | "running" - | "completed" - | "failed" - | "cancelled"; - -/** - * Progress of a single node within a run. - * - * Updated as items flow through the node; `itemsProcessed` is - * incremented each time a context-update callback fires. - */ -export interface NodeProgress { - readonly nodeId: string; - readonly status: "pending" | "running" | "completed" | "failed"; - /** Number of data items the node has processed so far. */ - readonly itemsProcessed: number; - /** Present only when `status` is `"failed"`. */ - readonly error?: Error; -} - -/** - * Complete state of an execution run. - * - * Includes per-node progress for monitoring long-running executions. - */ -export interface RunState { - readonly runId: string; - readonly status: RunStatus; - readonly startedAt: Date; - readonly completedAt?: Date; - readonly nodeProgress: ReadonlyMap<string, NodeProgress>; - readonly result?: RunResult; - readonly error?: Error; -} - -/** Summary of a run for listing (without full progress details). */ -export interface RunSummary { - readonly runId: string; - readonly status: RunStatus; - readonly startedAt: Date; - readonly completedAt?: Date; -} - -/** Function signature for executing a compiled plan (injected into {@link SubmitConfig}). */ -export type PlanExecutor = ( - plan: ExecutionPlan, - connections: Connections, - registry: Registry, - options?: ExecuteOptions, -) => Promise<RunResult>; - -/** Configuration for submitting a graph execution to the {@link RunManager}. 
*/ -export interface SubmitConfig { - readonly runId: string; - readonly plan: ExecutionPlan; - readonly connections: Connections; - readonly registry: Registry; - readonly executor: PlanExecutor; - readonly options?: ExecuteOptions; -} - -interface MutableRun { - runId: string; - status: RunStatus; - startedAt: Date; - completedAt: Date | null; - nodeProgress: Map<string, NodeProgress>; - result: RunResult | null; - error: Error | null; - abort: AbortController; -} - -function createRunState(run: MutableRun): RunState { - return { - runId: run.runId, - status: run.status, - startedAt: run.startedAt, - nodeProgress: new Map(run.nodeProgress), - ...(run.completedAt && { completedAt: run.completedAt }), - ...(run.result && { result: run.result }), - ...(run.error && { error: run.error }), - }; -} - -function createRunSummary(run: MutableRun): RunSummary { - return { - runId: run.runId, - status: run.status, - startedAt: run.startedAt, - ...(run.completedAt && { completedAt: run.completedAt }), - }; -} - -function createNodeProgress(nodeId: string, result?: NodeResult): NodeProgress { - return { - nodeId, - status: result - ? result.status === "success" - ? "completed" - : "failed" - : "pending", - itemsProcessed: result?.itemsProcessed ?? 0, - ...(result?.error && { error: result.error }), - }; -} - -/** - * Manages in-flight and recently completed graph executions. - * - * @example - * ```ts - * const manager = new RunManager({ ttlMs: 5 * 60 * 1000 }); - * const runId = manager.submit({ runId: id, plan, connections, registry, executor }); - * - * const state = manager.get(runId); - * console.log(state?.status, state?.nodeProgress); - * - * manager.cancel(runId); - * ``` - */ -export class RunManager { - readonly #runs = new Map<string, MutableRun>(); - readonly #ttlMs: number; - - constructor(options?: { ttlMs?: number }) { - this.#ttlMs = options?.ttlMs ?? 5 * 60 * 1000; - } - - /** - * Submit a graph for background execution. - * - * Starts execution immediately and returns the run ID. - * Use {@link get} to monitor progress or {@link cancel} to abort. - */ - submit(config: SubmitConfig): string { - const { runId, plan, connections, registry, executor, options } = config; - - const run: MutableRun = { - runId, - status: "pending", - startedAt: new Date(), - completedAt: null, - nodeProgress: new Map( - plan.order.map((id) => [id, createNodeProgress(id)]), - ), - result: null, - error: null, - abort: new AbortController(), - }; - - this.#runs.set(runId, run); - logger.info("Run submitted: {runId}", { runId }); - - this.#executeInBackground(run, { - plan, - connections, - registry, - executor, - ...(options && { options }), - }); - - return runId; - } - - /** Get the current state of a run. */ - get(runId: string): RunState | undefined { - const run = this.#runs.get(runId); - return run ? createRunState(run) : undefined; - } - - /** List all runs, optionally filtered by status. */ - list(status?: RunStatus): RunSummary[] { - const summaries: RunSummary[] = []; - for (const run of this.#runs.values()) { - if (!status || run.status === status) { - summaries.push(createRunSummary(run)); - } - } - return summaries; - } - - /** - * Request cancellation of a running execution. - * - * @returns True if cancellation was requested, false if run not found or already completed. 
- */ - cancel(runId: string): boolean { - const run = this.#runs.get(runId); - if (!run || (run.status !== "pending" && run.status !== "running")) { - return false; - } - - run.abort.abort(); - logger.info("Run cancellation requested: {runId}", { runId }); - return true; - } - - /** Check if a run exists. */ - has(runId: string): boolean { - return this.#runs.has(runId); - } - - async #executeInBackground( - run: MutableRun, - config: Omit<SubmitConfig, "runId">, - ): Promise<void> { - const { plan, connections, registry, executor, options } = config; - - run.status = "running"; - logger.info("Run started: {runId}", { runId: run.runId }); - - const signal = options?.signal - ? AbortSignal.any([options.signal, run.abort.signal]) - : run.abort.signal; - - try { - const result = await executor(plan, connections, registry, { - ...options, - signal, - onContextUpdate: (nodeId, connectionId, context) => { - const progress = run.nodeProgress.get(nodeId); - if (progress) { - run.nodeProgress.set(nodeId, { - ...progress, - status: "running", - itemsProcessed: progress.itemsProcessed + 1, - }); - } - options?.onContextUpdate?.(nodeId, connectionId, context); - }, - }); - - run.status = "completed"; - run.completedAt = new Date(); - run.result = result; - - for (const nodeResult of result.nodes) { - run.nodeProgress.set( - nodeResult.nodeId, - createNodeProgress(nodeResult.nodeId, nodeResult), - ); - } - - logger.info("Run completed: {runId} (status={status})", { - runId: run.runId, - status: result.status, - }); - } catch (error) { - run.completedAt = new Date(); - - if (run.abort.signal.aborted) { - run.status = "cancelled"; - logger.info("Run cancelled: {runId}", { runId: run.runId }); - } else { - run.status = "failed"; - run.error = error instanceof Error ? error : new Error(String(error)); - logger.error("Run failed: {runId} (error={error})", { - runId: run.runId, - error: run.error.message, - }); - } - } - - this.#scheduleCleanup(run.runId); - } - - #scheduleCleanup(runId: string): void { - setTimeout(() => { - const run = this.#runs.get(runId); - if (run?.completedAt) { - this.#runs.delete(runId); - logger.debug("Run cleaned up: {runId}", { runId }); - } - }, this.#ttlMs); - } -} diff --git a/packages/nvisy-runtime/src/index.ts b/packages/nvisy-runtime/src/index.ts deleted file mode 100644 index cd75c6e..0000000 --- a/packages/nvisy-runtime/src/index.ts +++ /dev/null @@ -1,37 +0,0 @@ -/** - * @module @nvisy/runtime - * - * Pipeline execution engine for the Nvisy runtime. - * - * Compiles graph definitions into execution plans, validates connections, - * and runs pipelines with retry, timeout, and cancellation support. 
- */ - -export type { - ActionDescriptor, - Connection, - Connections, - ExecuteOptions, - NodeProgress, - NodeResult, - ProviderDescriptor, - RegistrySchema, - RunResult, - RunState, - RunStatus, - RunSummary, - ValidationResult, -} from "./engine/index.js"; -export { Engine } from "./engine/index.js"; -export type { - ActionNode, - BackoffStrategy, - ConcurrencyPolicy, - Graph, - GraphEdge, - GraphNode, - RetryPolicy, - SourceNode, - TargetNode, - TimeoutPolicy, -} from "./schema.js"; diff --git a/packages/nvisy-runtime/src/registry.ts b/packages/nvisy-runtime/src/registry.ts deleted file mode 100644 index 08ffe14..0000000 --- a/packages/nvisy-runtime/src/registry.ts +++ /dev/null @@ -1,236 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import type { - AnyActionInstance, - AnyLoaderInstance, - AnyProviderFactory, - AnyStreamSource, - AnyStreamTarget, - DatatypeDescriptor, - PluginInstance, -} from "@nvisy/core"; -import { type Blob, ValidationError } from "@nvisy/core"; - -const logger = getLogger(["nvisy", "registry"]); - -/** Describes a single registered action for schema generation. */ -export interface ActionDescriptor { - readonly name: string; - readonly configSchema: AnyActionInstance["schema"]; -} - -/** Describes a single registered provider for schema generation. */ -export interface ProviderDescriptor { - readonly name: string; - readonly credentialSchema: AnyProviderFactory["credentialSchema"]; -} - -/** - * Complete snapshot of everything currently registered, - * suitable for generating an OpenAPI spec alongside the static - * graph/node/edge schemas. - */ -export interface RegistrySchema { - readonly actions: ReadonlyArray<ActionDescriptor>; - readonly providers: ReadonlyArray<ProviderDescriptor>; - readonly streams: number; - readonly loaders: number; - readonly datatypes: number; -} - -/** - * Unified registry that stores providers, actions, and loaders contributed by - * {@link PluginInstance} objects. - * - * All entries are keyed as `"pluginId/name"`. - */ -export class Registry { - readonly #actions = new Map<string, AnyActionInstance>(); - readonly #loaders = new Map<string, AnyLoaderInstance>(); - readonly #providers = new Map<string, AnyProviderFactory>(); - readonly #streams = new Map<string, AnyStreamSource | AnyStreamTarget>(); - readonly #datatypes = new Map<string, DatatypeDescriptor>(); - readonly #plugins = new Set<string>(); - - /** Snapshot of all registered actions and providers with their schemas. */ - get schema(): RegistrySchema { - const actions: ActionDescriptor[] = []; - for (const [name, action] of this.#actions) { - actions.push({ name, configSchema: action.schema }); - } - - const providers: ProviderDescriptor[] = []; - for (const [name, factory] of this.#providers) { - providers.push({ name, credentialSchema: factory.credentialSchema }); - } - - return { - actions, - providers, - streams: this.#streams.size, - loaders: this.#loaders.size, - datatypes: this.#datatypes.size, - }; - } - - /** Load all providers, actions, loaders, and streams declared by a plugin. 
*/ - load(plugin: PluginInstance): void { - if (this.#plugins.has(plugin.id)) { - throw new ValidationError(`Plugin already loaded: ${plugin.id}`, { - source: "registry", - retryable: false, - details: { pluginId: plugin.id }, - }); - } - - const maps = [ - ["provider", this.#providers, plugin.providers], - ["action", this.#actions, plugin.actions], - ["loader", this.#loaders, plugin.loaders], - ["stream", this.#streams, plugin.streams], - ["datatype", this.#datatypes, plugin.datatypes], - ] as const; - - // Check for collisions across all maps - const collisions: string[] = []; - for (const [kind, map, entries] of maps) { - for (const name of Object.keys(entries)) { - const key = `${plugin.id}/${name}`; - if (map.has(key)) { - collisions.push(`${kind} "${key}"`); - } - } - } - - if (collisions.length > 0) { - logger.error( - "Registry collision loading plugin {pluginId}: {collisions}", - { pluginId: plugin.id, collisions: collisions.join(", ") }, - ); - throw new ValidationError( - `Registry collision: ${collisions.join(", ")}`, - { - source: "registry", - retryable: false, - details: { pluginId: plugin.id, collisions }, - }, - ); - } - - // Register all entries - this.#plugins.add(plugin.id); - const loaded: Record<string, string[]> = {}; - for (const [kind, map, entries] of maps) { - const names: string[] = []; - for (const [name, value] of Object.entries(entries)) { - (map as Map<string, unknown>).set(`${plugin.id}/${name}`, value); - names.push(name); - } - loaded[kind] = names; - } - - const counts: Record<string, number> = {}; - for (const [kind, names] of Object.entries(loaded)) { - if (names.length > 0) { - counts[`${kind}s`] = names.length; - } - } - logger.debug("Plugin loaded: {plugin}", { plugin: plugin.id, ...counts }); - } - - /** Look up an action by name. */ - getAction(name: string): AnyActionInstance { - return this.#getOrThrow(this.#actions, name, "action"); - } - - /** Look up a provider factory by name. */ - getProvider(name: string): AnyProviderFactory { - return this.#getOrThrow(this.#providers, name, "provider"); - } - - /** Look up a stream by name. */ - getStream(name: string): AnyStreamSource | AnyStreamTarget { - return this.#getOrThrow(this.#streams, name, "stream"); - } - - /** Look up a loader by name. */ - getLoader(name: string): AnyLoaderInstance { - return this.#getOrThrow(this.#loaders, name, "loader"); - } - - /** Look up a data type by name. */ - getDataType(name: string): DatatypeDescriptor { - return this.#getOrThrow(this.#datatypes, name, "datatype"); - } - - /** Look up an action by name, returning undefined if not found. */ - findAction(name: string): AnyActionInstance | undefined { - return this.#actions.get(name); - } - - /** Look up a loader by name, returning undefined if not found. */ - findLoader(name: string): AnyLoaderInstance | undefined { - return this.#loaders.get(name); - } - - /** Look up a provider factory by name, returning undefined if not found. */ - findProvider(name: string): AnyProviderFactory | undefined { - return this.#providers.get(name); - } - - /** Look up a stream by name, returning undefined if not found. */ - findStream(name: string): (AnyStreamSource | AnyStreamTarget) | undefined { - return this.#streams.get(name); - } - - /** Look up a data type by name, returning undefined if not found. */ - findDataType(name: string): DatatypeDescriptor | undefined { - return this.#datatypes.get(name); - } - - /** - * Find a loader that matches the given blob by content type, magic bytes, or extension. 
- * - * Matching priority: - * 1. If blob has provided MIME (contentType), match by contentType first - * 2. Match by identified (magic bytes) extension - * 3. Fall back to provided extension from blob path - */ - findLoaderForBlob(blob: Blob): AnyLoaderInstance | undefined { - const provided = blob.provided; - if (provided.mime) { - for (const loader of this.#loaders.values()) { - if (loader.contentTypes.includes(provided.mime)) { - return loader; - } - } - } - - const identified = blob.identified; - if (identified.extension) { - for (const loader of this.#loaders.values()) { - if (loader.extensions.includes(identified.extension)) { - return loader; - } - } - } - - if (provided.extension) { - for (const loader of this.#loaders.values()) { - if (loader.extensions.includes(provided.extension)) { - return loader; - } - } - } - - return undefined; - } - - #getOrThrow<T>(map: Map<string, T>, name: string, kind: string): T { - const entry = map.get(name); - if (!entry) { - logger.warn(`${kind} not found: ${name}`, { [kind]: name }); - throw ValidationError.notFound(name, kind, "registry"); - } - return entry; - } -} diff --git a/packages/nvisy-runtime/src/schema.ts b/packages/nvisy-runtime/src/schema.ts deleted file mode 100644 index 81f3625..0000000 --- a/packages/nvisy-runtime/src/schema.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { z } from "zod"; - -/** Strategy for calculating delay between retry attempts. */ -export const BackoffStrategy = z.enum(["fixed", "exponential", "jitter"]); - -/** Controls how failed operations are retried. */ -export const RetryPolicy = z.object({ - /** Maximum number of retry attempts after the initial failure. */ - maxRetries: z.number().default(3), - /** Strategy for calculating delay between attempts. */ - backoff: BackoffStrategy.default("exponential"), - /** Delay before the first retry in milliseconds. */ - initialDelayMs: z.number().default(1000), - /** Maximum delay between retries in milliseconds. */ - maxDelayMs: z.number().default(30_000), -}); - -/** Controls execution time limits for nodes and graphs. */ -export const TimeoutPolicy = z.object({ - /** Maximum execution time for a single node in milliseconds. */ - nodeTimeoutMs: z.number().optional(), - /** Maximum execution time for the entire graph in milliseconds. */ - graphTimeoutMs: z.number().optional(), -}); - -/** Controls parallel execution limits. */ -export const ConcurrencyPolicy = z.object({ - /** Maximum number of nodes executing concurrently across the graph. */ - maxGlobal: z.number().default(10), - /** Maximum concurrent operations within a single node. */ - maxPerNode: z.number().optional(), -}); - -export type BackoffStrategy = z.infer<typeof BackoffStrategy>; -export type RetryPolicy = z.infer<typeof RetryPolicy>; -export type TimeoutPolicy = z.infer<typeof TimeoutPolicy>; -export type ConcurrencyPolicy = z.infer<typeof ConcurrencyPolicy>; - -/** Common properties shared by all node types. */ -const NodeBase = z.object({ - /** Unique identifier for the node. */ - id: z.uuid(), - /** Retry policy for this node. Overrides graph-level policy. */ - retry: RetryPolicy.optional(), - /** Timeout policy for this node. Overrides graph-level policy. */ - timeout: TimeoutPolicy.optional(), - /** Concurrency policy for this node. Overrides graph-level policy. */ - concurrency: ConcurrencyPolicy.optional(), -}); - -/** A source node reads data from an external system. */ -export const SourceNode = NodeBase.extend({ - /** Discriminator for source nodes. 
*/ - type: z.literal("source"), - /** Provider identifier in "module/name" format. */ - provider: z.string(), - /** Stream identifier in "module/name" format. */ - stream: z.string(), - /** UUID reference to a connection in the connections map. */ - connection: z.uuid(), - /** Stream-specific configuration parameters. */ - params: z.record(z.string(), z.unknown()), -}); - -/** An action node transforms data flowing through the graph. */ -export const ActionNode = NodeBase.extend({ - /** Discriminator for action nodes. */ - type: z.literal("action"), - /** Action identifier in "module/name" format. */ - action: z.string(), - /** Provider identifier for client-bound actions (optional). */ - provider: z.string().optional(), - /** UUID reference to a connection for client-bound actions (optional). */ - connection: z.uuid().optional(), - /** Action-specific configuration parameters. */ - params: z.record(z.string(), z.unknown()).default({}), -}); - -/** A target node writes data to an external system. */ -export const TargetNode = NodeBase.extend({ - /** Discriminator for target nodes. */ - type: z.literal("target"), - /** Provider identifier in "module/name" format. */ - provider: z.string(), - /** Stream identifier in "module/name" format. */ - stream: z.string(), - /** UUID reference to a connection in the connections map. */ - connection: z.uuid(), - /** Stream-specific configuration parameters. */ - params: z.record(z.string(), z.unknown()), -}); - -/** A node in the execution graph. Can be a source, action, or target. */ -export const GraphNode = z.discriminatedUnion("type", [ - SourceNode, - ActionNode, - TargetNode, -]); - -/** A directed edge connecting two nodes in the graph. */ -export const GraphEdge = z.object({ - /** UUID of the source node. */ - from: z.uuid(), - /** UUID of the target node. */ - to: z.uuid(), -}); - -export type SourceNode = z.infer<typeof SourceNode>; -export type ActionNode = z.infer<typeof ActionNode>; -export type TargetNode = z.infer<typeof TargetNode>; -export type GraphNode = z.infer<typeof GraphNode>; -export type GraphEdge = z.infer<typeof GraphEdge>; - -/** - * A complete graph definition describing a data pipeline. - * - * The graph is a directed acyclic graph (DAG) where source nodes produce data, - * action nodes transform data, target nodes consume data, and edges define - * data flow between nodes. - */ -export const Graph = z.object({ - /** Unique identifier for the graph. */ - id: z.uuid(), - /** Human-readable name for the graph. */ - name: z.string().optional(), - /** Description of what the graph does. */ - description: z.string().optional(), - /** Nodes in the graph. */ - nodes: z.array(GraphNode), - /** Edges connecting nodes. Defines data flow direction. */ - edges: z.array(GraphEdge).default([]), - /** Graph-level concurrency policy. Can be overridden per-node. */ - concurrency: ConcurrencyPolicy.optional(), - /** Graph-level timeout policy. Can be overridden per-node. */ - timeout: TimeoutPolicy.optional(), - /** Arbitrary metadata attached to the graph. 
*/ - metadata: z.record(z.string(), z.unknown()).default({}), -}); - -export type Graph = z.infer<typeof Graph>; diff --git a/packages/nvisy-runtime/test/compile.test.ts b/packages/nvisy-runtime/test/compile.test.ts deleted file mode 100644 index 414fe5d..0000000 --- a/packages/nvisy-runtime/test/compile.test.ts +++ /dev/null @@ -1,126 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { compile } from "../src/compiler/index.js"; -import { - ACTION_ID, - diamondGraph, - GRAPH_ID, - linearGraph, - makeTestRegistry, - SOURCE_ID, - TARGET_ID, -} from "./fixtures.js"; - -describe("compile", () => { - it("compiles a valid linear graph end-to-end", () => { - const registry = makeTestRegistry(); - const plan = compile(linearGraph(), registry); - - expect(plan.definition.id).toBe(GRAPH_ID); - expect(plan.order).toEqual([SOURCE_ID, ACTION_ID, TARGET_ID]); - expect(plan.graph.order).toBe(3); - expect(plan.graph.size).toBe(2); - }); - - it("compiles a diamond graph end-to-end", () => { - const registry = makeTestRegistry(); - const plan = compile(diamondGraph(), registry); - - expect(plan.order[0]).toBe(SOURCE_ID); - expect(plan.order[plan.order.length - 1]).toBe(TARGET_ID); - expect(plan.order).toHaveLength(4); - }); - - it("resolves all nodes during compilation", () => { - const registry = makeTestRegistry(); - const plan = compile(linearGraph(), registry); - - for (const id of plan.order) { - expect(plan.resolved.get(id)).toBeDefined(); - } - }); - - it("rejects invalid input", () => { - const registry = makeTestRegistry(); - - expect(() => compile("not a graph", registry)).toThrow("Graph parse error"); - }); - - it("rejects graphs with cycles through full pipeline", () => { - const registry = makeTestRegistry(); - const cyclic = { - id: GRAPH_ID, - nodes: [ - { id: SOURCE_ID, type: "action", action: "test/noop", config: {} }, - { id: ACTION_ID, type: "action", action: "test/noop", config: {} }, - ], - edges: [ - { from: SOURCE_ID, to: ACTION_ID }, - { from: ACTION_ID, to: SOURCE_ID }, - ], - }; - - expect(() => compile(cyclic, registry)).toThrow("Graph contains a cycle"); - }); - - it("rejects unresolved names through full pipeline", () => { - const registry = makeTestRegistry(); - const unresolved = { - id: GRAPH_ID, - nodes: [ - { id: SOURCE_ID, type: "action", action: "missing/action", config: {} }, - ], - }; - - expect(() => compile(unresolved, registry)).toThrow("Unresolved names"); - }); - - it("preserves concurrency policy in definition", () => { - const registry = makeTestRegistry(); - const input = { - ...linearGraph(), - concurrency: { maxGlobal: 5 }, - }; - - const plan = compile(input, registry); - - expect(plan.definition.concurrency?.maxGlobal).toBe(5); - }); - - it("rejects source node without connection field", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/read", - // missing connection - params: { table: "t" }, - }, - ], - }; - - expect(() => compile(input, registry)).toThrow("Graph parse error"); - }); - - it("rejects non-UUID connection field", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/read", - connection: "not-a-uuid", - params: { table: "t" }, - }, - ], - }; - - expect(() => compile(input, registry)).toThrow("Graph parse error"); - }); -}); diff --git a/packages/nvisy-runtime/test/engine.test.ts 
b/packages/nvisy-runtime/test/engine.test.ts deleted file mode 100644 index 6ea0060..0000000 --- a/packages/nvisy-runtime/test/engine.test.ts +++ /dev/null @@ -1,493 +0,0 @@ -import { - Action, - CancellationError, - Document, - Plugin, - Provider, - RuntimeError, - ValidationError, -} from "@nvisy/core"; -import { beforeEach, describe, expect, it } from "vitest"; -import { z } from "zod"; -import type { Connections } from "../src/engine/connections.js"; -import { - CRED_ID, - diamondGraph, - linearGraph, - makeTestEngine, - SOURCE_ID, - sourceEntries, - testConnections, - writtenItems, -} from "./fixtures.js"; - -beforeEach(() => { - writtenItems.length = 0; -}); - -describe("validate", () => { - it("valid graph returns { valid: true, errors: [] }", () => { - const engine = makeTestEngine(); - const result = engine.validate(linearGraph(), testConnections()); - - expect(result.valid).toBe(true); - expect(result.errors).toEqual([]); - }); - - it("invalid graph returns errors", () => { - const engine = makeTestEngine(); - const result = engine.validate("not a graph", testConnections()); - - expect(result.valid).toBe(false); - expect(result.errors.length).toBeGreaterThan(0); - expect(result.errors[0]).toContain("Graph parse error"); - }); - - it("missing connections returns errors", () => { - const engine = makeTestEngine(); - const result = engine.validate(linearGraph(), {}); - - expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.includes("Missing connection"))).toBe( - true, - ); - }); - - it("invalid credentials returns errors", () => { - const engine = makeTestEngine(); - const connections: Connections = { - [CRED_ID]: { - type: "testdb", - credentials: { wrong: "field" }, - context: {}, - }, - }; - const result = engine.validate(linearGraph(), connections); - - expect(result.valid).toBe(false); - expect(result.errors.some((e) => e.includes("Invalid credentials"))).toBe( - true, - ); - }); - - it("incompatible provider client rejected at execution time", async () => { - // Action requires this client class - abstract class EmbeddingClient { - abstract embed(input: string[]): Promise<number[][]>; - } - - // Provider produces a plain object — not an EmbeddingClient - class CompletionOnlyClient {} - - const incompatProvider = Provider.withAuthentication("incompatdb", { - credentials: z.object({ key: z.string() }), - connect: async () => ({ client: new CompletionOnlyClient() }), - }); - - const embeddingAction = Action.withClient("needs_embed", EmbeddingClient, { - types: [Document], - params: z.object({}), - transform: (stream) => stream, - }); - - const capPlugin = Plugin.define("cap") - .withProviders(incompatProvider) - .withActions(embeddingAction); - - const engine = makeTestEngine(); - engine.register(capPlugin); - - const credId = "00000000-0000-4000-8000-0000000000d0"; - const graph = { - id: "00000000-0000-4000-8000-000000000030", - nodes: [ - { - id: "00000000-0000-4000-8000-000000000031", - type: "action" as const, - action: "cap/needs_embed", - provider: "cap/incompatdb", - connection: credId, - params: {}, - }, - ], - edges: [], - }; - - const connections: Connections = { - [credId]: { - type: "incompatdb", - credentials: { key: "test" }, - context: {}, - }, - }; - - const result = await engine.executeSync(graph, connections); - expect(result.status).toBe("failure"); - expect( - result.nodes.some( - (n) => - n.status === "failure" && n.error?.message.includes("not compatible"), - ), - ).toBe(true); - }); -}); - -describe("execute", () => { - it("linear 
pipeline: source -> action -> target", async () => { - const engine = makeTestEngine(); - const result = await engine.executeSync(linearGraph(), testConnections()); - - expect(result.status).toBe("success"); - expect(result.nodes).toHaveLength(3); - for (const node of result.nodes) { - expect(node.status).toBe("success"); - } - expect(writtenItems).toHaveLength(sourceEntries.length); - }); - - it("diamond graph: source -> 2 actions -> target", async () => { - const engine = makeTestEngine(); - const result = await engine.executeSync(diamondGraph(), testConnections()); - - expect(result.status).toBe("success"); - expect(result.nodes).toHaveLength(4); - for (const node of result.nodes) { - expect(node.status).toBe("success"); - } - // Source fans out to 2 actions, each forwards all items to target - // Target sees items from both action branches - expect(writtenItems).toHaveLength(sourceEntries.length * 2); - }); - - it("empty source: 0 items, all nodes succeed", async () => { - const engine = makeTestEngine(); - // Override source entries to empty for this test - const original = [...sourceEntries]; - sourceEntries.length = 0; - try { - const result = await engine.executeSync(linearGraph(), testConnections()); - - expect(result.status).toBe("success"); - for (const node of result.nodes) { - expect(node.status).toBe("success"); - } - expect(writtenItems).toHaveLength(0); - } finally { - sourceEntries.push(...original); - } - }); - - it("cancellation via AbortSignal (pre-aborted)", async () => { - const engine = makeTestEngine(); - const controller = new AbortController(); - controller.abort(); - - await expect( - engine.executeSync(linearGraph(), testConnections(), { - signal: controller.signal, - }), - ).rejects.toThrow(CancellationError); - }); - - it("cancellation via AbortSignal (abort during execution)", async () => { - const engine = makeTestEngine(); - const controller = new AbortController(); - - // Abort after a short delay - setTimeout(() => controller.abort(), 5); - - // The execution should either complete normally (if fast enough) - // or be halted. Either outcome is acceptable — we just verify - // it doesn't hang. 
- const result = await engine.executeSync(linearGraph(), testConnections(), { - signal: controller.signal, - }); - // If we get here, execution completed before abort — that's fine - expect(result).toBeDefined(); - }); - - it("non-retryable error stops immediately", async () => { - const { Action, Document, Plugin, Provider, Stream } = await import( - "@nvisy/core" - ); - const { z } = await import("zod"); - - class FailClient {} - - const failProvider = Provider.withAuthentication("faildb", { - credentials: z.object({ host: z.string() }), - connect: async () => ({ - client: new FailClient(), - }), - }); - - const failSource = Stream.createSource("read", FailClient, { - type: Document, - context: z.object({}).default({}), - params: z.record(z.string(), z.unknown()), - // biome-ignore lint/correctness/useYield: intentionally throws before yielding to test error handling - reader: async function* () { - throw new RuntimeError("Non-retryable failure", { - retryable: false, - }); - }, - }); - - const failTarget = Stream.createTarget("write", FailClient, { - type: Document, - params: z.record(z.string(), z.unknown()), - writer: () => async () => {}, - }); - - const failPlugin = Plugin.define("fail") - .withActions( - Action.withoutClient("noop", { - types: [Document], - params: z.object({}), - transform: (stream) => stream, - }), - ) - .withProviders(failProvider) - .withStreams(failSource, failTarget); - - const engine = makeTestEngine(); - engine.register(failPlugin); - - const failCredId = "00000000-0000-4000-8000-0000000000f1"; - const graph = { - id: "00000000-0000-4000-8000-000000000010", - nodes: [ - { - id: "00000000-0000-4000-8000-000000000011", - type: "source" as const, - provider: "fail/faildb", - stream: "fail/read", - connection: failCredId, - params: {}, - retry: { - maxRetries: 3, - backoff: "fixed" as const, - initialDelayMs: 1, - maxDelayMs: 1, - }, - }, - { - id: "00000000-0000-4000-8000-000000000012", - type: "target" as const, - provider: "fail/faildb", - stream: "fail/write", - connection: failCredId, - params: {}, - }, - ], - edges: [ - { - from: "00000000-0000-4000-8000-000000000011", - to: "00000000-0000-4000-8000-000000000012", - }, - ], - }; - - const connections: Connections = { - ...testConnections(), - [failCredId]: { - type: "faildb", - credentials: { host: "localhost" }, - context: {}, - }, - }; - - const result = await engine.executeSync(graph, connections); - - // The source node should fail (non-retryable skips retries) - const sourceNode = result.nodes.find( - (n) => n.nodeId === "00000000-0000-4000-8000-000000000011", - ); - expect(sourceNode?.status).toBe("failure"); - expect(sourceNode?.error?.message).toContain("Non-retryable failure"); - }); - - it("retryable error triggers retry", async () => { - const { Action, Document, Plugin, Provider, Stream } = await import( - "@nvisy/core" - ); - const { z } = await import("zod"); - - let attempts = 0; - - class RetryClient {} - - const retryProvider = Provider.withAuthentication("retrydb", { - credentials: z.object({ host: z.string() }), - connect: async () => ({ - client: new RetryClient(), - }), - }); - - const retrySource = Stream.createSource("read", RetryClient, { - type: Document, - context: z.object({}).default({}), - params: z.record(z.string(), z.unknown()), - reader: async function* () { - attempts++; - if (attempts < 3) { - throw new RuntimeError("Transient failure", { retryable: true }); - } - yield { - data: new Document("recovered"), - context: {}, - }; - }, - }); - - const retryTarget = 
Stream.createTarget("write", RetryClient, { - type: Document, - params: z.record(z.string(), z.unknown()), - writer: () => async () => {}, - }); - - const retryPlugin = Plugin.define("retry") - .withActions( - Action.withoutClient("noop", { - types: [Document], - params: z.object({}), - transform: (stream) => stream, - }), - ) - .withProviders(retryProvider) - .withStreams(retrySource, retryTarget); - - const engine = makeTestEngine(); - engine.register(retryPlugin); - - const retryCredId = "00000000-0000-4000-8000-0000000000f2"; - const graph = { - id: "00000000-0000-4000-8000-000000000020", - nodes: [ - { - id: "00000000-0000-4000-8000-000000000021", - type: "source" as const, - provider: "retry/retrydb", - stream: "retry/read", - connection: retryCredId, - params: {}, - retry: { - maxRetries: 5, - backoff: "fixed" as const, - initialDelayMs: 1, - maxDelayMs: 1, - }, - }, - { - id: "00000000-0000-4000-8000-000000000022", - type: "target" as const, - provider: "retry/retrydb", - stream: "retry/write", - connection: retryCredId, - params: {}, - }, - ], - edges: [ - { - from: "00000000-0000-4000-8000-000000000021", - to: "00000000-0000-4000-8000-000000000022", - }, - ], - }; - - const connections: Connections = { - ...testConnections(), - [retryCredId]: { - type: "retrydb", - credentials: { host: "localhost" }, - context: {}, - }, - }; - - const result = await engine.executeSync(graph, connections); - - expect(result.status).toBe("success"); - expect(attempts).toBe(3); // Failed twice, succeeded on third - }); - - it("calls onContextUpdate for each yielded resumable", async () => { - const engine = makeTestEngine(); - const updates: Array<{ - nodeId: string; - connectionId: string; - context: unknown; - }> = []; - - const result = await engine.executeSync(linearGraph(), testConnections(), { - onContextUpdate: (nodeId, connectionId, context) => { - updates.push({ nodeId, connectionId, context }); - }, - }); - - expect(result.status).toBe("success"); - expect(updates).toHaveLength(sourceEntries.length); - for (const update of updates) { - expect(update.nodeId).toBe(SOURCE_ID); - expect(update.connectionId).toBe(CRED_ID); - expect(update.context).toHaveProperty("cursor"); - } - }); -}); - -describe("credential validation", () => { - it("rejects malformed connections map (non-UUID keys)", async () => { - const engine = makeTestEngine(); - const connections = { - "not-a-uuid": { - type: "testdb", - credentials: { host: "localhost" }, - context: {}, - }, - }; - - await expect( - engine.executeSync(linearGraph(), connections), - ).rejects.toThrow(ValidationError); - }); - - it("rejects missing connection entry at execution time", async () => { - const engine = makeTestEngine(); - - // Provide empty connections — the node references CRED_ID which won't be found - await expect(engine.executeSync(linearGraph(), {})).rejects.toThrow( - ValidationError, - ); - }); - - it("rejects credentials that don't match provider schema", async () => { - const engine = makeTestEngine(); - const connections: Connections = { - [CRED_ID]: { - type: "testdb", - credentials: { wrong: "field" }, - context: {}, - }, - }; - - await expect( - engine.executeSync(linearGraph(), connections), - ).rejects.toThrow(ValidationError); - }); - - it("accepts valid connections map with extra entries", async () => { - const engine = makeTestEngine(); - const extraCredId = "00000000-0000-4000-8000-0000000000e0"; - const connections: Connections = { - ...testConnections(), - [extraCredId]: { - type: "other", - credentials: { unused: true 
}, - context: {}, - }, - }; - - const result = await engine.executeSync(linearGraph(), connections); - - expect(result.status).toBe("success"); - }); -}); diff --git a/packages/nvisy-runtime/test/fixtures.ts b/packages/nvisy-runtime/test/fixtures.ts deleted file mode 100644 index d42b087..0000000 --- a/packages/nvisy-runtime/test/fixtures.ts +++ /dev/null @@ -1,236 +0,0 @@ -import type { JsonValue, Resumable } from "@nvisy/core"; -import { Action, Data, Plugin, Provider, Stream } from "@nvisy/core"; -import { z } from "zod"; -import type { Connections } from "../src/engine/connections.js"; -import { Engine } from "../src/engine/engine.js"; -import { Registry } from "../src/registry.js"; - -/** Minimal row-like data type for testing. */ -export class TestRow extends Data { - readonly #columns: Readonly<Record<string, JsonValue>>; - - constructor(columns: Record<string, JsonValue>) { - super(); - this.#columns = columns; - } - - get columns(): Readonly<Record<string, JsonValue>> { - return this.#columns; - } - - get(column: string): JsonValue | undefined { - return this.#columns[column]; - } -} - -export const GRAPH_ID = "00000000-0000-4000-8000-000000000000"; -export const SOURCE_ID = "00000000-0000-4000-8000-000000000001"; -export const ACTION_ID = "00000000-0000-4000-8000-000000000002"; -export const TARGET_ID = "00000000-0000-4000-8000-000000000003"; -export const EXTRA_ID = "00000000-0000-4000-8000-000000000004"; -export const CRED_ID = "00000000-0000-4000-8000-0000000000c0"; - -const NoopParams = z.object({}); - -export const noopAction = Action.withoutClient("noop", { - types: [TestRow], - params: NoopParams, - transform: (stream, _params) => stream, -}); - -const TestCredentials = z.object({ host: z.string() }); - -class TestClient {} - -export const testProvider = Provider.withAuthentication("testdb", { - credentials: TestCredentials, - connect: async (_creds) => ({ - client: new TestClient(), - }), -}); - -const TestContext = z.object({ - cursor: z.string().nullable().default(null), -}); - -const TestParams = z.record(z.string(), z.unknown()); - -/** - * Items produced by the mock source stream. - * Exposed so tests can assert on them. - */ -export const sourceEntries: TestRow[] = [ - new TestRow({ name: "Alice", age: 30 }), - new TestRow({ name: "Bob", age: 25 }), - new TestRow({ name: "Carol", age: 35 }), -]; - -export const testSourceStream = Stream.createSource("read", TestClient, { - type: TestRow, - context: TestContext, - params: TestParams, - reader: async function* (_client, _ctx, _params) { - for (const row of sourceEntries) { - yield { data: row, context: { cursor: row.id } } as Resumable< - TestRow, - z.infer<typeof TestContext> - >; - } - }, -}); - -/** - * Items written to the mock target stream. - * Tests can inspect this array after execution. - */ -export const writtenItems: Data[] = []; - -export const testTargetStream = Stream.createTarget("write", TestClient, { - type: TestRow, - params: TestParams, - writer: (_client, _params) => { - return async (item: TestRow) => { - writtenItems.push(item); - }; - }, -}); - -export const testPlugin = Plugin.define("test") - .withActions(noopAction) - .withProviders(testProvider) - .withStreams(testSourceStream, testTargetStream); - -/** - * Create a Registry pre-loaded with the test plugin. - */ -export function makeTestRegistry(): Registry { - const registry = new Registry(); - registry.load(testPlugin); - return registry; -} - -/** - * Create an Engine pre-loaded with the test plugin. 
- */ -export function makeTestEngine(): Engine { - return new Engine().register(testPlugin); -} - -/** - * Default credential map matching the test provider's schema. - */ -export function testCredentials() { - return { [CRED_ID]: { host: "localhost" } }; -} - -/** - * Default connections map matching the test provider's schema. - */ -export function testConnections(): Connections { - return { - [CRED_ID]: { - type: "testdb", - credentials: { host: "localhost" }, - context: {}, - }, - }; -} - -export function linearGraph() { - return { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source" as const, - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "users" }, - }, - { - id: ACTION_ID, - type: "action" as const, - action: "test/noop", - params: {}, - }, - { - id: TARGET_ID, - type: "target" as const, - provider: "test/testdb", - stream: "test/write", - connection: CRED_ID, - params: { table: "output" }, - }, - ], - edges: [ - { from: SOURCE_ID, to: ACTION_ID }, - { from: ACTION_ID, to: TARGET_ID }, - ], - }; -} - -export function isolatedNodesGraph() { - return { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source" as const, - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "users" }, - }, - { - id: ACTION_ID, - type: "action" as const, - action: "test/noop", - params: {}, - }, - ], - edges: [], - }; -} - -export function diamondGraph() { - return { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source" as const, - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "users" }, - }, - { - id: ACTION_ID, - type: "action" as const, - action: "test/noop", - params: {}, - }, - { - id: EXTRA_ID, - type: "action" as const, - action: "test/noop", - params: {}, - }, - { - id: TARGET_ID, - type: "target" as const, - provider: "test/testdb", - stream: "test/write", - connection: CRED_ID, - params: { table: "output" }, - }, - ], - edges: [ - { from: SOURCE_ID, to: ACTION_ID }, - { from: SOURCE_ID, to: EXTRA_ID }, - { from: ACTION_ID, to: TARGET_ID }, - { from: EXTRA_ID, to: TARGET_ID }, - ], - }; -} diff --git a/packages/nvisy-runtime/test/parse.test.ts b/packages/nvisy-runtime/test/parse.test.ts deleted file mode 100644 index 79f0184..0000000 --- a/packages/nvisy-runtime/test/parse.test.ts +++ /dev/null @@ -1,175 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { parseGraph } from "../src/compiler/parse.js"; -import { - ACTION_ID, - CRED_ID, - GRAPH_ID, - linearGraph, - SOURCE_ID, - TARGET_ID, -} from "./fixtures.js"; - -describe("parseGraph", () => { - it("parses a valid linear graph", () => { - const result = parseGraph(linearGraph()); - - expect(result.definition.id).toBe(GRAPH_ID); - expect(result.definition.nodes).toHaveLength(3); - expect(result.definition.edges).toHaveLength(2); - expect(result.graph.order).toBe(3); - expect(result.graph.size).toBe(2); - }); - - it("returns a RuntimeGraph with correct node attributes", () => { - const result = parseGraph(linearGraph()); - - const attrs = result.graph.getNodeAttributes(SOURCE_ID); - expect(attrs.schema.type).toBe("source"); - }); - - it("creates edge keys in from->to format", () => { - const result = parseGraph(linearGraph()); - - expect(result.graph.hasEdge(`${SOURCE_ID}->${ACTION_ID}`)).toBe(true); - expect(result.graph.hasEdge(`${ACTION_ID}->${TARGET_ID}`)).toBe(true); - }); - - it("rejects input missing required fields", () => { - expect(() => parseGraph({})).toThrow("Graph 
parse error"); - }); - - it("rejects non-UUID node IDs", () => { - const bad = { - id: GRAPH_ID, - nodes: [ - { id: "not-a-uuid", type: "action", action: "test/noop", params: {} }, - ], - }; - - expect(() => parseGraph(bad)).toThrow("Graph parse error"); - }); - - it("rejects non-UUID graph ID", () => { - const bad = { - id: "bad-graph-id", - nodes: [], - }; - - expect(() => parseGraph(bad)).toThrow("Graph parse error"); - }); - - it("defaults edges to empty array", () => { - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "x", - stream: "x/read", - connection: CRED_ID, - params: { key: "val" }, - }, - ], - }; - - const result = parseGraph(input); - expect(result.definition.edges).toEqual([]); - expect(result.graph.size).toBe(0); - }); - - it("defaults metadata to empty object", () => { - const input = { - id: GRAPH_ID, - nodes: [], - }; - - const result = parseGraph(input); - expect(result.definition.metadata).toEqual({}); - }); - - it("rejects duplicate node IDs (caught by graphology during parse)", () => { - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "t" }, - }, - { id: SOURCE_ID, type: "action", action: "test/noop", params: {} }, - ], - }; - - expect(() => parseGraph(input)).toThrow("already exist"); - }); - - it("rejects dangling edge.from references (caught by graphology during parse)", () => { - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "t" }, - }, - ], - edges: [{ from: ACTION_ID, to: SOURCE_ID }], - }; - - expect(() => parseGraph(input)).toThrow("not found"); - }); - - it("rejects dangling edge.to references (caught by graphology during parse)", () => { - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/read", - connection: CRED_ID, - params: { table: "t" }, - }, - ], - edges: [{ from: SOURCE_ID, to: ACTION_ID }], - }; - - expect(() => parseGraph(input)).toThrow("not found"); - }); -}); - -describe("buildRuntimeGraph", () => { - it("builds a graph matching the definition", () => { - const { graph } = parseGraph({ - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source" as const, - provider: "x", - stream: "x/read", - connection: CRED_ID, - params: { k: "v" }, - }, - { id: ACTION_ID, type: "action" as const, action: "y", params: {} }, - ], - edges: [{ from: SOURCE_ID, to: ACTION_ID }], - }); - - expect(graph.order).toBe(2); - expect(graph.size).toBe(1); - expect(graph.hasNode(SOURCE_ID)).toBe(true); - expect(graph.hasNode(ACTION_ID)).toBe(true); - expect(graph.hasEdge(`${SOURCE_ID}->${ACTION_ID}`)).toBe(true); - expect(graph.source(`${SOURCE_ID}->${ACTION_ID}`)).toBe(SOURCE_ID); - expect(graph.target(`${SOURCE_ID}->${ACTION_ID}`)).toBe(ACTION_ID); - }); -}); diff --git a/packages/nvisy-runtime/test/plan.test.ts b/packages/nvisy-runtime/test/plan.test.ts deleted file mode 100644 index 86b0b79..0000000 --- a/packages/nvisy-runtime/test/plan.test.ts +++ /dev/null @@ -1,179 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { parseGraph } from "../src/compiler/parse.js"; -import { buildPlan } from "../src/compiler/plan.js"; -import { - ACTION_ID, - CRED_ID, - diamondGraph, - EXTRA_ID, - GRAPH_ID, - isolatedNodesGraph, - linearGraph, - makeTestRegistry, - SOURCE_ID, - TARGET_ID, -} from 
"./fixtures.js"; - -describe("buildPlan", () => { - it("produces a topological order for a linear graph", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(linearGraph()); - const plan = buildPlan(parsed, registry); - - expect(plan.order).toEqual([SOURCE_ID, ACTION_ID, TARGET_ID]); - }); - - it("produces a valid topological order for a diamond graph", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(diamondGraph()); - const plan = buildPlan(parsed, registry); - - // Source must come first, sink must come last - expect(plan.order[0]).toBe(SOURCE_ID); - expect(plan.order[plan.order.length - 1]).toBe(TARGET_ID); - // Both middle nodes must appear between source and sink - expect(plan.order).toContain(ACTION_ID); - expect(plan.order).toContain(EXTRA_ID); - expect(plan.order.indexOf(ACTION_ID)).toBeGreaterThan(0); - expect(plan.order.indexOf(EXTRA_ID)).toBeGreaterThan(0); - expect(plan.order.indexOf(ACTION_ID)).toBeLessThan(plan.order.length - 1); - expect(plan.order.indexOf(EXTRA_ID)).toBeLessThan(plan.order.length - 1); - }); - - it("handles isolated nodes (no edges)", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(isolatedNodesGraph()); - const plan = buildPlan(parsed, registry); - - expect(plan.order).toHaveLength(2); - expect(plan.order).toContain(SOURCE_ID); - expect(plan.order).toContain(ACTION_ID); - }); - - it("stores resolved entries in the resolved map", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(linearGraph()); - const plan = buildPlan(parsed, registry); - - const sourceResolved = plan.resolved.get(SOURCE_ID); - expect(sourceResolved).toBeDefined(); - expect(sourceResolved!.type).toBe("source"); - - const actionResolved = plan.resolved.get(ACTION_ID); - expect(actionResolved).toBeDefined(); - expect(actionResolved!.type).toBe("action"); - - const targetResolved = plan.resolved.get(TARGET_ID); - expect(targetResolved).toBeDefined(); - expect(targetResolved!.type).toBe("target"); - }); - - it("exposes the graph definition on the plan", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(linearGraph()); - const plan = buildPlan(parsed, registry); - - expect(plan.definition.id).toBe(GRAPH_ID); - expect(plan.definition.nodes).toHaveLength(3); - }); - - it("plan.graph is the same RuntimeGraph instance", () => { - const registry = makeTestRegistry(); - const parsed = parseGraph(linearGraph()); - const plan = buildPlan(parsed, registry); - - // Verify graph structure matches definition - expect(plan.graph.order).toBe(plan.definition.nodes.length); - expect(plan.graph.size).toBe(plan.definition.edges.length); - }); - - it("rejects graphs with cycles", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { id: SOURCE_ID, type: "action", action: "test/noop", params: {} }, - { id: ACTION_ID, type: "action", action: "test/noop", params: {} }, - { id: TARGET_ID, type: "action", action: "test/noop", params: {} }, - ], - edges: [ - { from: SOURCE_ID, to: ACTION_ID }, - { from: ACTION_ID, to: TARGET_ID }, - { from: TARGET_ID, to: SOURCE_ID }, - ], - }; - - const parsed = parseGraph(input); - expect(() => buildPlan(parsed, registry)).toThrow("Graph contains a cycle"); - }); - - it("rejects unresolved action names", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { - id: ACTION_ID, - type: "action", - action: "nonexistent/action", - params: {}, - }, - ], - }; - - const parsed = 
parseGraph(input); - expect(() => buildPlan(parsed, registry)).toThrow("Unresolved names"); - }); - - it("rejects unresolved provider names", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "nonexistent/provider", - stream: "test/read", - connection: CRED_ID, - params: { key: "val" }, - }, - ], - }; - - const parsed = parseGraph(input); - expect(() => buildPlan(parsed, registry)).toThrow("Unresolved names"); - }); - - it("rejects unresolved stream names", () => { - const registry = makeTestRegistry(); - const input = { - id: GRAPH_ID, - nodes: [ - { - id: SOURCE_ID, - type: "source", - provider: "test/testdb", - stream: "test/nonexistent", - connection: CRED_ID, - params: { key: "val" }, - }, - ], - }; - - const parsed = parseGraph(input); - expect(() => buildPlan(parsed, registry)).toThrow("Unresolved names"); - }); - - it("passes for an empty graph", () => { - const registry = makeTestRegistry(); - const input = { id: GRAPH_ID, nodes: [] }; - - const parsed = parseGraph(input); - const plan = buildPlan(parsed, registry); - - expect(plan.definition.nodes).toHaveLength(0); - expect(plan.graph.order).toBe(0); - expect(plan.resolved.size).toBe(0); - }); -}); diff --git a/packages/nvisy-runtime/test/registry.test.ts b/packages/nvisy-runtime/test/registry.test.ts deleted file mode 100644 index 5252715..0000000 --- a/packages/nvisy-runtime/test/registry.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { Datatype, Document, Plugin, ValidationError } from "@nvisy/core"; -import { describe, expect, it } from "vitest"; -import { Registry } from "../src/registry.js"; -import { - makeTestRegistry, - noopAction, - testPlugin, - testProvider, - testSourceStream, - testTargetStream, -} from "./fixtures.js"; - -describe("Registry", () => { - it("loads a plugin and resolves its entries by qualified name", () => { - const registry = makeTestRegistry(); - - expect(registry.getAction("test/noop")).toBe(noopAction); - expect(registry.getProvider("test/testdb")).toBe(testProvider); - expect(registry.getStream("test/read")).toBe(testSourceStream); - expect(registry.getStream("test/write")).toBe(testTargetStream); - }); - - it("find* returns undefined for missing entries", () => { - const registry = makeTestRegistry(); - - expect(registry.findAction("missing/action")).toBeUndefined(); - expect(registry.findProvider("missing/provider")).toBeUndefined(); - expect(registry.findStream("missing/stream")).toBeUndefined(); - expect(registry.findDataType("missing/type")).toBeUndefined(); - }); - - it("get* throws ValidationError for missing entries", () => { - const registry = makeTestRegistry(); - - expect(() => registry.getAction("missing/action")).toThrow(ValidationError); - expect(() => registry.getProvider("missing/provider")).toThrow( - ValidationError, - ); - expect(() => registry.getStream("missing/stream")).toThrow(ValidationError); - expect(() => registry.getDataType("missing/type")).toThrow(ValidationError); - }); - - it("rejects loading the same plugin twice", () => { - const registry = makeTestRegistry(); - - expect(() => registry.load(testPlugin)).toThrow("Plugin already loaded"); - }); - - it("loads datatypes and resolves them", () => { - const registry = new Registry(); - const plugin = Plugin.define("dt").withDatatypes( - Datatype.define("document", Document), - ); - registry.load(plugin); - - const entry = registry.getDataType("dt/document"); - expect(entry.id).toBe("document"); - 
expect(entry.dataClass).toBe(Document); - }); - - it("schema snapshot includes actions and providers", () => { - const registry = makeTestRegistry(); - const schema = registry.schema; - - expect(schema.actions).toHaveLength(1); - expect(schema.actions[0]!.name).toBe("test/noop"); - expect(schema.providers).toHaveLength(1); - expect(schema.providers[0]!.name).toBe("test/testdb"); - expect(schema.streams).toBe(2); - expect(schema.loaders).toBe(0); - expect(schema.datatypes).toBe(0); - }); - - it("schema snapshot is empty for a fresh registry", () => { - const registry = new Registry(); - const schema = registry.schema; - - expect(schema.actions).toHaveLength(0); - expect(schema.providers).toHaveLength(0); - expect(schema.streams).toBe(0); - expect(schema.loaders).toBe(0); - expect(schema.datatypes).toBe(0); - }); -}); diff --git a/packages/nvisy-runtime/tsconfig.json b/packages/nvisy-runtime/tsconfig.json deleted file mode 100644 index 8b06e27..0000000 --- a/packages/nvisy-runtime/tsconfig.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [ - { "path": "../nvisy-core" }, - { "path": "../nvisy-plugin-core" } - ] -} diff --git a/packages/nvisy-runtime/tsup.config.ts b/packages/nvisy-runtime/tsup.config.ts deleted file mode 100644 index d68a5db..0000000 --- a/packages/nvisy-runtime/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/index.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: { compilerOptions: { composite: false } }, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/packages/nvisy-server/README.md b/packages/nvisy-server/README.md deleted file mode 100644 index e993cc6..0000000 --- a/packages/nvisy-server/README.md +++ /dev/null @@ -1,43 +0,0 @@ -# @nvisy/server - -[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) - -HTTP server for the Nvisy Runtime platform, built on Hono. - -## Features - -- **REST API**: graph lifecycle management, run execution, and monitoring -- **Connector health checks**: verify provider connections before execution -- **Cron scheduling**: time-based pipeline triggers -- **Webhook events**: HTTP-based pipeline triggers - -## Overview - -Exposes a REST API for graph lifecycle management, run execution and monitoring, connector health checks, and lineage queries. Includes cron-based scheduling and webhook event triggers. - -## Usage - -```ts -import { createServer } from "@nvisy/server"; - -const server = createServer({ - engine, - port: 3000, -}); - -await server.start(); -``` - -## Changelog - -See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. 
- -## License - -Apache 2.0 License - see [LICENSE.txt](../../LICENSE.txt) - -## Support - -- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) -- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) -- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/packages/nvisy-server/package.json b/packages/nvisy-server/package.json deleted file mode 100644 index 945067c..0000000 --- a/packages/nvisy-server/package.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "name": "@nvisy/server", - "version": "0.1.0", - "private": true, - "description": "HTTP execution worker for the Nvisy runtime", - "type": "module", - "scripts": { - "build": "tsup", - "build:watch": "tsup --watch", - "clean": "rimraf dist", - "dev": "node --watch dist/main.js", - "start": "node dist/main.js", - "typecheck": "tsc -b" - }, - "dependencies": { - "@hono/event-emitter": "^2.0.0", - "@hono/node-server": "^1.19.9", - "@hono/node-ws": "^1.3.0", - "@hono/otel": "^1.1.0", - "@hono/zod-openapi": "^1.2.1", - "@hono/zod-validator": "^0.7.6", - "@logtape/hono": "^2.0.2", - "@logtape/logtape": "^2.0.2", - "@logtape/pretty": "^2.0.2", - "@logtape/redaction": "^2.0.2", - "@nvisy/core": "*", - "@nvisy/plugin-ai": "*", - "@nvisy/plugin-nosql": "*", - "@nvisy/plugin-object": "*", - "@nvisy/plugin-tesseract": "*", - "@nvisy/plugin-pandoc": "*", - "@nvisy/plugin-queue": "*", - "@nvisy/plugin-sql": "*", - "@nvisy/plugin-vector": "*", - "@nvisy/runtime": "*", - "@scalar/hono-api-reference": "^0.9.40", - "hono": "^4.11.7", - "zod": "^4.3.6" - }, - "engines": { - "node": ">=22.0.0" - } -} diff --git a/packages/nvisy-server/src/app.ts b/packages/nvisy-server/src/app.ts deleted file mode 100644 index 95b267e..0000000 --- a/packages/nvisy-server/src/app.ts +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Application building blocks for the Nvisy HTTP server. - * - * Exports the Hono app factory ({@link createApp}) and a plain - * server lifecycle ({@link startServer}). These are composed - * into a running service by `main.ts`. - * - * @module - */ - -import { serve } from "@hono/node-server"; -import { createNodeWebSocket } from "@hono/node-ws"; -import { OpenAPIHono } from "@hono/zod-openapi"; -import { getLogger } from "@logtape/logtape"; -import type { ServerConfig } from "./config.js"; -import { registerHandlers } from "./handler/index.js"; -import { engineMiddleware, registerMiddleware } from "./middleware/index.js"; -import { createEngine } from "./service/index.js"; - -const logger = getLogger(["nvisy", "server"]); - -/** Build a fully configured OpenAPIHono application with middleware and routes. */ -export function createApp(config: ServerConfig) { - const app = new OpenAPIHono(); - const { injectWebSocket, upgradeWebSocket } = createNodeWebSocket({ app }); - - const engine = createEngine(); - app.use("*", engineMiddleware(engine)); - - logger.debug("Registering middleware"); - registerMiddleware(app, config); - logger.debug("Registering route handlers"); - registerHandlers(app, config); - logger.debug("App initialised (mode={mode}, cors={cors})", { - mode: config.isDevelopment ? "development" : "production", - cors: config.corsOrigin, - }); - - return { app, injectWebSocket, upgradeWebSocket }; -} - -export interface StartServerOptions { - app: OpenAPIHono; - host: string; - port: number; - injectWebSocket: (server: ReturnType<typeof serve>) => void; -} - -/** - * Start the Node.js HTTP server. - * - * Returns a cleanup function that closes the server gracefully. 
- */ -export function startServer(opts: StartServerOptions): { - server: ReturnType<typeof serve>; - close: () => void; -} { - const server = serve({ - fetch: opts.app.fetch, - hostname: opts.host, - port: opts.port, - }); - opts.injectWebSocket(server); - - logger.info("Server started on {host}:{port}", { - host: opts.host, - port: opts.port, - }); - - return { - server, - close: () => server.close(), - }; -} diff --git a/packages/nvisy-server/src/config.ts b/packages/nvisy-server/src/config.ts deleted file mode 100644 index e2490f0..0000000 --- a/packages/nvisy-server/src/config.ts +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Typed server configuration loaded from environment variables. - * - * | Variable | Type | Default | - * |----------------------|--------|-----------------| - * | `PORT` | number | `8080` | - * | `HOST` | string | `"0.0.0.0"` | - * | `CORS_ORIGIN` | string | `"*"` | - * | `BODY_LIMIT_BYTES` | number | `1048576` (1MB) | - * | `REQUEST_TIMEOUT_MS` | number | `30000` (30s) | - * | `NODE_ENV` | string | `"development"` | - * - * `isDevelopment` is derived from `NODE_ENV` — `true` unless - * `NODE_ENV` is explicitly set to `"production"`. - */ - -import { AsyncLocalStorage } from "node:async_hooks"; -import { - configure, - getConsoleSink, - jsonLinesFormatter, -} from "@logtape/logtape"; -import { prettyFormatter } from "@logtape/pretty"; -import { redactByField } from "@logtape/redaction"; -import { z } from "zod"; - -const EnvSchema = z.object({ - PORT: z.coerce.number().default(8080), - HOST: z.string().default("0.0.0.0"), - CORS_ORIGIN: z.string().default("*"), - BODY_LIMIT_BYTES: z.coerce.number().default(1024 * 1024), - REQUEST_TIMEOUT_MS: z.coerce.number().default(30_000), - NODE_ENV: z.string().default("development"), -}); - -export interface ServerConfig { - readonly port: number; - readonly host: string; - readonly corsOrigin: string; - readonly bodyLimitBytes: number; - readonly requestTimeoutMs: number; - readonly isDevelopment: boolean; -} - -export function loadConfig(): ServerConfig { - const env = EnvSchema.parse(process.env); - - return { - port: env.PORT, - host: env.HOST, - corsOrigin: env.CORS_ORIGIN, - bodyLimitBytes: env.BODY_LIMIT_BYTES, - requestTimeoutMs: env.REQUEST_TIMEOUT_MS, - isDevelopment: env.NODE_ENV !== "production", - }; -} - -/** - * Configure LogTape logging. - * - * - **development** — human-readable, coloured console output via `@logtape/pretty`. - * - **production** — JSON Lines (machine-parseable) via the built-in `jsonLinesFormatter`. - * - * Sensitive fields are automatically redacted by `@logtape/redaction`. - * Per-request `requestId` is propagated to every log call via - * `AsyncLocalStorage` — see `middleware/index.ts`. - */ -export async function configureLogging(config: ServerConfig): Promise<void> { - const consoleSink = config.isDevelopment - ? getConsoleSink({ formatter: prettyFormatter }) - : getConsoleSink({ formatter: jsonLinesFormatter }); - - await configure({ - contextLocalStorage: new AsyncLocalStorage(), - sinks: { console: redactByField(consoleSink) }, - loggers: [ - { - category: ["logtape", "meta"], - lowestLevel: "warning", - sinks: ["console"], - }, - { - category: ["nvisy"], - lowestLevel: config.isDevelopment ? 
"debug" : "info", - sinks: ["console"], - }, - ], - }); -} diff --git a/packages/nvisy-server/src/handler/graphs-routes.ts b/packages/nvisy-server/src/handler/graphs-routes.ts deleted file mode 100644 index 4588e4f..0000000 --- a/packages/nvisy-server/src/handler/graphs-routes.ts +++ /dev/null @@ -1,106 +0,0 @@ -import { createRoute, z } from "@hono/zod-openapi"; -import { - CancelResponseSchema, - ErrorResponseSchema, - ExecuteRequestSchema, - ExecuteResponseSchema, - RunDetailSchema, - RunIdParamSchema, - RunSummarySchema, - ValidateRequestSchema, - ValidateResponseSchema, -} from "./graphs-schema.js"; - -export const executeRoute = createRoute({ - method: "post", - path: "/api/v1/graphs/execute", - tags: ["Graphs"], - summary: "Execute a graph", - description: - "Submit a graph for execution. Returns immediately with a run ID.", - request: { - body: { - content: { "application/json": { schema: ExecuteRequestSchema } }, - }, - }, - responses: { - 202: { - description: "Graph execution started", - content: { "application/json": { schema: ExecuteResponseSchema } }, - }, - }, -}); - -export const validateRoute = createRoute({ - method: "post", - path: "/api/v1/graphs/validate", - tags: ["Graphs"], - summary: "Validate a graph", - description: "Compile and validate a graph definition without executing it.", - request: { - body: { - content: { "application/json": { schema: ValidateRequestSchema } }, - }, - }, - responses: { - 200: { - description: "Validation result", - content: { "application/json": { schema: ValidateResponseSchema } }, - }, - }, -}); - -export const listRunsRoute = createRoute({ - method: "get", - path: "/api/v1/graphs", - tags: ["Graphs"], - summary: "List in-flight runs", - responses: { - 200: { - description: "List of currently executing runs", - content: { "application/json": { schema: z.array(RunSummarySchema) } }, - }, - }, -}); - -export const getRunRoute = createRoute({ - method: "get", - path: "/api/v1/graphs/{runId}", - tags: ["Graphs"], - summary: "Get run status", - description: "Get detailed status of a single in-flight run.", - request: { - params: RunIdParamSchema, - }, - responses: { - 200: { - description: "Run details", - content: { "application/json": { schema: RunDetailSchema } }, - }, - 404: { - description: "Run not found", - content: { "application/json": { schema: ErrorResponseSchema } }, - }, - }, -}); - -export const cancelRunRoute = createRoute({ - method: "delete", - path: "/api/v1/graphs/{runId}", - tags: ["Graphs"], - summary: "Cancel a run", - description: "Cancel a running graph execution.", - request: { - params: RunIdParamSchema, - }, - responses: { - 200: { - description: "Run cancelled", - content: { "application/json": { schema: CancelResponseSchema } }, - }, - 404: { - description: "Run not found or already completed", - content: { "application/json": { schema: ErrorResponseSchema } }, - }, - }, -}); diff --git a/packages/nvisy-server/src/handler/graphs-schema.ts b/packages/nvisy-server/src/handler/graphs-schema.ts deleted file mode 100644 index 6b18b01..0000000 --- a/packages/nvisy-server/src/handler/graphs-schema.ts +++ /dev/null @@ -1,90 +0,0 @@ -import { z } from "@hono/zod-openapi"; - -export const ErrorResponseSchema = z.object({ - status: z.number(), - error: z.string(), - requestId: z.string().optional(), -}); - -export const ConnectionSchema = z.object({ - type: z.string(), - credentials: z.unknown(), - context: z.unknown(), -}); - -export const ConnectionsSchema = z.record(z.uuid(), ConnectionSchema); - -export const 
GraphSchema = z.record(z.string(), z.unknown()); - -export const ExecuteRequestSchema = z.object({ - graph: GraphSchema, - connections: ConnectionsSchema, -}); - -export const ValidateRequestSchema = z.object({ - graph: GraphSchema, - connections: ConnectionsSchema, -}); - -export const ExecuteResponseSchema = z.object({ - runId: z.string(), -}); - -export const ValidateResponseSchema = z.object({ - valid: z.boolean(), - errors: z.array(z.string()), -}); - -export const RunStatusSchema = z.enum([ - "pending", - "running", - "completed", - "failed", - "cancelled", -]); - -export const RunSummarySchema = z.object({ - runId: z.string(), - status: RunStatusSchema, - startedAt: z.string(), - completedAt: z.string().optional(), -}); - -export const NodeProgressSchema = z.object({ - nodeId: z.string(), - status: z.enum(["pending", "running", "completed", "failed"]), - itemsProcessed: z.number(), - error: z.string().optional(), -}); - -export const NodeResultSchema = z.object({ - nodeId: z.string(), - status: z.enum(["success", "failure", "skipped"]), - itemsProcessed: z.number(), - error: z.string().optional(), -}); - -export const RunResultSchema = z.object({ - runId: z.string(), - status: z.enum(["success", "partial_failure", "failure"]), - nodes: z.array(NodeResultSchema), -}); - -export const RunDetailSchema = z.object({ - runId: z.string(), - status: RunStatusSchema, - startedAt: z.string(), - completedAt: z.string().optional(), - nodeProgress: z.record(z.string(), NodeProgressSchema), - result: RunResultSchema.optional(), - error: z.string().optional(), -}); - -export const CancelResponseSchema = z.object({ - runId: z.string(), - cancelled: z.boolean(), -}); - -export const RunIdParamSchema = z.object({ - runId: z.uuid().openapi({ param: { name: "runId", in: "path" } }), -}); diff --git a/packages/nvisy-server/src/handler/graphs.ts b/packages/nvisy-server/src/handler/graphs.ts deleted file mode 100644 index c261df8..0000000 --- a/packages/nvisy-server/src/handler/graphs.ts +++ /dev/null @@ -1,146 +0,0 @@ -import type { OpenAPIHono } from "@hono/zod-openapi"; -import { getLogger } from "@logtape/logtape"; -import { getEngine } from "../middleware/index.js"; -import { - cancelRunRoute, - executeRoute, - getRunRoute, - listRunsRoute, - validateRoute, -} from "./graphs-routes.js"; - -const logger = getLogger(["nvisy", "server"]); - -/** - * Graph execution endpoints. 
- * - * POST /api/v1/graphs/execute — Submit a graph for execution, returns { runId } - * POST /api/v1/graphs/validate — Compile and validate a graph without executing - * GET /api/v1/graphs — List in-flight runs - * GET /api/v1/graphs/:runId — Get detailed status of a single in-flight run - * DELETE /api/v1/graphs/:runId — Cancel a running execution - */ -export function registerGraphHandler(app: OpenAPIHono): void { - app.openapi(executeRoute, async (c) => { - const { graph, connections } = c.req.valid("json"); - const engine = getEngine(c); - - const runId = engine.execute(graph, connections); - logger.info("Graph execution submitted: {runId}", { runId }); - - return c.json({ runId }, 202); - }); - - app.openapi(validateRoute, async (c) => { - const { graph, connections } = c.req.valid("json"); - const engine = getEngine(c); - - logger.debug("Graph validation requested"); - const result = engine.validate(graph, connections); - - return c.json({ valid: result.valid, errors: [...result.errors] }, 200); - }); - - app.openapi(listRunsRoute, async (c) => { - const engine = getEngine(c); - - logger.debug("Listing runs"); - const runs = engine.listRuns(); - - return c.json( - runs.map((run) => ({ - runId: run.runId, - status: run.status, - startedAt: run.startedAt.toISOString(), - completedAt: run.completedAt?.toISOString(), - })), - 200, - ); - }); - - app.openapi(getRunRoute, async (c) => { - const { runId } = c.req.valid("param"); - const engine = getEngine(c); - - logger.debug("Run status requested: {runId}", { runId }); - const run = engine.getRun(runId); - - if (!run) { - const requestId = c.get("requestId") as string | undefined; - return c.json( - { status: 404, error: `Run not found: ${runId}`, requestId }, - 404, - ); - } - - const nodeProgress: Record< - string, - { - nodeId: string; - status: "pending" | "running" | "completed" | "failed"; - itemsProcessed: number; - error?: string; - } - > = {}; - for (const [nodeId, progress] of run.nodeProgress) { - nodeProgress[nodeId] = { - nodeId: progress.nodeId, - status: progress.status, - itemsProcessed: progress.itemsProcessed, - ...(progress.error && { error: progress.error.message }), - }; - } - - return c.json( - { - runId: run.runId, - status: run.status, - startedAt: run.startedAt.toISOString(), - completedAt: run.completedAt?.toISOString(), - nodeProgress, - result: run.result - ? 
{ - runId: run.result.runId, - status: run.result.status, - nodes: run.result.nodes.map((n) => ({ - nodeId: n.nodeId, - status: n.status, - itemsProcessed: n.itemsProcessed, - ...(n.error && { error: n.error.message }), - })), - } - : undefined, - error: run.error?.message, - }, - 200, - ); - }); - - app.openapi(cancelRunRoute, async (c) => { - const { runId } = c.req.valid("param"); - const engine = getEngine(c); - - logger.info("Run cancellation requested: {runId}", { runId }); - const cancelled = engine.cancelRun(runId); - - if (!cancelled) { - const requestId = c.get("requestId") as string | undefined; - return c.json( - { - status: 404, - error: `Run not found or already completed: ${runId}`, - requestId, - }, - 404, - ); - } - - return c.json({ runId, cancelled: true }, 200); - }); - - logger.debug(" POST {route}", { route: "/api/v1/graphs/execute" }); - logger.debug(" POST {route}", { route: "/api/v1/graphs/validate" }); - logger.debug(" GET {route}", { route: "/api/v1/graphs" }); - logger.debug(" GET {route}", { route: "/api/v1/graphs/:runId" }); - logger.debug(" DEL {route}", { route: "/api/v1/graphs/:runId" }); -} diff --git a/packages/nvisy-server/src/handler/health-routes.ts b/packages/nvisy-server/src/handler/health-routes.ts deleted file mode 100644 index 2c8234c..0000000 --- a/packages/nvisy-server/src/handler/health-routes.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { createRoute } from "@hono/zod-openapi"; -import { HealthResponseSchema, ReadyResponseSchema } from "./health-schema.js"; - -export const healthRoute = createRoute({ - method: "get", - path: "/health", - tags: ["Health"], - summary: "Liveness probe", - responses: { - 200: { - description: "Server is alive", - content: { "application/json": { schema: HealthResponseSchema } }, - }, - }, -}); - -export const readyRoute = createRoute({ - method: "get", - path: "/ready", - tags: ["Health"], - summary: "Readiness probe", - responses: { - 200: { - description: "Server can accept work", - content: { "application/json": { schema: ReadyResponseSchema } }, - }, - 503: { - description: "Server is not ready", - content: { "application/json": { schema: ReadyResponseSchema } }, - }, - }, -}); diff --git a/packages/nvisy-server/src/handler/health-schema.ts b/packages/nvisy-server/src/handler/health-schema.ts deleted file mode 100644 index a2c1083..0000000 --- a/packages/nvisy-server/src/handler/health-schema.ts +++ /dev/null @@ -1,9 +0,0 @@ -import { z } from "@hono/zod-openapi"; - -export const HealthResponseSchema = z.object({ - status: z.literal("ok"), -}); - -export const ReadyResponseSchema = z.object({ - status: z.enum(["ready", "unavailable"]), -}); diff --git a/packages/nvisy-server/src/handler/health.ts b/packages/nvisy-server/src/handler/health.ts deleted file mode 100644 index 4cbf8b2..0000000 --- a/packages/nvisy-server/src/handler/health.ts +++ /dev/null @@ -1,25 +0,0 @@ -import type { OpenAPIHono } from "@hono/zod-openapi"; -import { getLogger } from "@logtape/logtape"; -import { healthRoute, readyRoute } from "./health-routes.js"; - -const logger = getLogger(["nvisy", "server"]); - -/** - * Health and readiness endpoints. - * - * GET /health — Liveness probe. Always returns 200. - * GET /ready — Readiness probe. Returns 200 when the runtime can accept work. 
- */ -export function registerHealthHandler(app: OpenAPIHono): void { - app.openapi(healthRoute, (c) => { - return c.json({ status: "ok" as const }, 200); - }); - - app.openapi(readyRoute, (c) => { - // TODO: check whether the runtime can accept new graph executions - return c.json({ status: "ready" as const }, 200); - }); - - logger.debug(" GET {route}", { route: "/health" }); - logger.debug(" GET {route}", { route: "/ready" }); -} diff --git a/packages/nvisy-server/src/handler/index.ts b/packages/nvisy-server/src/handler/index.ts deleted file mode 100644 index 24488e9..0000000 --- a/packages/nvisy-server/src/handler/index.ts +++ /dev/null @@ -1,23 +0,0 @@ -import type { OpenAPIHono } from "@hono/zod-openapi"; -import { getLogger } from "@logtape/logtape"; -import type { ServerConfig } from "../config.js"; -import { registerGraphHandler } from "./graphs.js"; -import { registerHealthHandler } from "./health.js"; -import { registerOpenApiHandler } from "./openapi.js"; - -const logger = getLogger(["nvisy", "server"]); - -/** - * Register all route handlers on the given OpenAPIHono app. - * - * Registration order matters: health and graph handlers are registered - * first so that the OpenAPI spec includes all routes. - */ -export function registerHandlers(app: OpenAPIHono, config: ServerConfig): void { - logger.debug("Registering health handlers"); - registerHealthHandler(app); - logger.debug("Registering graph handlers"); - registerGraphHandler(app); - logger.debug("Registering OpenAPI handlers"); - registerOpenApiHandler(app, config); -} diff --git a/packages/nvisy-server/src/handler/openapi.ts b/packages/nvisy-server/src/handler/openapi.ts deleted file mode 100644 index 0ad912e..0000000 --- a/packages/nvisy-server/src/handler/openapi.ts +++ /dev/null @@ -1,45 +0,0 @@ -import type { OpenAPIHono } from "@hono/zod-openapi"; -import { getLogger } from "@logtape/logtape"; -import { Scalar } from "@scalar/hono-api-reference"; -import type { ServerConfig } from "../config.js"; - -const logger = getLogger(["nvisy", "server"]); - -/** Path where the OpenAPI 3.1 JSON specification is served. */ -const SPEC_PATH = "/openapi.json"; - -/** Path where the Scalar API reference UI is served. */ -const DOCS_PATH = "/docs"; - -/** - * OpenAPI spec and Scalar API reference endpoints. - * - * GET /openapi.json — OpenAPI 3.1 JSON spec - * GET /docs — Scalar API reference UI - */ -export function registerOpenApiHandler( - app: OpenAPIHono, - config: ServerConfig, -): void { - app.doc(SPEC_PATH, { - openapi: "3.1.0", - info: { - title: "Nvisy Runtime", - version: "0.1.0", - description: "Stateless execution worker for Nvisy graph pipelines.", - license: { name: "Apache-2.0" }, - }, - servers: [{ url: `http://${config.host}:${config.port}` }], - }); - - app.get( - DOCS_PATH, - Scalar({ - url: SPEC_PATH, - theme: "default", - }), - ); - - logger.debug(" GET {spec}", { spec: SPEC_PATH }); - logger.debug(" GET {docs}", { docs: DOCS_PATH }); -} diff --git a/packages/nvisy-server/src/main.ts b/packages/nvisy-server/src/main.ts deleted file mode 100644 index 326d634..0000000 --- a/packages/nvisy-server/src/main.ts +++ /dev/null @@ -1,36 +0,0 @@ -/** - * Server entry point. - * - * Reads configuration from environment variables, configures - * logging via LogTape, builds the Hono app, and starts the - * HTTP server. 
- * - * @module - */ - -import { getLogger } from "@logtape/logtape"; -import { createApp, startServer } from "./app.js"; -import { configureLogging, loadConfig } from "./config.js"; - -const config = loadConfig(); -await configureLogging(config); - -const { app, injectWebSocket } = createApp(config); - -const { close } = startServer({ - app, - host: config.host, - port: config.port, - injectWebSocket, -}); - -const logger = getLogger(["nvisy", "server"]); - -function shutdown() { - logger.info("Shutting down"); - close(); - process.exit(0); -} - -process.on("SIGINT", shutdown); -process.on("SIGTERM", shutdown); diff --git a/packages/nvisy-server/src/middleware/error-handler.ts b/packages/nvisy-server/src/middleware/error-handler.ts deleted file mode 100644 index 7775fd2..0000000 --- a/packages/nvisy-server/src/middleware/error-handler.ts +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Global error and not-found handlers. - * - * - {@link createErrorHandler} — registered via `app.onError` - * - {@link createNotFoundHandler} — registered via `app.notFound` - * - * Both return the unified {@link ErrorResponse} JSON envelope so every - * error response — 4xx, 5xx, thrown `HTTPException`, unmatched route — - * has the same shape. - * - * `requestId` appears in log output automatically via LogTape implicit - * context (set by the `withContext` middleware in `index.ts`). It is - * still read from the Hono context for inclusion in the JSON body. - */ - -import { getLogger } from "@logtape/logtape"; -import type { Context, ErrorHandler, NotFoundHandler } from "hono"; -import { HTTPException } from "hono/http-exception"; - -/** Unified JSON error envelope returned by all error responses. */ -interface ErrorResponse { - status: number; - error: string; - requestId?: string; - stack?: string; -} - -const logger = getLogger(["nvisy", "server"]); - -/** - * Create the global `app.onError` handler. - * - * - `HTTPException` → logs at warn, returns the exception's status + message. - * - Anything else → logs at error, returns 500 with a generic message - * (or the real message + stack trace in development). - */ -export function createErrorHandler(opts: { - isDevelopment: boolean; -}): ErrorHandler { - return (error: Error, c: Context): Response => { - const requestId = c.get("requestId") as string | undefined; - - if (error instanceof HTTPException) { - const status = error.status; - logger.warn("HTTP {status} on {method} {path}: {message}", { - status, - method: c.req.method, - path: c.req.path, - message: error.message, - }); - const body: ErrorResponse = { - status, - error: error.message, - ...(requestId && { requestId }), - }; - return c.json(body, status); - } - - logger.error("Unhandled error on {method} {path}: {message}", { - method: c.req.method, - path: c.req.path, - message: error.message, - stack: error.stack, - }); - - const body: ErrorResponse = { - status: 500, - error: opts.isDevelopment ? error.message : "Internal server error", - ...(requestId && { requestId }), - ...(opts.isDevelopment && error.stack && { stack: error.stack }), - }; - return c.json(body, 500); - }; -} - -/** - * Create the global `app.notFound` handler. - * - * - **development** — includes the method and path in the error message. - * - **production** — returns a generic "Not found" message. 
- */ -export function createNotFoundHandler(opts: { - isDevelopment: boolean; -}): NotFoundHandler { - return (c) => { - const requestId = c.get("requestId") as string | undefined; - const body: ErrorResponse = { - status: 404, - error: opts.isDevelopment - ? `Not found: ${c.req.method} ${c.req.path}` - : "Not found", - ...(requestId && { requestId }), - }; - return c.json(body, 404); - }; -} diff --git a/packages/nvisy-server/src/middleware/hono-context.ts b/packages/nvisy-server/src/middleware/hono-context.ts deleted file mode 100644 index f9fad1e..0000000 --- a/packages/nvisy-server/src/middleware/hono-context.ts +++ /dev/null @@ -1,23 +0,0 @@ -import type { Engine } from "@nvisy/runtime"; -import type { Context, MiddlewareHandler } from "hono"; - -const ENGINE_KEY = "engine" as const; - -declare module "hono" { - interface ContextVariableMap { - [ENGINE_KEY]: Engine; - } -} - -/** Middleware that injects the Engine into Hono context. */ -export function engineMiddleware(engine: Engine): MiddlewareHandler { - return async (c, next) => { - c.set(ENGINE_KEY, engine); - await next(); - }; -} - -/** Retrieve the Engine from Hono context. */ -export function getEngine(c: Context): Engine { - return c.get(ENGINE_KEY); -} diff --git a/packages/nvisy-server/src/middleware/index.ts b/packages/nvisy-server/src/middleware/index.ts deleted file mode 100644 index 57fce7a..0000000 --- a/packages/nvisy-server/src/middleware/index.ts +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Global middleware registration. - * - * Middleware is applied in declaration order. The first three entries - * establish the per-request foundations that everything else relies on: - * - * 1. **OTel instrumentation** — wraps the request in an OpenTelemetry span. - * 2. **Request ID** — generates (or reads from `X-Request-Id`) a UUID. - * 3. **Implicit log context** — propagates `requestId` into every LogTape - * log call for the remainder of the request via `withContext`. - * - * After that, the request logger and standard security / transport - * middleware are registered. - */ - -import { httpInstrumentationMiddleware } from "@hono/otel"; -import type { OpenAPIHono } from "@hono/zod-openapi"; -import { withContext } from "@logtape/logtape"; -import { bodyLimit } from "hono/body-limit"; -import { compress } from "hono/compress"; -import { cors } from "hono/cors"; -import { csrf } from "hono/csrf"; -import { etag } from "hono/etag"; -import { requestId } from "hono/request-id"; -import { secureHeaders } from "hono/secure-headers"; -import { timeout } from "hono/timeout"; -import { timing } from "hono/timing"; -import type { ServerConfig } from "../config.js"; -import { createErrorHandler, createNotFoundHandler } from "./error-handler.js"; -import { createRequestLogger } from "./request-logger.js"; - -export { engineMiddleware, getEngine } from "./hono-context.js"; - -/** Register all global middleware on the given OpenAPIHono app. 
*/ -export function registerMiddleware(app: OpenAPIHono, config: ServerConfig) { - app.onError(createErrorHandler({ isDevelopment: config.isDevelopment })); - app.notFound(createNotFoundHandler({ isDevelopment: config.isDevelopment })); - - app.use("*", httpInstrumentationMiddleware()); - app.use("*", requestId()); - app.use("*", async (c, next) => { - await withContext({ requestId: c.get("requestId") }, next); - }); - app.use("*", createRequestLogger({ isDevelopment: config.isDevelopment })); - - app.use("*", secureHeaders()); - app.use("*", csrf()); - app.use("*", compress()); - app.use("*", etag()); - app.use("*", bodyLimit({ maxSize: config.bodyLimitBytes })); - app.use("*", timeout(config.requestTimeoutMs)); - app.use("*", timing()); - app.use("*", cors({ origin: config.corsOrigin })); -} diff --git a/packages/nvisy-server/src/middleware/request-logger.ts b/packages/nvisy-server/src/middleware/request-logger.ts deleted file mode 100644 index 93326c5..0000000 --- a/packages/nvisy-server/src/middleware/request-logger.ts +++ /dev/null @@ -1,36 +0,0 @@ -/** - * HTTP request logger backed by {@link https://jsr.io/@logtape/hono | @logtape/hono}. - * - * - **development** — human-readable one-liner: - * `GET /api/v1/graphs → 200 (1.2ms)` - * - **production** — structured object (consumed by `jsonLinesFormatter`): - * `{ method, path, status, responseTime }` - * - * The per-request `requestId` is attached automatically via LogTape - * implicit context — see the `withContext` middleware in `index.ts`. - */ - -import { type HonoContext, honoLogger } from "@logtape/hono"; -import type { MiddlewareHandler } from "hono"; - -/** Human-readable request summary for development console output. */ -const devFormat = (c: HonoContext, ms: number): string => - `${c.req.method} ${c.req.path} → ${c.res.status} (${ms.toFixed(1)}ms)`; - -/** Structured request properties for JSON Lines production output. */ -const prodFormat = (c: HonoContext, ms: number) => ({ - method: c.req.method, - path: c.req.path, - status: c.res.status, - responseTime: ms.toFixed(1), -}); - -/** Create request-logging middleware appropriate for the environment. */ -export function createRequestLogger(opts: { - isDevelopment: boolean; -}): MiddlewareHandler { - return honoLogger({ - category: ["nvisy", "server"], - format: opts.isDevelopment ? devFormat : prodFormat, - }); -} diff --git a/packages/nvisy-server/src/service/engine-factory.ts b/packages/nvisy-server/src/service/engine-factory.ts deleted file mode 100644 index cc9ea3c..0000000 --- a/packages/nvisy-server/src/service/engine-factory.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { getLogger } from "@logtape/logtape"; -import { aiPlugin } from "@nvisy/plugin-ai"; -import { nosqlPlugin } from "@nvisy/plugin-nosql"; -import { objectPlugin } from "@nvisy/plugin-object"; -import { pandocPlugin } from "@nvisy/plugin-pandoc"; -import { queuePlugin } from "@nvisy/plugin-queue"; -import { sqlPlugin } from "@nvisy/plugin-sql"; -import { tesseractPlugin } from "@nvisy/plugin-tesseract"; -import { vectorPlugin } from "@nvisy/plugin-vector"; -import { Engine } from "@nvisy/runtime"; - -const logger = getLogger(["nvisy", "engine"]); - -/** Create and initialize the Engine with all standard plugins. 
*/ -export function createEngine(): Engine { - logger.info("Initializing engine"); - - try { - const engine = new Engine() - .register(aiPlugin) - .register(nosqlPlugin) - .register(objectPlugin) - .register(tesseractPlugin) - .register(pandocPlugin) - .register(queuePlugin) - .register(sqlPlugin) - .register(vectorPlugin); - - const { actions, providers, streams, loaders, datatypes } = engine.schema; - logger.info("Engine initialized", { - providers: providers.length, - streams, - actions: actions.length, - loaders, - datatypes, - }); - - return engine; - } catch (error) { - logger.fatal("Failed to initialize engine: {error}", { error }); - throw error; - } -} diff --git a/packages/nvisy-server/src/service/index.ts b/packages/nvisy-server/src/service/index.ts deleted file mode 100644 index e013301..0000000 --- a/packages/nvisy-server/src/service/index.ts +++ /dev/null @@ -1 +0,0 @@ -export { createEngine } from "./engine-factory.js"; diff --git a/packages/nvisy-server/tsconfig.json b/packages/nvisy-server/tsconfig.json deleted file mode 100644 index be2bcc4..0000000 --- a/packages/nvisy-server/tsconfig.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "extends": "../../tsconfig.json", - "compilerOptions": { - /* Emit */ - "outDir": "./dist", - "rootDir": "./src", - "composite": true - }, - /* Scope */ - "include": ["src/**/*"], - "exclude": ["node_modules", "dist"], - "references": [ - { "path": "../nvisy-core" }, - { "path": "../nvisy-runtime" }, - { "path": "../nvisy-plugin-ai" }, - { "path": "../nvisy-plugin-sql" }, - { "path": "../nvisy-plugin-vector" } - ] -} diff --git a/packages/nvisy-server/tsup.config.ts b/packages/nvisy-server/tsup.config.ts deleted file mode 100644 index d95ed71..0000000 --- a/packages/nvisy-server/tsup.config.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { defineConfig } from "tsup"; - -export default defineConfig({ - /* Entry */ - entry: ["src/main.ts"], - format: ["esm"], - - /* Output */ - outDir: "dist", - dts: false, - sourcemap: true, - clean: true, - - /* Optimization */ - splitting: false, - treeshake: true, - skipNodeModulesBundle: true, - - /* Environment */ - platform: "node", - target: "es2024", -}); diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c64a2f2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,7 @@ +[project] +name = "nvisy-workspace" +version = "0.1.0" +requires-python = ">=3.11" + +[tool.uv.workspace] +members = ["packages/nvisy-ai"] diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..73cb934 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] diff --git a/tsconfig.json b/tsconfig.json deleted file mode 100644 index 0779260..0000000 --- a/tsconfig.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "compilerOptions": { - /* Language & Environment */ - "target": "ES2024", - "lib": ["ES2024"], - - /* Modules */ - "module": "NodeNext", - "moduleResolution": "nodenext", - "resolveJsonModule": true, - "isolatedModules": true, - "verbatimModuleSyntax": true, - - /* Type Checking */ - "strict": true, - "noUnusedLocals": true, - "noUnusedParameters": true, - "noImplicitReturns": true, - "noFallthroughCasesInSwitch": true, - "noUncheckedIndexedAccess": true, - "noImplicitOverride": true, - "noPropertyAccessFromIndexSignature": true, - "exactOptionalPropertyTypes": true, - - /* Emit */ - "declaration": true, - "declarationMap": true, - "sourceMap": true, - - /* Interop */ - "esModuleInterop": true, - 
"forceConsistentCasingInFileNames": true, - "skipLibCheck": true - }, - "exclude": ["node_modules", "dist"] -} diff --git a/vitest.config.ts b/vitest.config.ts deleted file mode 100644 index 495bd0b..0000000 --- a/vitest.config.ts +++ /dev/null @@ -1,48 +0,0 @@ -import { defineConfig } from "vitest/config"; - -/* Resolve the "source" export condition so workspace packages point to - ./src/index.ts instead of ./dist/index.js. Vitest runs in Vite's SSR - environment, so ssr.resolve.conditions is required here — the top-level - resolve.conditions only applies to the client environment. */ - -export default defineConfig({ - /* Module Resolution */ - ssr: { - resolve: { - conditions: ["source", "import", "default"], - }, - }, - - test: { - /* Environment */ - globals: true, - environment: "node", - - /* Discovery */ - include: [ - "packages/*/src/**/*.{test,spec}.ts", - "packages/*/test/**/*.{test,spec}.ts", - ], - exclude: ["node_modules", "dist", "**/*.d.ts"], - - /* Coverage */ - coverage: { - provider: "v8", - reporter: ["text", "json", "html", "lcov"], - exclude: [ - "node_modules/", - "dist/", - "coverage/", - "**/*.d.ts", - "**/*.config.*", - "**/index.ts", - ], - }, - - /* Timeouts */ - testTimeout: 15000, - }, - - /* Transform */ - esbuild: { target: "es2024" }, -}); From 1c19a5041fd6556f827807695a977d5ce7a9ff30 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Tue, 10 Feb 2026 10:20:16 +0100 Subject: [PATCH 07/17] feat: consolidate server, add RedactionContext, schema feature, utoipa, JSON patterns, nvisy-exif, update CI - Move state.rs and config.rs into service/, update all imports - Add RedactionContext type to nvisy-core for per-request redaction config - Add schemars schema feature with JsonSchema derives on all server-facing types - Add utoipa OpenAPI annotations to all 15 handler endpoints with SwaggerUI at /swagger-ui - Refactor NvisyError enum into Error struct with ErrorKind, source field, Result alias - Clean crate roots: move standalone .rs files into module directories - Load detection patterns from JSON (assets/patterns.json) instead of hardcoded statics - Create nvisy-exif Python package for EXIF metadata reading/stripping - Bump deps: petgraph 0.8, infer 0.19, utoipa-swagger-ui 9, add schemars 1 - Replace TypeScript CI with Rust workflows, update Dependabot for cargo/pip - Update Dockerfile for multi-stage Rust build - Update Makefile with cargo targets - Rewrite README and docs for Rust codebase Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- .cargo/config.toml | 5 - .github/dependabot.yml | 35 +- .github/workflows/build.yml | 143 ++++--- .github/workflows/security.yml | 32 +- Cargo.lock | 188 +++++---- Cargo.toml | 11 +- Makefile | 69 ++-- README.md | 40 +- crates/nvisy-core/Cargo.toml | 6 + crates/nvisy-core/README.md | 3 + .../nvisy-core/src/{data.rs => data/mod.rs} | 2 + crates/nvisy-core/src/datatypes/audit.rs | 1 + crates/nvisy-core/src/datatypes/blob.rs | 3 + crates/nvisy-core/src/datatypes/document.rs | 1 + crates/nvisy-core/src/datatypes/entity.rs | 3 + crates/nvisy-core/src/datatypes/image.rs | 2 + crates/nvisy-core/src/datatypes/mod.rs | 1 + crates/nvisy-core/src/datatypes/policy.rs | 2 + crates/nvisy-core/src/datatypes/redaction.rs | 1 + .../src/datatypes/redaction_context.rs | 133 +++++++ crates/nvisy-core/src/documents/elements.rs | 6 + crates/nvisy-core/src/documents/ontology.rs | 2 + crates/nvisy-core/src/errors/mod.rs | 166 ++++---- crates/nvisy-core/src/lib.rs | 4 + .../src/{plugin.rs => plugin/mod.rs} | 0 
.../src/{registry.rs => registry/mod.rs} | 0 .../nvisy-core/src/{types.rs => types/mod.rs} | 4 + crates/nvisy-detect/README.md | 3 + crates/nvisy-detect/assets/patterns.json | 74 ++++ .../src/actions/detect_checksum.rs | 2 +- .../nvisy-detect/src/actions/detect_regex.rs | 4 +- crates/nvisy-detect/src/lib.rs | 4 + crates/nvisy-detect/src/patterns/api_key.rs | 39 -- .../nvisy-detect/src/patterns/credit_card.rs | 38 -- crates/nvisy-detect/src/patterns/email.rs | 12 - .../nvisy-detect/src/patterns/ip_address.rs | 21 - crates/nvisy-detect/src/patterns/mod.rs | 90 +++-- crates/nvisy-detect/src/patterns/phone.rs | 12 - crates/nvisy-detect/src/patterns/ssn.rs | 32 -- .../nvisy-detect/src/patterns/validators.rs | 42 ++ crates/nvisy-engine/Cargo.toml | 6 + crates/nvisy-engine/README.md | 3 + .../{connections.rs => connections/mod.rs} | 1 + crates/nvisy-engine/src/executor/runner.rs | 2 + crates/nvisy-engine/src/lib.rs | 4 + .../src/{policies.rs => policies/mod.rs} | 0 .../nvisy-engine/src/{runs.rs => runs/mod.rs} | 4 + .../src/{schema.rs => schema/mod.rs} | 5 + crates/nvisy-object/README.md | 3 + .../src/{client.rs => client/mod.rs} | 0 crates/nvisy-object/src/lib.rs | 4 + crates/nvisy-python/README.md | 3 + .../src/{actions.rs => actions/mod.rs} | 0 .../src/{bridge.rs => bridge/mod.rs} | 0 .../src/{error.rs => error/mod.rs} | 6 +- crates/nvisy-python/src/lib.rs | 4 + .../nvisy-python/src/{ner.rs => ner/mod.rs} | 16 +- .../src/{provider.rs => provider/mod.rs} | 0 crates/nvisy-server/Cargo.toml | 7 +- crates/nvisy-server/README.md | 3 + .../nvisy-server/src/{app.rs => app/mod.rs} | 20 +- .../src/{routes => handler}/audit.rs | 46 ++- crates/nvisy-server/src/handler/graphs.rs | 121 ++++++ .../src/{routes => handler}/health.rs | 30 +- crates/nvisy-server/src/handler/mod.rs | 34 ++ .../src/{routes => handler}/policies.rs | 128 +++++-- .../src/{routes => handler}/redact.rs | 37 +- crates/nvisy-server/src/main.rs | 11 +- crates/nvisy-server/src/routes/graphs.rs | 71 ---- crates/nvisy-server/src/routes/mod.rs | 5 - crates/nvisy-server/src/schemas/mod.rs | 1 - .../nvisy-server/src/service/audit_store.rs | 2 +- .../nvisy-server/src/{ => service}/config.rs | 0 crates/nvisy-server/src/service/mod.rs | 28 ++ .../nvisy-server/src/service/policy_store.rs | 2 +- .../nvisy-server/src/{ => service}/state.rs | 4 +- docker/Dockerfile | 76 ++-- docs/ARCHITECTURE.md | 362 +++++------------- docs/DEVELOPMENT.md | 153 ++------ docs/README.md | 85 ++-- packages/nvisy-exif/pyproject.toml | 14 + .../nvisy-exif/src/nvisy_exif/__init__.py | 1 + packages/nvisy-exif/src/nvisy_exif/exif.py | 58 +++ pyproject.toml | 2 +- rust-toolchain.toml | 3 - rustfmt.toml | 6 + 86 files changed, 1509 insertions(+), 1098 deletions(-) delete mode 100644 .cargo/config.toml create mode 100644 crates/nvisy-core/README.md rename crates/nvisy-core/src/{data.rs => data/mod.rs} (94%) create mode 100644 crates/nvisy-core/src/datatypes/redaction_context.rs rename crates/nvisy-core/src/{plugin.rs => plugin/mod.rs} (100%) rename crates/nvisy-core/src/{registry.rs => registry/mod.rs} (100%) rename crates/nvisy-core/src/{types.rs => types/mod.rs} (81%) create mode 100644 crates/nvisy-detect/README.md create mode 100644 crates/nvisy-detect/assets/patterns.json delete mode 100644 crates/nvisy-detect/src/patterns/api_key.rs delete mode 100644 crates/nvisy-detect/src/patterns/credit_card.rs delete mode 100644 crates/nvisy-detect/src/patterns/email.rs delete mode 100644 crates/nvisy-detect/src/patterns/ip_address.rs delete mode 100644 
crates/nvisy-detect/src/patterns/phone.rs delete mode 100644 crates/nvisy-detect/src/patterns/ssn.rs create mode 100644 crates/nvisy-detect/src/patterns/validators.rs create mode 100644 crates/nvisy-engine/README.md rename crates/nvisy-engine/src/{connections.rs => connections/mod.rs} (87%) rename crates/nvisy-engine/src/{policies.rs => policies/mod.rs} (100%) rename crates/nvisy-engine/src/{runs.rs => runs/mod.rs} (95%) rename crates/nvisy-engine/src/{schema.rs => schema/mod.rs} (91%) create mode 100644 crates/nvisy-object/README.md rename crates/nvisy-object/src/{client.rs => client/mod.rs} (100%) create mode 100644 crates/nvisy-python/README.md rename crates/nvisy-python/src/{actions.rs => actions/mod.rs} (100%) rename crates/nvisy-python/src/{bridge.rs => bridge/mod.rs} (100%) rename crates/nvisy-python/src/{error.rs => error/mod.rs} (66%) rename crates/nvisy-python/src/{ner.rs => ner/mod.rs} (97%) rename crates/nvisy-python/src/{provider.rs => provider/mod.rs} (100%) create mode 100644 crates/nvisy-server/README.md rename crates/nvisy-server/src/{app.rs => app/mod.rs} (64%) rename crates/nvisy-server/src/{routes => handler}/audit.rs (54%) create mode 100644 crates/nvisy-server/src/handler/graphs.rs rename crates/nvisy-server/src/{routes => handler}/health.rs (52%) create mode 100644 crates/nvisy-server/src/handler/mod.rs rename crates/nvisy-server/src/{routes => handler}/policies.rs (54%) rename crates/nvisy-server/src/{routes => handler}/redact.rs (51%) delete mode 100644 crates/nvisy-server/src/routes/graphs.rs delete mode 100644 crates/nvisy-server/src/routes/mod.rs delete mode 100644 crates/nvisy-server/src/schemas/mod.rs rename crates/nvisy-server/src/{ => service}/config.rs (100%) rename crates/nvisy-server/src/{ => service}/state.rs (77%) create mode 100644 packages/nvisy-exif/pyproject.toml create mode 100644 packages/nvisy-exif/src/nvisy_exif/__init__.py create mode 100644 packages/nvisy-exif/src/nvisy_exif/exif.py delete mode 100644 rust-toolchain.toml create mode 100644 rustfmt.toml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index af95132..0000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,5 +0,0 @@ -[target.x86_64-apple-darwin] -rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"] - -[target.aarch64-apple-darwin] -rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"] diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 576670a..ea615b7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,7 +1,6 @@ version: 2 updates: - # Version updates for npm dependencies - - package-ecosystem: "npm" + - package-ecosystem: "cargo" directory: "/" schedule: interval: "weekly" @@ -13,18 +12,40 @@ updates: - "chore" commit-message: prefix: "chore(deps)" - prefix-development: "chore(deps-dev)" - rebase-strategy: "auto" - versioning-strategy: "auto" groups: - npm-dependencies: + rust-dependencies: patterns: - "*" update-types: - "minor" - "patch" - # Version updates for GitHub Actions + - package-ecosystem: "pip" + directory: "/packages/nvisy-ai" + schedule: + interval: "weekly" + timezone: "Europe/Berlin" + day: "monday" + time: "04:00" + open-pull-requests-limit: 3 + labels: + - "chore" + commit-message: + prefix: "chore(deps-py)" + + - package-ecosystem: "pip" + directory: "/packages/nvisy-exif" + schedule: + interval: "weekly" + timezone: "Europe/Berlin" + day: "monday" + time: "04:00" + open-pull-requests-limit: 3 + labels: + - "chore" + commit-message: + prefix: 
"chore(deps-py)" + - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 769e7e8..c55ade4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,72 +4,87 @@ on: push: branches: [main] paths: - - "packages/**" - - "package.json" - - "package-lock.json" - - "tsconfig.json" - - "vitest.config.ts" - - "biome.json" + - "crates/**" + - "Cargo.toml" + - "Cargo.lock" - ".github/workflows/build.yml" pull_request: branches: [main] paths: - - "packages/**" - - "package.json" - - "package-lock.json" - - "tsconfig.json" - - "vitest.config.ts" - - "biome.json" + - "crates/**" + - "Cargo.toml" + - "Cargo.lock" - ".github/workflows/build.yml" workflow_dispatch: env: - NODE_VERSION: 22 + CARGO_TERM_COLOR: always + PYO3_USE_ABI3_FORWARD_COMPATIBILITY: 1 concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: - lint: - name: Lint + check: + name: Check runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v6 - - - name: Install Node.js - uses: actions/setup-node@v6 - with: - node-version: ${{ env.NODE_VERSION }} - cache: npm + uses: actions/checkout@v4 - - name: Install dependencies - run: npm ci + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable - - name: Check formatting & linting - run: npx biome check . + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" - check: - name: Typecheck + - name: Cache cargo registry and build + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-check-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo-check- + + - name: Cargo check + run: cargo check --workspace + + clippy: + name: Clippy runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + components: clippy - - name: Install Node.js - uses: actions/setup-node@v6 + - name: Install Python + uses: actions/setup-python@v5 with: - node-version: ${{ env.NODE_VERSION }} - cache: npm + python-version: "3.11" - - name: Install dependencies - run: npm ci + - name: Cache cargo registry and build + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-clippy-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo-clippy- - - name: Typecheck - run: npx tsc -b packages/*/tsconfig.json + - name: Clippy + run: cargo clippy --workspace -- -D warnings test: name: Test @@ -78,19 +93,28 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@v4 - - name: Install Node.js - uses: actions/setup-node@v6 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Install Python + uses: actions/setup-python@v5 with: - node-version: ${{ env.NODE_VERSION }} - cache: npm + python-version: "3.11" - - name: Install dependencies - run: npm ci + - name: Cache cargo registry and build + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-test-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo-test- - name: Run tests - run: npx vitest run --coverage + run: cargo test --workspace build: name: Build @@ -99,16 +123,25 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@v4 - - name: 
Install Node.js - uses: actions/setup-node@v6 - with: - node-version: ${{ env.NODE_VERSION }} - cache: npm + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable - - name: Install dependencies - run: npm ci + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" - - name: Build - run: npm run build --workspaces --if-present + - name: Cache cargo registry and build + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-build-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo-build- + + - name: Build release + run: cargo build --release diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 4069d7c..660fc76 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -4,16 +4,16 @@ on: push: branches: [main] paths: - - "packages/**" - - "package.json" - - "package-lock.json" + - "crates/**" + - "Cargo.toml" + - "Cargo.lock" - ".github/workflows/security.yml" pull_request: branches: [main] paths: - - "packages/**" - - "package.json" - - "package-lock.json" + - "crates/**" + - "Cargo.toml" + - "Cargo.lock" - ".github/workflows/security.yml" schedule: # Monday at 06:00 UTC @@ -27,9 +27,6 @@ concurrency: permissions: contents: read -env: - NODE_VERSION: 22 - jobs: audit: name: Audit @@ -37,16 +34,13 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@v4 - - name: Install Node.js - uses: actions/setup-node@v6 - with: - node-version: ${{ env.NODE_VERSION }} - cache: npm + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable - - name: Install dependencies - run: npm ci + - name: Install cargo-audit + run: cargo install cargo-audit - - name: Audit dependencies - run: npm audit --omit=dev + - name: Run cargo audit + run: cargo audit diff --git a/Cargo.lock b/Cargo.lock index b2342d5..b226491 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -523,42 +523,13 @@ dependencies = [ "tracing", ] -[[package]] -name = "axum" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" -dependencies = [ - "async-trait", - "axum-core 0.4.5", - "bytes", - "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "itoa", - "matchit 0.7.3", - "memchr", - "mime", - "percent-encoding", - "pin-project-lite", - "rustversion", - "serde", - "serde_json", - "serde_path_to_error", - "sync_wrapper", - "tower", - "tower-layer", - "tower-service", -] - [[package]] name = "axum" version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" dependencies = [ - "axum-core 0.5.6", + "axum-core", "axum-macros", "bytes", "form_urlencoded", @@ -569,7 +540,7 @@ dependencies = [ "hyper 1.8.1", "hyper-util", "itoa", - "matchit 0.8.4", + "matchit", "memchr", "mime", "percent-encoding", @@ -586,26 +557,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "axum-core" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" -dependencies = [ - "async-trait", - "bytes", - "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "mime", - "pin-project-lite", - "rustversion", - "sync_wrapper", - "tower-layer", - "tower-service", -] - [[package]] name = "axum-core" version = 
"0.5.6" @@ -844,12 +795,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - [[package]] name = "crypto-bigint" version = "0.4.9" @@ -940,6 +885,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + [[package]] name = "ecdsa" version = "0.14.8" @@ -1028,8 +979,8 @@ version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ - "crc32fast", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -1038,6 +989,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "foldhash" version = "0.2.0" @@ -1186,6 +1143,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + [[package]] name = "hashbrown" version = "0.16.1" @@ -1194,7 +1160,7 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.2.0", ] [[package]] @@ -1530,7 +1496,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.16.1", "serde", "serde_core", ] @@ -1546,9 +1512,9 @@ dependencies = [ [[package]] name = "infer" -version = "0.16.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc150e5ce2330295b8616ce0e3f53250e53af31759a9dbedad1621ba29151847" +checksum = "a588916bfdfd92e71cacef98a63d9b1f0d74d6599980d11894290e7ddefffcf7" dependencies = [ "cfb", ] @@ -1615,7 +1581,7 @@ version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" dependencies = [ - "hashbrown", + "hashbrown 0.16.1", ] [[package]] @@ -1627,12 +1593,6 @@ dependencies = [ "regex-automata", ] -[[package]] -name = "matchit" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" - [[package]] name = "matchit" version = "0.8.4" @@ -1743,6 +1703,7 @@ dependencies = [ "bytes", "chrono", "infer", + "schemars", "serde", "serde_json", "thiserror", @@ -1774,6 +1735,7 @@ dependencies = [ "nvisy-core", "petgraph", "rand", + "schemars", "serde", "serde_json", "thiserror", @@ -1820,13 +1782,14 @@ name = "nvisy-server" version = "0.1.0" dependencies = [ "anyhow", - "axum 0.8.8", + "axum", "chrono", "nvisy-core", "nvisy-detect", "nvisy-engine", 
"nvisy-object", "nvisy-python", + "schemars", "serde", "serde_json", "thiserror", @@ -1886,12 +1849,14 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "petgraph" -version = "0.7.1" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", + "hashbrown 0.15.5", "indexmap", + "serde", ] [[package]] @@ -2072,6 +2037,26 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "regex" version = "1.12.3" @@ -2280,6 +2265,34 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "bytes", + "chrono", + "dyn-clone", + "ref-cast", + "schemars_derive", + "serde", + "serde_json", + "uuid", +] + +[[package]] +name = "schemars_derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "sct" version = "0.7.1" @@ -2363,6 +2376,17 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_json" version = "1.0.149" @@ -2892,15 +2916,16 @@ dependencies = [ "quote", "regex", "syn", + "uuid", ] [[package]] name = "utoipa-swagger-ui" -version = "8.1.0" +version = "9.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4b5ac679cc6dfc5ea3f2823b0291c777750ffd5e13b21137e0f7ac0e8f9617" +checksum = "d047458f1b5b65237c2f6dc6db136945667f40a7668627b3490b9513a3d43a55" dependencies = [ - "axum 0.7.9", + "axum", "base64", "mime_guess", "regex", @@ -3368,21 +3393,24 @@ dependencies = [ [[package]] name = "zip" -version = "2.4.2" +version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +checksum = "12598812502ed0105f607f941c386f43d441e00148fce9dec3ca5ffb0bde9308" dependencies = [ "arbitrary", "crc32fast", - "crossbeam-utils", - "displaydoc", "flate2", "indexmap", "memchr", - "thiserror", "zopfli", ] +[[package]] +name = "zlib-rs" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" + [[package]] name = "zmij" version = "1.0.20" diff --git a/Cargo.toml b/Cargo.toml index c5191cb..c883094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,8 +49,8 @@ 
tower = { version = "0.5", features = [] } tower-http = { version = "0.6", features = [] } # OpenAPI / Documentation -utoipa = { version = "5", features = [] } -utoipa-swagger-ui = { version = "8", features = [] } +utoipa = { version = "5", features = ["uuid"] } +utoipa-swagger-ui = { version = "9", features = [] } # Observability tracing = { version = "0.1", features = [] } @@ -73,10 +73,13 @@ bytes = { version = "1", features = ["serde"] } regex = { version = "1.0", features = [] } # Graph data structures -petgraph = { version = "0.7", features = [] } +petgraph = { version = "0.8", features = [] } # File type detection -infer = { version = "0.16", features = [] } +infer = { version = "0.19", features = [] } + +# JSON Schema generation +schemars = { version = "1", features = ["uuid1", "chrono04", "bytes1"] } # Python interop pyo3 = { version = "0.23", features = [] } diff --git a/Makefile b/Makefile index f740844..9ab97d1 100644 --- a/Makefile +++ b/Makefile @@ -5,48 +5,50 @@ ifneq (,$(wildcard ./.env)) export endif -# Shell-level logger (expands to a printf that runs in the shell). +export PYO3_USE_ABI3_FORWARD_COMPATIBILITY := 1 + define log printf "[%s] [MAKE] [$(MAKECMDGOALS)] $(1)\n" "$$(date '+%Y-%m-%d %H:%M:%S')" endef -WATCH_PATHS := $(foreach p,$(wildcard packages/*/dist),--watch-path=$(p)) - .PHONY: dev -dev: ## Starts build watchers and dev server concurrently. - @for pkg in packages/*/; do \ - npm run build:watch --workspace=$$pkg & \ - done; \ - node $(WATCH_PATHS) packages/nvisy-server/dist/main.js & \ - wait - -.PHONY: dev\:prod -dev\:prod: ## Starts dev server with production log level (info). - @for pkg in packages/*/; do \ - npm run build:watch --workspace=$$pkg & \ - done; \ - NODE_ENV=production node $(WATCH_PATHS) packages/nvisy-server/dist/main.js & \ - wait +dev: ## Starts cargo-watch for the server binary. + @cargo watch -x 'run -p nvisy-server' + +.PHONY: build +build: ## Builds all crates in release mode. + @$(call log,Building workspace...) + @cargo build --workspace --release + @$(call log,Build complete.) + +.PHONY: check +check: ## Runs cargo check on all crates. + @cargo check --workspace + +.PHONY: test +test: ## Runs all tests. + @cargo test --workspace + +.PHONY: lint +lint: ## Runs clippy and format check. + @$(call log,Running format check...) + @cargo fmt --all -- --check + @$(call log,Running clippy...) + @cargo clippy --workspace -- -D warnings + @$(call log,Lint passed.) + +.PHONY: fmt +fmt: ## Formats all Rust code. + @cargo fmt --all .PHONY: ci -ci: ## Runs all CI checks locally (lint, typecheck, test, build). - @$(call log,Running lint...) - @npx biome check . - @$(call log,Running typecheck...) - @npx tsc -b packages/*/tsconfig.json - @$(call log,Running tests...) - @npx vitest run --coverage - @$(call log,Running build...) - @npm run build --workspaces --if-present +ci: lint check test build ## Runs all CI checks locally. @$(call log,All CI checks passed!) .PHONY: clean -clean: ## Removes all build artifacts and node_modules. +clean: ## Removes build artifacts. @$(call log,Cleaning build artifacts...) - @npx tsc -b --clean packages/*/tsconfig.json - @rm -rf packages/*/dist - @$(call log,Removing node_modules...) - @rm -rf node_modules packages/*/node_modules package-lock.json + @cargo clean @$(call log,Clean complete.) .PHONY: docker @@ -54,3 +56,8 @@ docker: ## Builds the Docker image. @$(call log,Building Docker image...) @docker build -f docker/Dockerfile -t nvisy-runtime . @$(call log,Docker image built.) 
+ +.PHONY: help +help: ## Shows this help message. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-12s\033[0m %s\n", $$1, $$2}' diff --git a/README.md b/README.md index c4cfb55..7201a3e 100644 --- a/README.md +++ b/README.md @@ -2,31 +2,45 @@ [![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) -An open-source ETL platform purpose-built for LLM and AI data pipelines. +A data protection runtime for AI pipelines — detect, redact, and audit sensitive data across documents, images, and streams. -Nvisy Runtime treats AI data as a first-class citizen: embeddings, completions, -structured outputs, tool-call traces, images, audio, and fine-tuning datasets -all flow through typed, validated primitives with full lineage tracking. +Built in Rust with Python extensions for AI-powered detection. -## Packages +## Workspace -See [packages/](packages/README.md) for the full package listing and detailed descriptions. +``` +crates/ + nvisy-core/ Types, traits, plugin registry, error handling + nvisy-detect/ Regex patterns, policy evaluation, redaction actions + nvisy-engine/ DAG graph compiler and execution engine + nvisy-object/ Object storage connectors (S3) + nvisy-python/ Python interop for AI-powered NER via PyO3 + nvisy-server/ Axum HTTP server with REST API + +packages/ + nvisy-ai/ Python: LLM-based entity detection + nvisy-exif/ Python: EXIF metadata reading and stripping +``` ## Quick Start ```bash -npm install -npm run build +cargo build --workspace +cargo test --workspace +cargo run -p nvisy-server ``` -## Documentation +## Development -See [`docs/`](docs/) for architecture, intelligence capabilities, provider -design, and security documentation. +```bash +make dev # cargo-watch dev server +make ci # lint + check + test + build +make help # list all targets +``` -## Changelog +## Documentation -See [CHANGELOG.md](CHANGELOG.md) for release notes and version history. +See [`docs/`](docs/) for architecture and development documentation. ## License diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 4c0d3c7..8c51dbe 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -21,7 +21,13 @@ documentation = { workspace = true } all-features = true rustdoc-args = ["--cfg", "docsrs"] +[features] +schema = ["dep:schemars"] + [dependencies] +# JSON Schema generation +schemars = { workspace = true, optional = true } + # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-core/README.md b/crates/nvisy-core/README.md new file mode 100644 index 0000000..63e5fd2 --- /dev/null +++ b/crates/nvisy-core/README.md @@ -0,0 +1,3 @@ +# nvisy-core + +Foundational crate for the Nvisy runtime. Defines domain types, error types, the plugin trait system, and the action/provider registry that all other crates build on. diff --git a/crates/nvisy-core/src/data.rs b/crates/nvisy-core/src/data/mod.rs similarity index 94% rename from crates/nvisy-core/src/data.rs rename to crates/nvisy-core/src/data/mod.rs index 3a88ae9..440cb32 100644 --- a/crates/nvisy-core/src/data.rs +++ b/crates/nvisy-core/src/data/mod.rs @@ -8,6 +8,7 @@ use crate::datatypes::{ /// Common fields shared by all domain data items. 
#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct DataItem { pub id: Uuid, #[serde(skip_serializing_if = "Option::is_none")] @@ -44,6 +45,7 @@ impl Default for DataItem { /// Discriminated union of all data types that flow through DAG channels. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(tag = "_type", rename_all = "snake_case")] pub enum DataValue { Document(Document), diff --git a/crates/nvisy-core/src/datatypes/audit.rs b/crates/nvisy-core/src/datatypes/audit.rs index c293603..e4ae60f 100644 --- a/crates/nvisy-core/src/datatypes/audit.rs +++ b/crates/nvisy-core/src/datatypes/audit.rs @@ -6,6 +6,7 @@ use crate::types::{AuditAction, Metadata}; /// An immutable audit record tracking a data protection event. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Audit { #[serde(flatten)] pub data: DataItem, diff --git a/crates/nvisy-core/src/datatypes/blob.rs b/crates/nvisy-core/src/datatypes/blob.rs index 7d62bca..7dec1a9 100644 --- a/crates/nvisy-core/src/datatypes/blob.rs +++ b/crates/nvisy-core/src/datatypes/blob.rs @@ -4,6 +4,7 @@ use crate::data::DataItem; /// Content type information for a blob. #[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct BlobContentInfo { /// MIME type provided by the caller (e.g. from HTTP Content-Type header). #[serde(skip_serializing_if = "Option::is_none")] @@ -15,11 +16,13 @@ pub struct BlobContentInfo { /// A binary object from storage (file content + path + content type). #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Blob { #[serde(flatten)] pub data: DataItem, pub path: String, #[serde(with = "bytes_serde")] + #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] pub content: Bytes, pub provided: BlobContentInfo, } diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-core/src/datatypes/document.rs index 39dac9c..8906456 100644 --- a/crates/nvisy-core/src/datatypes/document.rs +++ b/crates/nvisy-core/src/datatypes/document.rs @@ -4,6 +4,7 @@ use crate::documents::elements::Element; /// A parsed human-readable text representation of a document. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Document { #[serde(flatten)] pub data: DataItem, diff --git a/crates/nvisy-core/src/datatypes/entity.rs b/crates/nvisy-core/src/datatypes/entity.rs index 3b9fec3..15fd76d 100644 --- a/crates/nvisy-core/src/datatypes/entity.rs +++ b/crates/nvisy-core/src/datatypes/entity.rs @@ -5,6 +5,7 @@ use crate::types::{DetectionMethod, EntityCategory}; /// Bounding box for image-based entity locations. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct BoundingBox { pub x: f64, pub y: f64, @@ -14,6 +15,7 @@ pub struct BoundingBox { /// Location of an entity within its source document. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct EntityLocation { pub start_offset: usize, pub end_offset: usize, @@ -27,6 +29,7 @@ pub struct EntityLocation { /// A detected sensitive data occurrence within a document. 
#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Entity { #[serde(flatten)] pub data: DataItem, diff --git a/crates/nvisy-core/src/datatypes/image.rs b/crates/nvisy-core/src/datatypes/image.rs index d8dc412..6da4815 100644 --- a/crates/nvisy-core/src/datatypes/image.rs +++ b/crates/nvisy-core/src/datatypes/image.rs @@ -4,10 +4,12 @@ use crate::data::DataItem; /// An image extracted from a document or provided directly. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct ImageData { #[serde(flatten)] pub data: DataItem, #[serde(with = "crate::datatypes::blob::bytes_serde")] + #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] pub image_data: Bytes, pub mime_type: String, #[serde(skip_serializing_if = "Option::is_none")] diff --git a/crates/nvisy-core/src/datatypes/mod.rs b/crates/nvisy-core/src/datatypes/mod.rs index 151a49e..eaac1ef 100644 --- a/crates/nvisy-core/src/datatypes/mod.rs +++ b/crates/nvisy-core/src/datatypes/mod.rs @@ -5,3 +5,4 @@ pub mod entity; pub mod image; pub mod policy; pub mod redaction; +pub mod redaction_context; diff --git a/crates/nvisy-core/src/datatypes/policy.rs b/crates/nvisy-core/src/datatypes/policy.rs index 96d2f3a..c36e44c 100644 --- a/crates/nvisy-core/src/datatypes/policy.rs +++ b/crates/nvisy-core/src/datatypes/policy.rs @@ -4,6 +4,7 @@ use crate::types::{EntityCategory, RedactionMethod}; /// A single rule within a redaction policy. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct PolicyRule { pub id: String, pub name: String, @@ -18,6 +19,7 @@ pub struct PolicyRule { /// A redaction policy containing rules. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Policy { #[serde(flatten)] pub data: DataItem, diff --git a/crates/nvisy-core/src/datatypes/redaction.rs b/crates/nvisy-core/src/datatypes/redaction.rs index 1a6582e..6bde50e 100644 --- a/crates/nvisy-core/src/datatypes/redaction.rs +++ b/crates/nvisy-core/src/datatypes/redaction.rs @@ -5,6 +5,7 @@ use crate::types::RedactionMethod; /// A redaction decision for a detected entity. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Redaction { #[serde(flatten)] pub data: DataItem, diff --git a/crates/nvisy-core/src/datatypes/redaction_context.rs b/crates/nvisy-core/src/datatypes/redaction_context.rs new file mode 100644 index 0000000..80906a3 --- /dev/null +++ b/crates/nvisy-core/src/datatypes/redaction_context.rs @@ -0,0 +1,133 @@ +use serde::{Deserialize, Serialize}; +use crate::types::{EntityCategory, RedactionMethod}; + +/// Per-entity-type override for the redaction method. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct EntityRedactionRule { + pub entity_type: String, + pub method: RedactionMethod, + #[serde(skip_serializing_if = "Option::is_none")] + pub replacement: Option<String>, +} + +/// Request-scoped description of what to redact. +/// +/// Acts as the per-request equivalent of a stored [`Policy`](super::policy::Policy), +/// specifying categories, entity types, confidence thresholds, and +/// redaction methods for a single redaction invocation. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct RedactionContext { + /// Entity categories to scan for. Empty = all. + #[serde(default)] + pub categories: Vec<EntityCategory>, + /// Specific entity type names (e.g. "ssn", "face", "address"). Empty = all within categories. + #[serde(default)] + pub entity_types: Vec<String>, + /// Per-entity-type overrides for redaction method. + #[serde(default)] + pub rules: Vec<EntityRedactionRule>, + /// Default method when no per-type rule matches. + #[serde(default = "default_method")] + pub default_method: RedactionMethod, + /// Minimum confidence (0.0-1.0). Below = ignored. + #[serde(default = "default_min_confidence")] + pub min_confidence: f64, + /// Enable image-based detection (faces, license plates). + #[serde(default)] + pub detect_images: bool, + /// Free-form labels (e.g. "gdpr-request"). + #[serde(default)] + pub labels: Vec<String>, +} + +fn default_method() -> RedactionMethod { + RedactionMethod::Mask +} + +fn default_min_confidence() -> f64 { + 0.5 +} + +impl Default for RedactionContext { + fn default() -> Self { + Self { + categories: Vec::new(), + entity_types: Vec::new(), + rules: Vec::new(), + default_method: RedactionMethod::Mask, + min_confidence: 0.5, + detect_images: false, + labels: Vec::new(), + } + } +} + +impl RedactionContext { + pub fn new() -> Self { + Self::default() + } + + pub fn with_categories(mut self, categories: Vec<EntityCategory>) -> Self { + self.categories = categories; + self + } + + pub fn with_entity_types(mut self, entity_types: Vec<String>) -> Self { + self.entity_types = entity_types; + self + } + + pub fn with_rule(mut self, rule: EntityRedactionRule) -> Self { + self.rules.push(rule); + self + } + + pub fn with_default_method(mut self, method: RedactionMethod) -> Self { + self.default_method = method; + self + } + + pub fn with_min_confidence(mut self, confidence: f64) -> Self { + self.min_confidence = confidence; + self + } + + pub fn with_detect_images(mut self, detect: bool) -> Self { + self.detect_images = detect; + self + } + + /// Return the redaction method for a given entity type. + /// + /// Checks per-type rules first, falls back to `default_method`. + pub fn method_for(&self, entity_type: &str) -> RedactionMethod { + self.rules + .iter() + .find(|r| r.entity_type == entity_type) + .map(|r| r.method) + .unwrap_or(self.default_method) + } + + /// Whether a detected entity should be processed given the context filters. + pub fn should_process( + &self, + category: EntityCategory, + entity_type: &str, + confidence: f64, + ) -> bool { + if confidence < self.min_confidence { + return false; + } + if !self.categories.is_empty() && !self.categories.contains(&category) { + return false; + } + if !self.entity_types.is_empty() + && !self.entity_types.iter().any(|t| t == entity_type) + { + return false; + } + true + } +} diff --git a/crates/nvisy-core/src/documents/elements.rs b/crates/nvisy-core/src/documents/elements.rs index 1e4f4a1..7218ea5 100644 --- a/crates/nvisy-core/src/documents/elements.rs +++ b/crates/nvisy-core/src/documents/elements.rs @@ -6,6 +6,7 @@ use crate::types::Metadata; /// An inline hyperlink within element text. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Link { pub text: String, pub url: String, @@ -14,6 +15,7 @@ pub struct Link { /// An inline formatting span within element text. 
#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct EmphasizedText { pub text: String, pub tag: String, @@ -21,6 +23,7 @@ pub struct EmphasizedText { /// A single cell within a table structure. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct TableCellData { pub row: usize, pub column: usize, @@ -31,6 +34,7 @@ pub struct TableCellData { /// Extraction / OCR provenance data. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct ElementProvenance { #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option<f64>, @@ -44,6 +48,7 @@ pub struct ElementProvenance { /// Structured key-value pair from a form. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct FormKeyValuePair { pub key: String, #[serde(skip_serializing_if = "Option::is_none")] @@ -57,6 +62,7 @@ pub struct FormKeyValuePair { /// Combines base element fields with optional type-specific fields /// (image, table, form, email) in a flat struct rather than inheritance. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Element { pub id: Uuid, #[serde(rename = "type")] diff --git a/crates/nvisy-core/src/documents/ontology.rs b/crates/nvisy-core/src/documents/ontology.rs index 75e6cfb..dae5dda 100644 --- a/crates/nvisy-core/src/documents/ontology.rs +++ b/crates/nvisy-core/src/documents/ontology.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; /// Element category — broad grouping of element types. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum ElementCategory { Text, @@ -16,6 +17,7 @@ pub enum ElementCategory { /// All element types across all categories. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "kebab-case")] pub enum ElementType { // Text diff --git a/crates/nvisy-core/src/errors/mod.rs b/crates/nvisy-core/src/errors/mod.rs index 35d9148..500dc26 100644 --- a/crates/nvisy-core/src/errors/mod.rs +++ b/crates/nvisy-core/src/errors/mod.rs @@ -1,81 +1,105 @@ -/// Unified error type for the Nvisy platform. -#[derive(Debug, thiserror::Error)] -pub enum NvisyError { - #[error("Validation: {message}")] - Validation { - message: String, - source_component: String, - }, - - #[error("Connection: {message}")] - Connection { - message: String, - source_component: String, - retryable: bool, - }, - - #[error("Timeout: {message}")] - Timeout { message: String }, +use std::fmt; - #[error("Cancelled: {message}")] - Cancellation { message: String }, +/// Classification of error kinds. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ErrorKind { + Validation, + Connection, + Timeout, + Cancellation, + Policy, + Runtime, + Python, + Other, +} - #[error("Policy: {message}")] - Policy { message: String }, +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Validation => write!(f, "Validation"), + Self::Connection => write!(f, "Connection"), + Self::Timeout => write!(f, "Timeout"), + Self::Cancellation => write!(f, "Cancelled"), + Self::Policy => write!(f, "Policy"), + Self::Runtime => write!(f, "Runtime"), + Self::Python => write!(f, "Python"), + Self::Other => write!(f, "Other"), + } + } +} - #[error("Runtime: {message}")] - Runtime { - message: String, - source_component: String, - retryable: bool, - }, +/// Unified error type for the Nvisy platform. +#[derive(Debug)] +pub struct Error { + pub kind: ErrorKind, + pub message: String, + pub source_component: Option<String>, + pub retryable: bool, + pub source: Option<Box<dyn std::error::Error + Send + Sync>>, +} - #[error("Python: {message}")] - Python { - message: String, - traceback: Option<String>, - }, +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}: {}", self.kind, self.message) + } +} - #[error(transparent)] - Other(#[from] anyhow::Error), +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.source.as_ref().map(|e| e.as_ref() as &(dyn std::error::Error + 'static)) + } } -impl NvisyError { - pub fn validation(message: impl Into<String>, source: impl Into<String>) -> Self { - Self::Validation { +impl Error { + pub fn new(kind: ErrorKind, message: impl Into<String>) -> Self { + Self { + kind, message: message.into(), - source_component: source.into(), + source_component: None, + retryable: false, + source: None, } } + pub fn with_source(mut self, source: impl std::error::Error + Send + Sync + 'static) -> Self { + self.source = Some(Box::new(source)); + self + } + + pub fn with_component(mut self, component: impl Into<String>) -> Self { + self.source_component = Some(component.into()); + self + } + + pub fn with_retryable(mut self, retryable: bool) -> Self { + self.retryable = retryable; + self + } + + pub fn validation(message: impl Into<String>, source: impl Into<String>) -> Self { + Self::new(ErrorKind::Validation, message).with_component(source) + } + pub fn connection( message: impl Into<String>, source: impl Into<String>, retryable: bool, ) -> Self { - Self::Connection { - message: message.into(), - source_component: source.into(), - retryable, - } + Self::new(ErrorKind::Connection, message) + .with_component(source) + .with_retryable(retryable) } pub fn timeout(message: impl Into<String>) -> Self { - Self::Timeout { - message: message.into(), - } + Self::new(ErrorKind::Timeout, message).with_retryable(true) } pub fn cancellation(message: impl Into<String>) -> Self { - Self::Cancellation { - message: message.into(), - } + Self::new(ErrorKind::Cancellation, message) } pub fn policy(message: impl Into<String>) -> Self { - Self::Policy { - message: message.into(), - } + Self::new(ErrorKind::Policy, message) } pub fn runtime( @@ -83,27 +107,31 @@ impl NvisyError { source: impl Into<String>, retryable: bool, ) -> Self { - Self::Runtime { - message: message.into(), - source_component: source.into(), - retryable, - } + Self::new(ErrorKind::Runtime, message) + .with_component(source) + .with_retryable(retryable) } - pub fn 
python(message: impl Into<String>, traceback: Option<String>) -> Self { - Self::Python { - message: message.into(), - traceback, - } + pub fn python(message: impl Into<String>) -> Self { + Self::new(ErrorKind::Python, message) } /// Whether this error is retryable. pub fn is_retryable(&self) -> bool { - match self { - Self::Connection { retryable, .. } => *retryable, - Self::Runtime { retryable, .. } => *retryable, - Self::Timeout { .. } => true, - _ => false, - } + self.retryable } } + +impl From<anyhow::Error> for Error { + fn from(err: anyhow::Error) -> Self { + // anyhow::Error doesn't implement std::error::Error, so we capture the + // full chain as text instead of storing it as a boxed source. + Self::new(ErrorKind::Other, format!("{err:#}")) + } +} + +/// Convenience type alias for results using the Nvisy error type. +pub type Result<T> = std::result::Result<T, Error>; + +// Keep backward compatibility: NvisyError is an alias for Error. +pub type NvisyError = Error; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index d8b1994..8561f7e 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -1,3 +1,7 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + pub mod data; pub mod datatypes; pub mod documents; diff --git a/crates/nvisy-core/src/plugin.rs b/crates/nvisy-core/src/plugin/mod.rs similarity index 100% rename from crates/nvisy-core/src/plugin.rs rename to crates/nvisy-core/src/plugin/mod.rs diff --git a/crates/nvisy-core/src/registry.rs b/crates/nvisy-core/src/registry/mod.rs similarity index 100% rename from crates/nvisy-core/src/registry.rs rename to crates/nvisy-core/src/registry/mod.rs diff --git a/crates/nvisy-core/src/types.rs b/crates/nvisy-core/src/types/mod.rs similarity index 81% rename from crates/nvisy-core/src/types.rs rename to crates/nvisy-core/src/types/mod.rs index e5b6f15..7c2100c 100644 --- a/crates/nvisy-core/src/types.rs +++ b/crates/nvisy-core/src/types/mod.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; /// Category of sensitive data. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum EntityCategory { Pii, @@ -13,6 +14,7 @@ pub enum EntityCategory { /// How the entity was detected. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum DetectionMethod { Regex, @@ -24,6 +26,7 @@ pub enum DetectionMethod { /// Method used to redact sensitive data. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum RedactionMethod { Mask, @@ -38,6 +41,7 @@ pub enum RedactionMethod { /// Type of auditable action. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum AuditAction { Detection, diff --git a/crates/nvisy-detect/README.md b/crates/nvisy-detect/README.md new file mode 100644 index 0000000..4b8bf54 --- /dev/null +++ b/crates/nvisy-detect/README.md @@ -0,0 +1,3 @@ +# nvisy-detect + +Detection and redaction plugin for the Nvisy runtime. 
Provides regex-based entity detection, checksum validation, policy evaluation, classification, audit emission, and file loaders for plaintext, CSV, and JSON formats. diff --git a/crates/nvisy-detect/assets/patterns.json b/crates/nvisy-detect/assets/patterns.json new file mode 100644 index 0000000..a5ae239 --- /dev/null +++ b/crates/nvisy-detect/assets/patterns.json @@ -0,0 +1,74 @@ +[ + { + "name": "ssn", + "category": "pii", + "entity_type": "ssn", + "pattern": "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b", + "confidence": 0.9, + "validator": "ssn" + }, + { + "name": "email", + "category": "pii", + "entity_type": "email", + "pattern": "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b", + "confidence": 0.95 + }, + { + "name": "phone", + "category": "pii", + "entity_type": "phone", + "pattern": "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b", + "confidence": 0.8 + }, + { + "name": "credit-card", + "category": "financial", + "entity_type": "credit_card", + "pattern": "\\b(?:\\d[ \\-]*?){13,19}\\b", + "confidence": 0.85, + "validator": "luhn" + }, + { + "name": "aws-key", + "category": "credentials", + "entity_type": "aws_access_key", + "pattern": "\\bAKIA[0-9A-Z]{16}\\b", + "confidence": 0.95 + }, + { + "name": "github-token", + "category": "credentials", + "entity_type": "github_token", + "pattern": "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b", + "confidence": 0.95 + }, + { + "name": "stripe-key", + "category": "credentials", + "entity_type": "stripe_key", + "pattern": "\\bsk_(live|test)_[a-zA-Z0-9]{24,}\\b", + "confidence": 0.95 + }, + { + "name": "generic-api-key", + "category": "credentials", + "entity_type": "api_key", + "pattern": "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?", + "confidence": 0.7 + }, + { + "name": "ipv4", + "category": "pii", + "entity_type": "ip_address", + "pattern": "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b", + "confidence": 0.75 + }, + { + "name": "ipv6", + "category": "pii", + "entity_type": "ip_address", + "pattern": "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b", + "confidence": 0.75 + } +] diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs index 7855034..ae4641a 100644 --- a/crates/nvisy-detect/src/actions/detect_checksum.rs +++ b/crates/nvisy-detect/src/actions/detect_checksum.rs @@ -8,7 +8,7 @@ use nvisy_core::errors::NvisyError; use nvisy_core::traits::action::Action; use nvisy_core::types::DetectionMethod; -use crate::patterns::credit_card::luhn_check; +use crate::patterns::validators::luhn_check; pub struct DetectChecksumAction; diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs index 602b1e5..1b91bd9 100644 --- a/crates/nvisy-detect/src/actions/detect_regex.rs +++ b/crates/nvisy-detect/src/actions/detect_regex.rs @@ -53,7 +53,7 @@ impl Action for DetectRegexAction { // Compile regexes let compiled: Vec<(&PatternDefinition, Regex)> = active_patterns .iter() - .filter_map(|p| Regex::new(p.pattern_str).ok().map(|r| (*p, r))) + .filter_map(|p| Regex::new(&p.pattern_str).ok().map(|r| (*p, r))) .collect(); let mut count = 0u64; @@ -76,7 +76,7 @@ impl Action for DetectRegexAction { let mut entity = Entity::new( pattern.category, - pattern.entity_type, + &pattern.entity_type, value, DetectionMethod::Regex, 
pattern.confidence, diff --git a/crates/nvisy-detect/src/lib.rs b/crates/nvisy-detect/src/lib.rs index b6a5e70..ebe1863 100644 --- a/crates/nvisy-detect/src/lib.rs +++ b/crates/nvisy-detect/src/lib.rs @@ -1,3 +1,7 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + pub mod actions; pub mod loaders; pub mod patterns; diff --git a/crates/nvisy-detect/src/patterns/api_key.rs b/crates/nvisy-detect/src/patterns/api_key.rs deleted file mode 100644 index 1b7bdb8..0000000 --- a/crates/nvisy-detect/src/patterns/api_key.rs +++ /dev/null @@ -1,39 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -pub static AWS_KEY_PATTERN: PatternDefinition = PatternDefinition { - name: "aws-key", - category: EntityCategory::Credentials, - entity_type: "aws_access_key", - pattern_str: r"\bAKIA[0-9A-Z]{16}\b", - confidence: 0.95, - validate: None, -}; - -pub static GITHUB_TOKEN_PATTERN: PatternDefinition = PatternDefinition { - name: "github-token", - category: EntityCategory::Credentials, - entity_type: "github_token", - pattern_str: r"\bgh[pousr]_[a-zA-Z0-9]{36}\b", - confidence: 0.95, - validate: None, -}; - -pub static STRIPE_KEY_PATTERN: PatternDefinition = PatternDefinition { - name: "stripe-key", - category: EntityCategory::Credentials, - entity_type: "stripe_key", - pattern_str: r"\bsk_(live|test)_[a-zA-Z0-9]{24,}\b", - confidence: 0.95, - validate: None, -}; - -pub static GENERIC_KEY_PATTERN: PatternDefinition = PatternDefinition { - name: "generic-api-key", - category: EntityCategory::Credentials, - entity_type: "api_key", - pattern_str: r#"(?i)(?:api[_\-]?key|api[_\-]?secret|access[_\-]?token|secret[_\-]?key|bearer)\s*[:=]\s*["']?([a-zA-Z0-9_\-]{20,})["']?"#, - confidence: 0.7, - validate: None, -}; diff --git a/crates/nvisy-detect/src/patterns/credit_card.rs b/crates/nvisy-detect/src/patterns/credit_card.rs deleted file mode 100644 index bb0b9c5..0000000 --- a/crates/nvisy-detect/src/patterns/credit_card.rs +++ /dev/null @@ -1,38 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -/// Luhn check algorithm for credit card validation. 
-pub fn luhn_check(num: &str) -> bool { - let digits: String = num.chars().filter(|c| c.is_ascii_digit()).collect(); - if digits.is_empty() { - return false; - } - let mut sum = 0u32; - let mut alternate = false; - for ch in digits.chars().rev() { - let mut n = ch.to_digit(10).unwrap_or(0); - if alternate { - n *= 2; - if n > 9 { - n -= 9; - } - } - sum += n; - alternate = !alternate; - } - sum % 10 == 0 -} - -fn validate_credit_card(value: &str) -> bool { - luhn_check(value) -} - -pub static CREDIT_CARD_PATTERN: PatternDefinition = PatternDefinition { - name: "credit-card", - category: EntityCategory::Financial, - entity_type: "credit_card", - pattern_str: r"\b(?:\d[ \-]*?){13,19}\b", - confidence: 0.85, - validate: Some(validate_credit_card), -}; diff --git a/crates/nvisy-detect/src/patterns/email.rs b/crates/nvisy-detect/src/patterns/email.rs deleted file mode 100644 index 66b6b33..0000000 --- a/crates/nvisy-detect/src/patterns/email.rs +++ /dev/null @@ -1,12 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -pub static EMAIL_PATTERN: PatternDefinition = PatternDefinition { - name: "email", - category: EntityCategory::Pii, - entity_type: "email", - pattern_str: r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b", - confidence: 0.95, - validate: None, -}; diff --git a/crates/nvisy-detect/src/patterns/ip_address.rs b/crates/nvisy-detect/src/patterns/ip_address.rs deleted file mode 100644 index 8be5ab0..0000000 --- a/crates/nvisy-detect/src/patterns/ip_address.rs +++ /dev/null @@ -1,21 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -pub static IPV4_PATTERN: PatternDefinition = PatternDefinition { - name: "ipv4", - category: EntityCategory::Pii, - entity_type: "ip_address", - pattern_str: r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b", - confidence: 0.75, - validate: None, -}; - -pub static IPV6_PATTERN: PatternDefinition = PatternDefinition { - name: "ipv6", - category: EntityCategory::Pii, - entity_type: "ip_address", - pattern_str: r"\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\b", - confidence: 0.75, - validate: None, -}; diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-detect/src/patterns/mod.rs index 8b2fd50..d3bd8e0 100644 --- a/crates/nvisy-detect/src/patterns/mod.rs +++ b/crates/nvisy-detect/src/patterns/mod.rs @@ -1,43 +1,77 @@ -pub mod api_key; -pub mod credit_card; -pub mod email; -pub mod ip_address; -pub mod phone; -pub mod ssn; +pub mod validators; -use nvisy_core::types::EntityCategory; use std::collections::HashMap; use std::sync::LazyLock; +use nvisy_core::types::EntityCategory; + +/// JSON representation of a pattern loaded from disk. +#[derive(Debug, Clone, serde::Deserialize)] +struct PatternJson { + name: String, + category: String, + entity_type: String, + pattern: String, + confidence: f64, + #[serde(default)] + validator: Option<String>, +} + /// Definition of a regex-based detection pattern. 
pub struct PatternDefinition { - pub name: &'static str, + pub name: String, pub category: EntityCategory, - pub entity_type: &'static str, - pub pattern_str: &'static str, + pub entity_type: String, + pub pattern_str: String, pub confidence: f64, pub validate: Option<fn(&str) -> bool>, } -static REGISTRY: LazyLock<HashMap<&'static str, &'static PatternDefinition>> = LazyLock::new(|| { - let patterns: &[&'static PatternDefinition] = &[ - &ssn::SSN_PATTERN, - &email::EMAIL_PATTERN, - &phone::PHONE_PATTERN, - &credit_card::CREDIT_CARD_PATTERN, - &api_key::AWS_KEY_PATTERN, - &api_key::GITHUB_TOKEN_PATTERN, - &api_key::STRIPE_KEY_PATTERN, - &api_key::GENERIC_KEY_PATTERN, - &ip_address::IPV4_PATTERN, - &ip_address::IPV6_PATTERN, - ]; - let mut map = HashMap::new(); - for p in patterns { - map.insert(p.name, *p); +fn parse_category(s: &str) -> EntityCategory { + match s { + "pii" => EntityCategory::Pii, + "phi" => EntityCategory::Phi, + "financial" => EntityCategory::Financial, + "credentials" => EntityCategory::Credentials, + _ => EntityCategory::Custom, + } +} + +fn resolve_validator(name: &str) -> Option<fn(&str) -> bool> { + match name { + "ssn" => Some(validators::validate_ssn), + "luhn" => Some(validators::luhn_check), + _ => None, } - map -}); +} + +fn load_patterns() -> Vec<PatternDefinition> { + let json_bytes = include_bytes!("../../assets/patterns.json"); + let raw: Vec<PatternJson> = + serde_json::from_slice(json_bytes).expect("Failed to parse patterns.json"); + + raw.into_iter() + .map(|p| PatternDefinition { + category: parse_category(&p.category), + validate: p.validator.as_deref().and_then(resolve_validator), + name: p.name, + entity_type: p.entity_type, + pattern_str: p.pattern, + confidence: p.confidence, + }) + .collect() +} + +static PATTERNS: LazyLock<Vec<PatternDefinition>> = LazyLock::new(load_patterns); + +static REGISTRY: LazyLock<HashMap<&'static str, &'static PatternDefinition>> = + LazyLock::new(|| { + let mut map = HashMap::new(); + for p in PATTERNS.iter() { + map.insert(p.name.as_str(), p); + } + map + }); /// Look up a built-in pattern by name. 
pub fn get_pattern(name: &str) -> Option<&'static PatternDefinition> { diff --git a/crates/nvisy-detect/src/patterns/phone.rs b/crates/nvisy-detect/src/patterns/phone.rs deleted file mode 100644 index fcc2858..0000000 --- a/crates/nvisy-detect/src/patterns/phone.rs +++ /dev/null @@ -1,12 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -pub static PHONE_PATTERN: PatternDefinition = PatternDefinition { - name: "phone", - category: EntityCategory::Pii, - entity_type: "phone", - pattern_str: r"(?:\+\d{1,3}[\s.\-]?)?\(?\d{2,4}\)?[\s.\-]?\d{3,4}[\s.\-]?\d{4}\b", - confidence: 0.8, - validate: None, -}; diff --git a/crates/nvisy-detect/src/patterns/ssn.rs b/crates/nvisy-detect/src/patterns/ssn.rs deleted file mode 100644 index a5bc876..0000000 --- a/crates/nvisy-detect/src/patterns/ssn.rs +++ /dev/null @@ -1,32 +0,0 @@ -use nvisy_core::types::EntityCategory; - -use super::PatternDefinition; - -fn validate_ssn(value: &str) -> bool { - let parts: Vec<&str> = value.split('-').collect(); - if parts.len() != 3 { - return false; - } - let area: u32 = match parts[0].parse() { - Ok(v) => v, - Err(_) => return false, - }; - let group: u32 = match parts[1].parse() { - Ok(v) => v, - Err(_) => return false, - }; - let serial: u32 = match parts[2].parse() { - Ok(v) => v, - Err(_) => return false, - }; - area > 0 && area < 900 && area != 666 && group > 0 && serial > 0 -} - -pub static SSN_PATTERN: PatternDefinition = PatternDefinition { - name: "ssn", - category: EntityCategory::Pii, - entity_type: "ssn", - pattern_str: r"\b(\d{3})-(\d{2})-(\d{4})\b", - confidence: 0.9, - validate: Some(validate_ssn), -}; diff --git a/crates/nvisy-detect/src/patterns/validators.rs b/crates/nvisy-detect/src/patterns/validators.rs new file mode 100644 index 0000000..8903f64 --- /dev/null +++ b/crates/nvisy-detect/src/patterns/validators.rs @@ -0,0 +1,42 @@ +/// Validate a US Social Security Number. +pub fn validate_ssn(value: &str) -> bool { + let parts: Vec<&str> = value.split('-').collect(); + if parts.len() != 3 { + return false; + } + let area: u32 = match parts[0].parse() { + Ok(v) => v, + Err(_) => return false, + }; + let group: u32 = match parts[1].parse() { + Ok(v) => v, + Err(_) => return false, + }; + let serial: u32 = match parts[2].parse() { + Ok(v) => v, + Err(_) => return false, + }; + area > 0 && area < 900 && area != 666 && group > 0 && serial > 0 +} + +/// Luhn check algorithm for credit card validation. 
+pub fn luhn_check(num: &str) -> bool { + let digits: String = num.chars().filter(|c| c.is_ascii_digit()).collect(); + if digits.is_empty() { + return false; + } + let mut sum = 0u32; + let mut alternate = false; + for ch in digits.chars().rev() { + let mut n = ch.to_digit(10).unwrap_or(0); + if alternate { + n *= 2; + if n > 9 { + n -= 9; + } + } + sum += n; + alternate = !alternate; + } + sum % 10 == 0 +} diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index ced9a33..dad052c 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -21,10 +21,16 @@ documentation = { workspace = true } all-features = true rustdoc-args = ["--cfg", "docsrs"] +[features] +schema = ["dep:schemars", "nvisy-core/schema"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +# JSON Schema generation +schemars = { workspace = true, optional = true } + # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/README.md b/crates/nvisy-engine/README.md new file mode 100644 index 0000000..0ae582f --- /dev/null +++ b/crates/nvisy-engine/README.md @@ -0,0 +1,3 @@ +# nvisy-engine + +DAG compiler and executor for the Nvisy runtime. Compiles graph definitions into executable pipelines, manages run lifecycles, and coordinates policy resolution and connection routing between nodes. diff --git a/crates/nvisy-engine/src/connections.rs b/crates/nvisy-engine/src/connections/mod.rs similarity index 87% rename from crates/nvisy-engine/src/connections.rs rename to crates/nvisy-engine/src/connections/mod.rs index 43f986c..98384bf 100644 --- a/crates/nvisy-engine/src/connections.rs +++ b/crates/nvisy-engine/src/connections/mod.rs @@ -3,6 +3,7 @@ use serde::{Deserialize, Serialize}; /// A validated connection to an external service. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Connection { #[serde(rename = "type")] pub provider_type: String, diff --git a/crates/nvisy-engine/src/executor/runner.rs b/crates/nvisy-engine/src/executor/runner.rs index 4c344ad..838b886 100644 --- a/crates/nvisy-engine/src/executor/runner.rs +++ b/crates/nvisy-engine/src/executor/runner.rs @@ -12,6 +12,7 @@ use crate::schema::GraphNode; /// Result of a single node execution. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct NodeResult { pub node_id: String, pub items_processed: u64, @@ -20,6 +21,7 @@ pub struct NodeResult { /// Result of an entire graph execution. 
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunResult { pub run_id: Uuid, pub node_results: Vec<NodeResult>, diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index f051503..465f82e 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -1,3 +1,7 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + pub mod compiler; pub mod connections; pub mod executor; diff --git a/crates/nvisy-engine/src/policies.rs b/crates/nvisy-engine/src/policies/mod.rs similarity index 100% rename from crates/nvisy-engine/src/policies.rs rename to crates/nvisy-engine/src/policies/mod.rs diff --git a/crates/nvisy-engine/src/runs.rs b/crates/nvisy-engine/src/runs/mod.rs similarity index 95% rename from crates/nvisy-engine/src/runs.rs rename to crates/nvisy-engine/src/runs/mod.rs index 22975a7..d45d7d0 100644 --- a/crates/nvisy-engine/src/runs.rs +++ b/crates/nvisy-engine/src/runs/mod.rs @@ -8,6 +8,7 @@ use crate::executor::runner::RunResult; /// Status of a pipeline run. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum RunStatus { Pending, @@ -20,6 +21,7 @@ pub enum RunStatus { /// Progress of a single node within a run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct NodeProgress { pub node_id: String, pub status: RunStatus, @@ -30,6 +32,7 @@ pub struct NodeProgress { /// Full state of a run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunState { pub id: Uuid, pub status: RunStatus, @@ -43,6 +46,7 @@ pub struct RunState { /// Summary of a run for listing. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunSummary { pub id: Uuid, pub status: RunStatus, diff --git a/crates/nvisy-engine/src/schema.rs b/crates/nvisy-engine/src/schema/mod.rs similarity index 91% rename from crates/nvisy-engine/src/schema.rs rename to crates/nvisy-engine/src/schema/mod.rs index 39b1479..5b14cc7 100644 --- a/crates/nvisy-engine/src/schema.rs +++ b/crates/nvisy-engine/src/schema/mod.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; /// Retry policy for a node. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RetryPolicy { #[serde(default = "default_max_retries")] pub max_retries: u32, @@ -25,6 +26,7 @@ impl Default for RetryPolicy { } #[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum BackoffStrategy { #[default] @@ -35,6 +37,7 @@ pub enum BackoffStrategy { /// A node in the graph definition. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(tag = "type", rename_all = "snake_case")] pub enum GraphNode { Source { @@ -107,6 +110,7 @@ impl GraphNode { /// An edge connecting two nodes. 
#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct GraphEdge { pub from: String, pub to: String, @@ -114,6 +118,7 @@ pub struct GraphEdge { /// A complete graph definition. #[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Graph { pub nodes: Vec<GraphNode>, pub edges: Vec<GraphEdge>, diff --git a/crates/nvisy-object/README.md b/crates/nvisy-object/README.md new file mode 100644 index 0000000..cb82b98 --- /dev/null +++ b/crates/nvisy-object/README.md @@ -0,0 +1,3 @@ +# nvisy-object + +Object store plugin for the Nvisy runtime. Provides cloud storage providers (S3) and streaming read/write interfaces for ingesting and outputting data through the processing pipeline. diff --git a/crates/nvisy-object/src/client.rs b/crates/nvisy-object/src/client/mod.rs similarity index 100% rename from crates/nvisy-object/src/client.rs rename to crates/nvisy-object/src/client/mod.rs diff --git a/crates/nvisy-object/src/lib.rs b/crates/nvisy-object/src/lib.rs index ec6a1ba..4d091cd 100644 --- a/crates/nvisy-object/src/lib.rs +++ b/crates/nvisy-object/src/lib.rs @@ -1,3 +1,7 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + pub mod client; pub mod providers; pub mod streams; diff --git a/crates/nvisy-python/README.md b/crates/nvisy-python/README.md new file mode 100644 index 0000000..fece235 --- /dev/null +++ b/crates/nvisy-python/README.md @@ -0,0 +1,3 @@ +# nvisy-python + +PyO3 bridge plugin for the Nvisy runtime. Embeds a Python interpreter to run AI-powered named entity recognition (NER) models for text and image detection, exposing them as native Nvisy actions. diff --git a/crates/nvisy-python/src/actions.rs b/crates/nvisy-python/src/actions/mod.rs similarity index 100% rename from crates/nvisy-python/src/actions.rs rename to crates/nvisy-python/src/actions/mod.rs diff --git a/crates/nvisy-python/src/bridge.rs b/crates/nvisy-python/src/bridge/mod.rs similarity index 100% rename from crates/nvisy-python/src/bridge.rs rename to crates/nvisy-python/src/bridge/mod.rs diff --git a/crates/nvisy-python/src/error.rs b/crates/nvisy-python/src/error/mod.rs similarity index 66% rename from crates/nvisy-python/src/error.rs rename to crates/nvisy-python/src/error/mod.rs index a5e8fa7..f173ec1 100644 --- a/crates/nvisy-python/src/error.rs +++ b/crates/nvisy-python/src/error/mod.rs @@ -8,6 +8,10 @@ pub fn from_pyerr(err: PyErr) -> NvisyError { let traceback = err .traceback(py) .map(|tb| tb.format().unwrap_or_default()); - NvisyError::python(err.to_string(), traceback) + let msg = match traceback { + Some(tb) => format!("{}\n{}", err, tb), + None => err.to_string(), + }; + NvisyError::python(msg) }) } diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index b4e7fcf..9c4c41c 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -1,3 +1,7 @@ +#![deny(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + pub mod actions; pub mod bridge; pub mod error; diff --git a/crates/nvisy-python/src/ner.rs b/crates/nvisy-python/src/ner/mod.rs similarity index 97% rename from crates/nvisy-python/src/ner.rs rename to crates/nvisy-python/src/ner/mod.rs index b5e4a94..eeb26b4 100644 --- a/crates/nvisy-python/src/ner.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -49,7 +49,7 @@ pub async fn detect_ner( }) }) .await - .map_err(|e| 
NvisyError::python(format!("Task join error: {}", e), None))? + .map_err(|e| NvisyError::python(format!("Task join error: {}", e)))? } /// Call Python detect_ner_image function via GIL + spawn_blocking. @@ -85,26 +85,26 @@ pub async fn detect_ner_image( }) }) .await - .map_err(|e| NvisyError::python(format!("Task join error: {}", e), None))? + .map_err(|e| NvisyError::python(format!("Task join error: {}", e)))? } /// Parse Python list[dict] response into Vec<Entity>. fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Vec<Entity>, NvisyError> { let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { - NvisyError::python(format!("Expected list from Python: {}", e), None) + NvisyError::python(format!("Expected list from Python: {}", e)) })?; let mut entities = Vec::new(); for item in list.iter() { let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { - NvisyError::python(format!("Expected dict in list: {}", e), None) + NvisyError::python(format!("Expected dict in list: {}", e)) })?; let category_str: String = dict .get_item("category") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'category'", None))? + .ok_or_else(|| NvisyError::python("Missing 'category'"))? .extract() .map_err(from_pyerr)?; @@ -119,21 +119,21 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve let entity_type: String = dict .get_item("entity_type") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'entity_type'", None))? + .ok_or_else(|| NvisyError::python("Missing 'entity_type'"))? .extract() .map_err(from_pyerr)?; let value: String = dict .get_item("value") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'value'", None))? + .ok_or_else(|| NvisyError::python("Missing 'value'"))? .extract() .map_err(from_pyerr)?; let confidence: f64 = dict .get_item("confidence") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'confidence'", None))? + .ok_or_else(|| NvisyError::python("Missing 'confidence'"))? .extract() .map_err(from_pyerr)?; diff --git a/crates/nvisy-python/src/provider.rs b/crates/nvisy-python/src/provider/mod.rs similarity index 100% rename from crates/nvisy-python/src/provider.rs rename to crates/nvisy-python/src/provider/mod.rs diff --git a/crates/nvisy-server/Cargo.toml b/crates/nvisy-server/Cargo.toml index c651ecd..7bbfd39 100644 --- a/crates/nvisy-server/Cargo.toml +++ b/crates/nvisy-server/Cargo.toml @@ -27,12 +27,15 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-core = { workspace = true, features = [] } -nvisy-engine = { workspace = true, features = [] } +nvisy-core = { workspace = true, features = ["schema"] } +nvisy-engine = { workspace = true, features = ["schema"] } nvisy-detect = { workspace = true, features = [] } nvisy-object = { workspace = true, features = [] } nvisy-python = { workspace = true, features = [] } +# JSON Schema generation +schemars = { workspace = true } + # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-server/README.md b/crates/nvisy-server/README.md new file mode 100644 index 0000000..a774adb --- /dev/null +++ b/crates/nvisy-server/README.md @@ -0,0 +1,3 @@ +# nvisy-server + +Axum-based HTTP server for the Nvisy runtime. Exposes REST endpoints for graph execution, data redaction, policy management, and audit log queries, with dependency-injected service layer. 
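
The server's data redaction endpoints described in this README presumably consume the request-scoped `RedactionContext` introduced earlier in this patch. As a minimal sketch of the new builder API, using only the types and methods shown in `redaction_context.rs` (the function name and the concrete values here are illustrative, not part of the patch):

```rust
use nvisy_core::datatypes::redaction_context::{EntityRedactionRule, RedactionContext};
use nvisy_core::types::{EntityCategory, RedactionMethod};

fn build_context() -> RedactionContext {
    // Scan only PII and credential categories, require at least 0.8 confidence,
    // and mask SSNs explicitly; everything else falls back to the default method.
    RedactionContext::new()
        .with_categories(vec![EntityCategory::Pii, EntityCategory::Credentials])
        .with_min_confidence(0.8)
        .with_rule(EntityRedactionRule {
            entity_type: "ssn".to_string(),
            method: RedactionMethod::Mask,
            replacement: None,
        })
}

fn main() {
    let ctx = build_context();
    // Per-type rules take precedence over the default method.
    assert_eq!(ctx.method_for("ssn"), RedactionMethod::Mask);
    // Matches below min_confidence are filtered out before redaction.
    assert!(!ctx.should_process(EntityCategory::Pii, "email", 0.5));
}
```
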
diff --git a/crates/nvisy-server/src/app.rs b/crates/nvisy-server/src/app/mod.rs similarity index 64% rename from crates/nvisy-server/src/app.rs rename to crates/nvisy-server/src/app/mod.rs index 9783f13..8b93b67 100644 --- a/crates/nvisy-server/src/app.rs +++ b/crates/nvisy-server/src/app/mod.rs @@ -2,13 +2,12 @@ use axum::Router; use std::sync::Arc; use tower_http::cors::{Any, CorsLayer}; use tower_http::trace::TraceLayer; +use utoipa::OpenApi; +use utoipa_swagger_ui::SwaggerUi; -use crate::config::ServerConfig; -use crate::routes; +use crate::handler; use crate::service::engine_factory; -use crate::service::audit_store::AuditStore; -use crate::service::policy_store::PolicyStore; -use crate::state::AppState; +use crate::service::{AuditStore, AppState, PolicyStore, ServerConfig}; use nvisy_engine::runs::RunManager; /// Build a fully configured Axum application. @@ -28,11 +27,12 @@ pub async fn build_app(_config: &ServerConfig) -> anyhow::Result<Router> { .allow_headers(Any); let app = Router::new() - .merge(routes::health::router()) - .merge(routes::graphs::router()) - .merge(routes::redact::router()) - .merge(routes::policies::router()) - .merge(routes::audit::router()) + .merge(handler::health::router()) + .merge(handler::graphs::router()) + .merge(handler::redact::router()) + .merge(handler::policies::router()) + .merge(handler::audit::router()) + .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", handler::ApiDoc::openapi())) .layer(TraceLayer::new_for_http()) .layer(cors) .with_state(state); diff --git a/crates/nvisy-server/src/routes/audit.rs b/crates/nvisy-server/src/handler/audit.rs similarity index 54% rename from crates/nvisy-server/src/routes/audit.rs rename to crates/nvisy-server/src/handler/audit.rs index 844c34c..eb85032 100644 --- a/crates/nvisy-server/src/routes/audit.rs +++ b/crates/nvisy-server/src/handler/audit.rs @@ -4,16 +4,12 @@ use axum::{ routing::get, Json, }; +use std::sync::Arc; use uuid::Uuid; -use crate::state::AppState; +use crate::service::AuditStore; +use crate::service::AppState; -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/audit", get(list_audit)) - .route("/api/v1/audit/{run_id}", get(get_audit_by_run)) -} - -#[derive(serde::Deserialize)] +#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::IntoParams)] struct AuditQuery { #[serde(rename = "runId")] run_id: Option<String>, @@ -24,11 +20,20 @@ struct AuditQuery { offset: Option<usize>, } +/// List audit records with optional filters. +#[utoipa::path( + get, + path = "/api/v1/audit", + params(AuditQuery), + responses( + (status = 200, description = "List of audit records") + ) +)] async fn list_audit( - State(state): State<AppState>, + State(audit_store): State<Arc<AuditStore>>, Query(query): Query<AuditQuery>, ) -> Json<serde_json::Value> { - let records = state.audit_store.query( + let records = audit_store.query( query.run_id.as_deref(), query.action.as_deref(), query.source_id.as_deref(), @@ -38,10 +43,27 @@ async fn list_audit( Json(serde_json::to_value(&records).unwrap_or_default()) } +/// Get audit records for a specific run. 
+#[utoipa::path( + get, + path = "/api/v1/audit/{run_id}", + params( + ("run_id" = Uuid, Path, description = "Run ID") + ), + responses( + (status = 200, description = "Audit records for the run") + ) +)] async fn get_audit_by_run( - State(state): State<AppState>, + State(audit_store): State<Arc<AuditStore>>, Path(run_id): Path<Uuid>, ) -> Json<serde_json::Value> { - let records = state.audit_store.get_by_run_id(run_id); + let records = audit_store.get_by_run_id(run_id); Json(serde_json::to_value(&records).unwrap_or_default()) } + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/audit", get(list_audit)) + .route("/api/v1/audit/{run_id}", get(get_audit_by_run)) +} diff --git a/crates/nvisy-server/src/handler/graphs.rs b/crates/nvisy-server/src/handler/graphs.rs new file mode 100644 index 0000000..482f35f --- /dev/null +++ b/crates/nvisy-server/src/handler/graphs.rs @@ -0,0 +1,121 @@ +use axum::{ + Router, + extract::{Path, State}, + routing::{delete, get, post}, + Json, +}; +use std::sync::Arc; +use uuid::Uuid; +use nvisy_engine::runs::RunManager; +use crate::service::AppState; + +/// Submit a graph for execution. +#[utoipa::path( + post, + path = "/api/v1/graphs/execute", + request_body = serde_json::Value, + responses( + (status = 202, description = "Graph execution accepted") + ) +)] +async fn execute_graph( + State(run_manager): State<Arc<RunManager>>, + Json(_body): Json<serde_json::Value>, +) -> (axum::http::StatusCode, Json<serde_json::Value>) { + let (run_id, _cancel_token) = run_manager.create_run().await; + run_manager.set_running(run_id).await; + + // TODO: spawn actual graph execution + ( + axum::http::StatusCode::ACCEPTED, + Json(serde_json::json!({ + "runId": run_id.to_string(), + "status": "accepted" + })), + ) +} + +/// Validate a graph definition without executing. +#[utoipa::path( + post, + path = "/api/v1/graphs/validate", + request_body = serde_json::Value, + responses( + (status = 200, description = "Validation result") + ) +)] +async fn validate_graph( + Json(_body): Json<serde_json::Value>, +) -> Json<serde_json::Value> { + // TODO: validate graph against registry + Json(serde_json::json!({ "valid": true, "errors": [] })) +} + +/// List all runs. +#[utoipa::path( + get, + path = "/api/v1/graphs", + responses( + (status = 200, description = "List of runs") + ) +)] +async fn list_runs( + State(run_manager): State<Arc<RunManager>>, +) -> Json<serde_json::Value> { + let runs = run_manager.list(None).await; + Json(serde_json::to_value(&runs).unwrap_or_default()) +} + +/// Get status of a single run. +#[utoipa::path( + get, + path = "/api/v1/graphs/{run_id}", + params( + ("run_id" = Uuid, Path, description = "Run ID") + ), + responses( + (status = 200, description = "Run details"), + (status = 404, description = "Run not found") + ) +)] +async fn get_run( + State(run_manager): State<Arc<RunManager>>, + Path(run_id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + match run_manager.get(run_id).await { + Some(run) => Ok(Json(serde_json::to_value(&run).unwrap_or_default())), + None => Err(axum::http::StatusCode::NOT_FOUND), + } +} + +/// Cancel a running execution. 
+#[utoipa::path( + delete, + path = "/api/v1/graphs/{run_id}", + params( + ("run_id" = Uuid, Path, description = "Run ID") + ), + responses( + (status = 200, description = "Run cancelled"), + (status = 404, description = "Run not found") + ) +)] +async fn cancel_run( + State(run_manager): State<Arc<RunManager>>, + Path(run_id): Path<Uuid>, +) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { + if run_manager.cancel(run_id).await { + Ok(Json(serde_json::json!({ "cancelled": true }))) + } else { + Err(axum::http::StatusCode::NOT_FOUND) + } +} + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/graphs/execute", post(execute_graph)) + .route("/api/v1/graphs/validate", post(validate_graph)) + .route("/api/v1/graphs", get(list_runs)) + .route("/api/v1/graphs/{run_id}", get(get_run)) + .route("/api/v1/graphs/{run_id}", delete(cancel_run)) +} diff --git a/crates/nvisy-server/src/routes/health.rs b/crates/nvisy-server/src/handler/health.rs similarity index 52% rename from crates/nvisy-server/src/routes/health.rs rename to crates/nvisy-server/src/handler/health.rs index 709e091..2dca70c 100644 --- a/crates/nvisy-server/src/routes/health.rs +++ b/crates/nvisy-server/src/handler/health.rs @@ -1,16 +1,32 @@ use axum::{Router, routing::get, Json}; -use crate::state::AppState; - -pub fn router() -> Router<AppState> { - Router::new() - .route("/health", get(health)) - .route("/ready", get(ready)) -} +use crate::service::AppState; +/// Health check response. +#[utoipa::path( + get, + path = "/health", + responses( + (status = 200, description = "Service is healthy") + ) +)] async fn health() -> Json<serde_json::Value> { Json(serde_json::json!({ "status": "ok" })) } +/// Readiness check response. +#[utoipa::path( + get, + path = "/ready", + responses( + (status = 200, description = "Service is ready") + ) +)] async fn ready() -> Json<serde_json::Value> { Json(serde_json::json!({ "status": "ready" })) } + +pub fn router() -> Router<AppState> { + Router::new() + .route("/health", get(health)) + .route("/ready", get(ready)) +} diff --git a/crates/nvisy-server/src/handler/mod.rs b/crates/nvisy-server/src/handler/mod.rs new file mode 100644 index 0000000..584c2a5 --- /dev/null +++ b/crates/nvisy-server/src/handler/mod.rs @@ -0,0 +1,34 @@ +pub mod audit; +pub mod graphs; +pub mod health; +pub mod policies; +pub mod redact; + +use utoipa::OpenApi; + +#[derive(OpenApi)] +#[openapi( + paths( + health::health, + health::ready, + graphs::execute_graph, + graphs::validate_graph, + graphs::list_runs, + graphs::get_run, + graphs::cancel_run, + redact::redact, + policies::create_policy, + policies::list_policies, + policies::get_policy, + policies::update_policy, + policies::delete_policy, + audit::list_audit, + audit::get_audit_by_run, + ), + components(schemas( + redact::RedactRequest, + policies::CreatePolicyRequest, + policies::UpdatePolicyRequest, + )) +)] +pub struct ApiDoc; diff --git a/crates/nvisy-server/src/routes/policies.rs b/crates/nvisy-server/src/handler/policies.rs similarity index 54% rename from crates/nvisy-server/src/routes/policies.rs rename to crates/nvisy-server/src/handler/policies.rs index a49a65e..9f0155c 100644 --- a/crates/nvisy-server/src/routes/policies.rs +++ b/crates/nvisy-server/src/handler/policies.rs @@ -4,20 +4,13 @@ use axum::{ routing::{delete, get, post, put}, Json, }; +use std::sync::Arc; use uuid::Uuid; -use crate::state::AppState; +use crate::service::PolicyStore; +use crate::service::AppState; -pub fn router() -> Router<AppState> { - 
Router::new() - .route("/api/v1/policies", post(create_policy)) - .route("/api/v1/policies", get(list_policies)) - .route("/api/v1/policies/{id}", get(get_policy)) - .route("/api/v1/policies/{id}", put(update_policy)) - .route("/api/v1/policies/{id}", delete(delete_policy)) -} - -#[derive(serde::Deserialize)] -struct CreatePolicyRequest { +#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] +pub(crate) struct CreatePolicyRequest { name: String, #[serde(default)] rules: Vec<serde_json::Value>, @@ -30,11 +23,34 @@ struct CreatePolicyRequest { fn default_method() -> String { "mask".to_string() } fn default_threshold() -> f64 { 0.5 } +#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] +pub(crate) struct UpdatePolicyRequest { + #[serde(default)] + name: Option<String>, + #[serde(default)] + rules: Option<Vec<serde_json::Value>>, + #[serde(rename = "defaultMethod")] + #[serde(default)] + default_method: Option<String>, + #[serde(rename = "defaultConfidenceThreshold")] + #[serde(default)] + default_confidence_threshold: Option<f64>, +} + +/// Create a new policy. +#[utoipa::path( + post, + path = "/api/v1/policies", + request_body = CreatePolicyRequest, + responses( + (status = 201, description = "Policy created") + ) +)] async fn create_policy( - State(state): State<AppState>, + State(policy_store): State<Arc<PolicyStore>>, Json(body): Json<CreatePolicyRequest>, ) -> (axum::http::StatusCode, Json<serde_json::Value>) { - let policy = state.policy_store.create( + let policy = policy_store.create( body.name, body.rules, body.default_method, @@ -46,55 +62,95 @@ async fn create_policy( ) } +/// List all policies. +#[utoipa::path( + get, + path = "/api/v1/policies", + responses( + (status = 200, description = "List of policies") + ) +)] async fn list_policies( - State(state): State<AppState>, + State(policy_store): State<Arc<PolicyStore>>, ) -> Json<serde_json::Value> { - let policies = state.policy_store.list(); + let policies = policy_store.list(); Json(serde_json::to_value(&policies).unwrap_or_default()) } +/// Get a policy by ID. +#[utoipa::path( + get, + path = "/api/v1/policies/{id}", + params( + ("id" = Uuid, Path, description = "Policy ID") + ), + responses( + (status = 200, description = "Policy details"), + (status = 404, description = "Policy not found") + ) +)] async fn get_policy( - State(state): State<AppState>, + State(policy_store): State<Arc<PolicyStore>>, Path(id): Path<Uuid>, ) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match state.policy_store.get(id) { + match policy_store.get(id) { Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), None => Err(axum::http::StatusCode::NOT_FOUND), } } -#[derive(serde::Deserialize)] -struct UpdatePolicyRequest { - #[serde(default)] - name: Option<String>, - #[serde(default)] - rules: Option<Vec<serde_json::Value>>, - #[serde(rename = "defaultMethod")] - #[serde(default)] - default_method: Option<String>, - #[serde(rename = "defaultConfidenceThreshold")] - #[serde(default)] - default_confidence_threshold: Option<f64>, -} - +/// Update an existing policy. 
+#[utoipa::path( + put, + path = "/api/v1/policies/{id}", + params( + ("id" = Uuid, Path, description = "Policy ID") + ), + request_body = UpdatePolicyRequest, + responses( + (status = 200, description = "Policy updated"), + (status = 404, description = "Policy not found") + ) +)] async fn update_policy( - State(state): State<AppState>, + State(policy_store): State<Arc<PolicyStore>>, Path(id): Path<Uuid>, Json(body): Json<UpdatePolicyRequest>, ) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match state.policy_store.update(id, body.name, body.rules, body.default_method, body.default_confidence_threshold) { + match policy_store.update(id, body.name, body.rules, body.default_method, body.default_confidence_threshold) { Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), None => Err(axum::http::StatusCode::NOT_FOUND), } } +/// Delete a policy. +#[utoipa::path( + delete, + path = "/api/v1/policies/{id}", + params( + ("id" = Uuid, Path, description = "Policy ID") + ), + responses( + (status = 200, description = "Policy deleted"), + (status = 404, description = "Policy not found") + ) +)] async fn delete_policy( - State(state): State<AppState>, + State(policy_store): State<Arc<PolicyStore>>, Path(id): Path<Uuid>, ) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - if state.policy_store.delete(id) { + if policy_store.delete(id) { Ok(Json(serde_json::json!({ "deleted": true }))) } else { Err(axum::http::StatusCode::NOT_FOUND) } } + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/policies", post(create_policy)) + .route("/api/v1/policies", get(list_policies)) + .route("/api/v1/policies/{id}", get(get_policy)) + .route("/api/v1/policies/{id}", put(update_policy)) + .route("/api/v1/policies/{id}", delete(delete_policy)) +} diff --git a/crates/nvisy-server/src/routes/redact.rs b/crates/nvisy-server/src/handler/redact.rs similarity index 51% rename from crates/nvisy-server/src/routes/redact.rs rename to crates/nvisy-server/src/handler/redact.rs index a94c8bc..d5d57a7 100644 --- a/crates/nvisy-server/src/routes/redact.rs +++ b/crates/nvisy-server/src/handler/redact.rs @@ -4,18 +4,17 @@ use axum::{ routing::post, Json, }; -use crate::state::AppState; +use std::sync::Arc; +use nvisy_core::datatypes::redaction_context::RedactionContext; +use nvisy_engine::runs::RunManager; +use crate::service::AppState; -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/redact", post(redact)) -} - -#[derive(serde::Deserialize)] -struct RedactRequest { +#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] +pub(crate) struct RedactRequest { source: serde_json::Value, #[serde(default)] - detection: Option<serde_json::Value>, + #[schema(value_type = Option<Object>)] + context: Option<RedactionContext>, #[serde(default)] output: Option<serde_json::Value>, #[serde(rename = "policyId")] @@ -23,12 +22,21 @@ struct RedactRequest { policy_id: Option<String>, } +/// Submit a redaction request. 
+#[utoipa::path( + post, + path = "/api/v1/redact", + request_body = RedactRequest, + responses( + (status = 202, description = "Redaction accepted") + ) +)] async fn redact( - State(state): State<AppState>, + State(run_manager): State<Arc<RunManager>>, Json(_body): Json<RedactRequest>, ) -> (axum::http::StatusCode, Json<serde_json::Value>) { - let (run_id, _cancel_token) = state.run_manager.create_run().await; - state.run_manager.set_running(run_id).await; + let (run_id, _cancel_token) = run_manager.create_run().await; + run_manager.set_running(run_id).await; // TODO: build redaction graph from body and execute @@ -40,3 +48,8 @@ async fn redact( })), ) } + +pub fn router() -> Router<AppState> { + Router::new() + .route("/api/v1/redact", post(redact)) +} diff --git a/crates/nvisy-server/src/main.rs b/crates/nvisy-server/src/main.rs index 32a9c56..273c3fe 100644 --- a/crates/nvisy-server/src/main.rs +++ b/crates/nvisy-server/src/main.rs @@ -1,10 +1,11 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + mod app; -mod config; mod middleware; -mod routes; -mod schemas; +mod handler; mod service; -mod state; use tracing_subscriber::EnvFilter; @@ -16,7 +17,7 @@ async fn main() -> anyhow::Result<()> { .json() .init(); - let config = config::ServerConfig::from_env(); + let config = service::ServerConfig::from_env(); tracing::info!(host = %config.host, port = config.port, "Starting nvisy-server"); let app = app::build_app(&config).await?; diff --git a/crates/nvisy-server/src/routes/graphs.rs b/crates/nvisy-server/src/routes/graphs.rs deleted file mode 100644 index e8cf874..0000000 --- a/crates/nvisy-server/src/routes/graphs.rs +++ /dev/null @@ -1,71 +0,0 @@ -use axum::{ - Router, - extract::{Path, State}, - routing::{delete, get, post}, - Json, -}; -use uuid::Uuid; -use crate::state::AppState; - -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/graphs/execute", post(execute_graph)) - .route("/api/v1/graphs/validate", post(validate_graph)) - .route("/api/v1/graphs", get(list_runs)) - .route("/api/v1/graphs/{run_id}", get(get_run)) - .route("/api/v1/graphs/{run_id}", delete(cancel_run)) -} - -async fn execute_graph( - State(state): State<AppState>, - Json(_body): Json<serde_json::Value>, -) -> (axum::http::StatusCode, Json<serde_json::Value>) { - let (run_id, _cancel_token) = state.run_manager.create_run().await; - state.run_manager.set_running(run_id).await; - - // TODO: spawn actual graph execution - // For now, return the run ID - ( - axum::http::StatusCode::ACCEPTED, - Json(serde_json::json!({ - "runId": run_id.to_string(), - "status": "accepted" - })), - ) -} - -async fn validate_graph( - State(_state): State<AppState>, - Json(_body): Json<serde_json::Value>, -) -> Json<serde_json::Value> { - // TODO: validate graph against registry - Json(serde_json::json!({ "valid": true, "errors": [] })) -} - -async fn list_runs( - State(state): State<AppState>, -) -> Json<serde_json::Value> { - let runs = state.run_manager.list(None).await; - Json(serde_json::to_value(&runs).unwrap_or_default()) -} - -async fn get_run( - State(state): State<AppState>, - Path(run_id): Path<Uuid>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match state.run_manager.get(run_id).await { - Some(run) => Ok(Json(serde_json::to_value(&run).unwrap_or_default())), - None => Err(axum::http::StatusCode::NOT_FOUND), - } -} - -async fn cancel_run( - State(state): State<AppState>, - Path(run_id): Path<Uuid>, -) -> 
Result<Json<serde_json::Value>, axum::http::StatusCode> { - if state.run_manager.cancel(run_id).await { - Ok(Json(serde_json::json!({ "cancelled": true }))) - } else { - Err(axum::http::StatusCode::NOT_FOUND) - } -} diff --git a/crates/nvisy-server/src/routes/mod.rs b/crates/nvisy-server/src/routes/mod.rs deleted file mode 100644 index e839dc8..0000000 --- a/crates/nvisy-server/src/routes/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub mod audit; -pub mod graphs; -pub mod health; -pub mod policies; -pub mod redact; diff --git a/crates/nvisy-server/src/schemas/mod.rs b/crates/nvisy-server/src/schemas/mod.rs deleted file mode 100644 index 9c59885..0000000 --- a/crates/nvisy-server/src/schemas/mod.rs +++ /dev/null @@ -1 +0,0 @@ -// OpenAPI schema types can be defined here when utoipa integration is added. diff --git a/crates/nvisy-server/src/service/audit_store.rs b/crates/nvisy-server/src/service/audit_store.rs index 1972758..9945612 100644 --- a/crates/nvisy-server/src/service/audit_store.rs +++ b/crates/nvisy-server/src/service/audit_store.rs @@ -1,7 +1,7 @@ use std::sync::RwLock; use uuid::Uuid; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] pub struct StoredAudit { pub id: Uuid, pub action: String, diff --git a/crates/nvisy-server/src/config.rs b/crates/nvisy-server/src/service/config.rs similarity index 100% rename from crates/nvisy-server/src/config.rs rename to crates/nvisy-server/src/service/config.rs diff --git a/crates/nvisy-server/src/service/mod.rs b/crates/nvisy-server/src/service/mod.rs index cdd56c7..0145003 100644 --- a/crates/nvisy-server/src/service/mod.rs +++ b/crates/nvisy-server/src/service/mod.rs @@ -1,3 +1,31 @@ pub mod audit_store; +pub mod config; pub mod engine_factory; pub mod policy_store; +pub mod state; + +use std::sync::Arc; + +// Re-exports for convenience +pub use audit_store::AuditStore; +pub use config::ServerConfig; +pub use engine_factory::create_registry; +pub use policy_store::PolicyStore; +pub use state::AppState; + +macro_rules! impl_di { + ($($f:ident: $t:ty),+) => {$( + impl axum::extract::FromRef<AppState> for $t { + fn from_ref(state: &AppState) -> Self { + state.$f.clone() + } + } + )+}; +} + +impl_di! 
{ + registry: Arc<nvisy_core::registry::Registry>, + run_manager: Arc<nvisy_engine::runs::RunManager>, + policy_store: Arc<PolicyStore>, + audit_store: Arc<AuditStore> +} diff --git a/crates/nvisy-server/src/service/policy_store.rs b/crates/nvisy-server/src/service/policy_store.rs index 13fe6a5..cf8468e 100644 --- a/crates/nvisy-server/src/service/policy_store.rs +++ b/crates/nvisy-server/src/service/policy_store.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::sync::RwLock; use uuid::Uuid; -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] pub struct StoredPolicy { pub id: Uuid, pub name: String, diff --git a/crates/nvisy-server/src/state.rs b/crates/nvisy-server/src/service/state.rs similarity index 77% rename from crates/nvisy-server/src/state.rs rename to crates/nvisy-server/src/service/state.rs index f5a348e..fc66c6c 100644 --- a/crates/nvisy-server/src/state.rs +++ b/crates/nvisy-server/src/service/state.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use nvisy_engine::runs::RunManager; -use crate::service::audit_store::AuditStore; -use crate::service::policy_store::PolicyStore; +use super::audit_store::AuditStore; +use super::policy_store::PolicyStore; use nvisy_core::registry::Registry; /// Shared application state. diff --git a/docker/Dockerfile b/docker/Dockerfile index 96fcfbf..12e14c6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,34 +1,52 @@ -FROM node:22-alpine AS base +FROM rust:1.85-bookworm AS builder + +RUN apt-get update && apt-get install -y python3-dev python3-pip && rm -rf /var/lib/apt/lists/* + WORKDIR /app -FROM base AS deps -COPY package.json package-lock.json ./ -COPY packages/nvisy-core/package.json packages/nvisy-core/package.json -COPY packages/nvisy-runtime/package.json packages/nvisy-runtime/package.json -COPY packages/nvisy-server/package.json packages/nvisy-server/package.json -RUN npm ci - -FROM base AS build -COPY --from=deps /app/node_modules ./node_modules -COPY --from=deps /app/packages/nvisy-core/node_modules ./packages/nvisy-core/node_modules -COPY --from=deps /app/packages/nvisy-runtime/node_modules ./packages/nvisy-runtime/node_modules -COPY --from=deps /app/packages/nvisy-server/node_modules ./packages/nvisy-server/node_modules +# Copy manifests first to cache dependency builds +COPY Cargo.toml Cargo.lock ./ +COPY crates/nvisy-core/Cargo.toml crates/nvisy-core/Cargo.toml +COPY crates/nvisy-detect/Cargo.toml crates/nvisy-detect/Cargo.toml +COPY crates/nvisy-engine/Cargo.toml crates/nvisy-engine/Cargo.toml +COPY crates/nvisy-object/Cargo.toml crates/nvisy-object/Cargo.toml +COPY crates/nvisy-python/Cargo.toml crates/nvisy-python/Cargo.toml +COPY crates/nvisy-server/Cargo.toml crates/nvisy-server/Cargo.toml + +# Create empty src files to satisfy cargo's manifest checks +RUN for crate in nvisy-core nvisy-detect nvisy-engine nvisy-object nvisy-python; do \ + mkdir -p crates/$crate/src && echo "" > crates/$crate/src/lib.rs; \ + done && \ + mkdir -p crates/nvisy-server/src && echo "fn main() {}" > crates/nvisy-server/src/main.rs + +# Create stub READMEs for crates that use doc = include_str!("../README.md") +RUN for crate in nvisy-core nvisy-detect nvisy-engine nvisy-object nvisy-python nvisy-server; do \ + touch crates/$crate/README.md; \ + done + +ENV PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 + +# Cache dependency build +RUN cargo build --release 2>/dev/null || true + +# Copy full source and build COPY . . 
-RUN npm run build - -FROM base AS runtime -ENV NODE_ENV=production -COPY --from=deps /app/node_modules ./node_modules -COPY --from=deps /app/packages/nvisy-core/node_modules ./packages/nvisy-core/node_modules -COPY --from=deps /app/packages/nvisy-runtime/node_modules ./packages/nvisy-runtime/node_modules -COPY --from=deps /app/packages/nvisy-server/node_modules ./packages/nvisy-server/node_modules -COPY --from=build /app/packages/nvisy-core/dist ./packages/nvisy-core/dist -COPY --from=build /app/packages/nvisy-core/package.json ./packages/nvisy-core/package.json -COPY --from=build /app/packages/nvisy-runtime/dist ./packages/nvisy-runtime/dist -COPY --from=build /app/packages/nvisy-runtime/package.json ./packages/nvisy-runtime/package.json -COPY --from=build /app/packages/nvisy-server/dist ./packages/nvisy-server/dist -COPY --from=build /app/packages/nvisy-server/package.json ./packages/nvisy-server/package.json -COPY package.json package-lock.json ./ +RUN cargo build --release + +# Runtime stage +FROM debian:bookworm-slim AS runtime + +RUN apt-get update && apt-get install -y \ + python3 python3-pip python3-venv ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install Python packages +COPY packages/ /opt/nvisy/packages/ +RUN python3 -m pip install --break-system-packages \ + /opt/nvisy/packages/nvisy-ai \ + /opt/nvisy/packages/nvisy-exif + +COPY --from=builder /app/target/release/nvisy-server /usr/local/bin/nvisy-server EXPOSE 8080 -CMD ["node", "packages/nvisy-server/dist/main.js"] +CMD ["nvisy-server"] diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index e1da7d2..c17324f 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,339 +1,187 @@ # Nvisy Runtime — Architecture -**Technical architecture specification for the Nvisy Runtime ETL platform.** +**Technical architecture for the Nvisy Runtime data protection platform.** --- ## 1. Overview -Nvisy Runtime is a TypeScript-native, DAG-based ETL platform for AI data workloads. It is structured as a set of composable packages that can be consumed as a library or deployed as a long-lived server. - -This document defines the system architecture: package boundaries, data flow, execution model, connector interface, graph compilation, scheduling, error handling, and observability. It is intended as the authoritative reference for implementation. +Nvisy Runtime is a Rust-native, DAG-based data protection platform. It detects, classifies, and redacts sensitive data across documents, images, and streams. The system is structured as a Cargo workspace of composable crates, with Python extensions for AI-powered detection. --- -## 2. Package Structure - -The system is organized as a monorepo of npm packages under the `@nvisy` scope. +## 2. 
Crate Structure ``` +crates/ + nvisy-core/ Core types, traits, plugin registry, errors + nvisy-detect/ Regex patterns, checksum validation, policy evaluation, redaction + nvisy-engine/ Graph schema, DAG compiler, executor, run management + nvisy-object/ Object storage client and connectors (S3) + nvisy-python/ PyO3 bridge for Python AI modules + nvisy-server/ Axum HTTP server, handlers, middleware + packages/ - nvisy-core/ Primitives, type system, validation, errors, - base interfaces for sources, sinks, and actions, - and core observability (structured logging, metrics, tracing) - nvisy-plugin-sql/ SQL connectors (PostgreSQL, MySQL) - nvisy-plugin-object/ Object store and file format connectors (S3, GCS, Parquet, JSONL, CSV) - nvisy-plugin-vector/ Vector database connectors (Pinecone, Qdrant, Milvus, Weaviate, pgvector) - nvisy-runtime/ Graph definition, JSON parser, DAG compiler, execution engine, - task runner, retry logic, concurrency control, - runtime-level observability (run metrics, node tracing) - nvisy-server/ HTTP server (Hono), REST API, cron scheduler, dashboard backend, - server-level observability (request logging, health endpoints) + nvisy-ai/ Python: LLM-based NER detection + nvisy-exif/ Python: EXIF metadata reading/stripping ``` ### Dependency graph ``` - nvisy-server - / \ - ▼ ▼ - nvisy-runtime nvisy-plugin-{sql,ai,object,vector} - \ / - ▼ ▼ - nvisy-core + nvisy-server + / | \ + ▼ ▼ ▼ + nvisy-engine nvisy-detect nvisy-python + \ | / + ▼ ▼ ▼ + nvisy-core + ▲ + | + nvisy-object ``` -Every package depends on `nvisy-core`. The plugin packages (`nvisy-plugin-sql`, `nvisy-plugin-ai`, `nvisy-plugin-object`, `nvisy-plugin-vector`) are siblings — they depend on `nvisy-core` for the base source, sink, and action interfaces, but are independent of each other and independent of `nvisy-runtime`. The server (or any application) imports both the runtime and the desired plugins, then registers each plugin with the engine at startup. `nvisy-runtime` is the central package that owns graph definition, compilation, and execution — but has no compile-time dependency on any plugin. No circular dependencies are permitted. Packages communicate through typed interfaces, never through implementation details. - -### Observability distribution - -There is no dedicated observability package. Instead, observability is distributed across three layers: - -- **`nvisy-core`** defines the observability primitives: structured log format, metric types, trace span interface, and lineage record structure. It also exports the logging, metrics, and tracing utilities that all other packages use. -- **`nvisy-runtime`** emits runtime observability: graph run duration, node execution times, primitives processed/failed, connector call counts, rate limit wait times. Each graph run produces an OpenTelemetry-compatible trace with nodes as spans. -- **`nvisy-server`** emits server-level observability: HTTP request logging, health check endpoints, and metric export endpoints (Prometheus, OpenTelemetry). +Every crate depends on `nvisy-core`. Plugin crates (`nvisy-detect`, `nvisy-object`, `nvisy-python`) are independent of each other. The server imports everything and wires plugins into the engine at startup. --- -## 3. Idiomatic Modern JavaScript with Effection - -The platform is built on idiomatic modern JavaScript — `async`/`await`, `AsyncIterable`, native `Promise`, and generator functions — with **Effection** providing structured concurrency for the runtime's DAG executor. 
- -### 3.1 Design philosophy - -Traditional TypeScript ETL code suffers from scattered try/catch blocks, manual resource cleanup, ad-hoc retry logic, and opaque concurrency. Nvisy addresses these problems with standard language features and a minimal set of libraries: - -- **Typed errors.** A structured error hierarchy with machine-readable tags enables programmatic error handling. TypeScript discriminated unions and Zod schemas enforce correctness at both compile time and runtime. -- **Resource safety.** Connector lifecycle (connect, use, disconnect) is managed with explicit `try`/`finally` blocks in the node executor. Cleanup runs regardless of success, failure, or cancellation. -- **Structured concurrency.** The runtime's DAG executor uses Effection — a structured concurrency library built on generators. Spawned tasks are scoped to their parent, so halting a graph run automatically cancels all in-flight nodes without manual bookkeeping. -- **Streaming.** Data flows between nodes via the native `AsyncIterable` protocol. Plugin interfaces (sources, sinks, actions) accept and return `AsyncIterable` — no special streaming library required. -- **Validation.** Zod schemas provide runtime validation, TypeScript type derivation, and structured parse errors in a single definition. - -### 3.2 Effection in the runtime - -Effection is used exclusively in `nvisy-runtime` for DAG execution. It provides `spawn` (launch concurrent tasks), `race` (timeout handling), `sleep` (backoff delays), and `call` (bridge async functions into generator-based operations). The runtime wraps graph execution in an Effection task; each node runs as a spawned child operation. Plugin code never touches Effection — sources, sinks, and actions are plain `async` functions and `AsyncIterable` generators. - ---- - -## 4. Core (`nvisy-core`) - -The core package serves three purposes: it defines the **primitive type system** (the data model), it provides the **base interfaces** for building sources, sinks, and actions, and it houses the **observability primitives** used by all other packages. - -### 4.1 Primitive type hierarchy - -All data flowing through a graph is represented as a **Primitive**. Primitives are immutable, serializable, and carry both payload and metadata. - -The primitive type discriminant covers the full AI data surface: `embedding`, `completion`, `structured_output`, `tool_call_trace`, `image`, `audio`, `fine_tune_sample`, and `raw` (an escape hatch for untyped data). Each type maps to a specific payload shape — for example, an embedding payload contains the vector, its dimensionality, the producing model, source text, and a content hash. - -Primitives are validated at construction time. The factory enforces payload conformance and rejects malformed data with structured errors before it enters the graph. - -### 4.2 Metadata envelope +## 3. Core (`nvisy-core`) -Every primitive carries a standard metadata envelope: creation timestamp, producing source or node identifier, graph ID, run ID, user-defined tags, and an extensible custom fields map. This envelope enables correlation, filtering, and auditing across the entire data lifecycle. +### 3.1 Type system -### 4.3 Lineage +All data flowing through a graph is represented as a `DataValue` — a discriminated union of typed primitives: `Document`, `Blob`, `Entity`, `Redaction`, `Policy`, `Audit`, `Image`. Each carries a `DataItem` with UUID, parent lineage, and metadata. 
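For orientation only, the `DataValue` union described in §3.1 could be shaped roughly as below. The variant payloads and `DataItem` fields are inferred from the prose, not copied from `nvisy-core`, so treat every name here as an assumption.

```rust
use std::collections::BTreeMap;
use uuid::Uuid;

/// Envelope carried by every value: identity, parent lineage, metadata (sketch).
pub struct DataItem {
    pub id: Uuid,
    pub parent_id: Option<Uuid>,
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// Discriminated union of typed primitives; payload fields are elided here.
pub enum DataValue {
    Document(DataItem),  // parsed text content
    Blob(DataItem),      // raw bytes from a source
    Entity(DataItem),    // detected entity: category, type, value, confidence
    Redaction(DataItem), // a redaction to apply (method, span)
    Policy(DataItem),    // policy rules
    Audit(DataItem),     // audit record for compliance
    Image(DataItem),     // image data for visual detection
}
```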
-Each transformation appends a lineage record to the primitive: the node ID that performed the operation, the operation name, a timestamp, the IDs of input primitives, and the parameters used. This enables full forward and backward tracing — given a vector in a database, trace back to the source document and every transformation it passed through. +### 3.2 Traits -### 4.4 Source, Sink, and Action interfaces +Extension points are defined as async traits: +- **Action** — transforms data (detect, redact, classify, emit audit) +- **Loader** — parses blobs into documents (plaintext, CSV, JSON) +- **ProviderFactory** — creates authenticated client connections +- **StreamSource / StreamTarget** — reads from / writes to external systems -`nvisy-core` exports the abstract interfaces that all connectors and actions must implement. This is the extension contract of the platform: +### 3.3 Plugin registry -- **Source** — reads primitives from an external system. Declares supported primitive types. Returns an `AsyncIterable` stream of primitives with resumption context. -- **Sink** — writes primitives to an external system. Declares capabilities (batch size, upsert support, rate limits). Returns a write function that accepts individual primitives. -- **Action** — transforms primitives via the `pipe` method: receives an `AsyncIterable` of input primitives and returns an `AsyncIterable` of output primitives. Actions may be stateless (map, filter) or stateful (deduplicate, aggregate). +`PluginDescriptor` bundles actions, providers, sources, targets, and loaders under a namespace. `Registry` stores them keyed by `"plugin_id/item_id"` and resolves references at graph compilation time. -All three interfaces include lifecycle methods (connect, disconnect) and capability declarations. By housing these interfaces in `nvisy-core`, the connector packages and any community-contributed connectors share a single, versioned contract. All methods use standard `async`/`await` and `AsyncIterable` — no special runtime library is required. +### 3.4 Errors -### 4.5 Error taxonomy - -The core package defines a structured error hierarchy with machine-readable tags. The hierarchy distinguishes connector errors, validation errors, rate limit errors, timeout errors, graph compilation errors, and node execution errors. Each error class carries a `retryable` flag so the runtime can distinguish transient failures from terminal ones without string matching. - -### 4.6 Observability primitives - -Core defines the foundational observability types: structured log schema (JSON, with correlation IDs for graph, run, and node), metric types (counters, histograms, gauges), trace span interface (OpenTelemetry-compatible), and lineage record structure. It also exports utility functions for logging, metric emission, and span creation that all packages use uniformly. - -### 4.7 Utilities - -Common utilities shared across packages live in core: ULID generation, content hashing, primitive serialization/deserialization, and type-safe builder helpers for constructing primitives. +`Error` struct with `ErrorKind` enum (Validation, Connection, Timeout, Cancellation, Policy, Runtime, Python, Other). Carries optional source component, retryable flag, and boxed source error. `Result<T>` type alias for convenience. --- -## 5. Connector Packages - -### 5.1 Package separation rationale - -Connectors are split into three domain-specific packages rather than a single monolithic package. This serves two goals: - -1. 
**Install footprint.** Each connector package carries peer dependencies on the relevant client libraries (e.g., `@qdrant/js-client-rest`, `@aws-sdk/client-s3`, `pg`). Users who only need vector database connectors should not be forced to install SQL drivers or S3 SDKs. - -2. **Release independence.** A breaking change in a vector database client library should not force a release of the SQL connector package. Domain-specific packages can be versioned and released independently. - -### 5.2 `nvisy-plugin-sql` - -Implements the Source and Sink interfaces for relational databases. Initial targets: PostgreSQL and MySQL. Connectors handle connection pooling, query generation, type mapping between SQL types and primitive payloads, and batch insert/upsert operations. +## 4. Detection (`nvisy-detect`) -### 5.3 `nvisy-plugin-object` +### 4.1 Pattern detection -Implements the Source and Sink interfaces for object stores and file formats. Initial targets: S3, GCS, Parquet, JSONL, and CSV. Object store connectors handle multipart uploads, streaming reads, and prefix-based listing. File format connectors handle serialization, deserialization, schema inference, and chunked reading for large files. +Regex patterns are loaded from `assets/patterns.json` at startup. Each pattern defines: name, category, entity type, regex, confidence score, and optional validator reference. Validators (SSN format check, Luhn checksum) are registered in Rust code and resolved by name. -### 5.4 `nvisy-plugin-vector` +### 4.2 Actions -Implements the Source and Sink interfaces for vector databases. Initial targets: Pinecone, Qdrant, Milvus, Weaviate, and pgvector. Vector connectors handle collection/index management, upsert with metadata, batch operations, and dimensionality validation. +- **detect-regex** — scans documents against all or selected patterns, emits entities +- **detect-checksum** — validates entities with checksum algorithms (Luhn), boosts confidence +- **evaluate-policy** — filters entities against policy rules +- **apply-redaction** — applies redaction methods (mask, replace, hash, etc.) +- **classify** — categorizes documents based on detected entities +- **emit-audit** — produces audit records for compliance -### 5.5 Plugin registration +### 4.3 Loaders -Plugins are registered with the `Engine` at startup via `engine.register(plugin)`. Each plugin bundles providers, streams, and actions under a namespace (e.g. `"sql"`, `"ai"`). The graph compiler resolves references like `"sql/postgres"` or `"ai/embed"` against the registry at compilation time. Community plugins install as npm packages and export a `PluginInstance` implementing the standard interface from `nvisy-core`. +- **plaintext** — loads text files +- **csv** — loads CSV with header detection +- **json** — loads JSON documents --- -## 6. Runtime (`nvisy-runtime`) - -The runtime package owns the full lifecycle of a graph: definition, compilation, and execution. It is the central package — plugin packages register their providers, streams, and actions into the runtime's `Engine`, and the server orchestrates this wiring at startup. - -### 6.1 Graph definition and JSON serializability - -The central design constraint is that every graph must be representable as a plain JSON document. This means graphs can be stored in a database, versioned in source control, transmitted over an API, diffed, and reconstructed without loss. The programmatic TypeScript API is a convenience layer that produces the same JSON structure. 
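Returning to the pattern-detection design in §4.1 above: one plausible shape for an `assets/patterns.json` entry and its deserialized Rust counterpart is sketched here. The field names and sample values are assumptions based on the description, not the file shipped with `nvisy-detect`.

```rust
use serde::Deserialize;

/// One entry of `assets/patterns.json` as described in §4.1 (illustrative).
#[derive(Debug, Deserialize)]
pub struct PatternSpec {
    pub name: String,              // e.g. "us-ssn"
    pub category: String,          // e.g. "pii"
    pub entity_type: String,       // e.g. "ssn"
    pub regex: String,             // compiled once at startup
    pub confidence: f64,           // base confidence score for matches
    #[serde(default)]
    pub validator: Option<String>, // resolved by name against Rust validators, e.g. "luhn"
}

/// Parse the whole pattern file into memory.
pub fn load_patterns(json: &str) -> Result<Vec<PatternSpec>, serde_json::Error> {
    serde_json::from_str(json)
}
```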
- -A graph is a directed acyclic graph of **nodes**. Each node declares its type (source, action, sink, branch, fanout, fanin), its configuration, its upstream dependencies, and optional execution policies (retry, timeout, concurrency). Nodes are connected by edges derived from the dependency declarations. - -The graph JSON schema is defined using **Zod**. This provides runtime validation, TypeScript type derivation, and structured parse errors in a single definition. - -### 6.2 Node types - -**Source** — References a registered source connector by name and provides connection and extraction configuration. - -**Action** — Applies a transformation to primitives. Actions are resolved by name from the registry. The runtime provides built-in generic actions (filter, map, batch, deduplicate, validate, convert). Domain-specific actions (embed, chunk, complete) are provided by plugin packages (`nvisy-plugin-ai`, etc.) and registered into the engine by the application at startup. - -**Sink** — References a registered sink connector by name and provides connection and load configuration. - -**Branch** — Routes each primitive to one of several downstream nodes based on a predicate. A default route handles unmatched primitives. +## 5. Engine (`nvisy-engine`) -**FanOut** — Duplicates each primitive to multiple downstream nodes for parallel processing (e.g., embedding the same text with multiple models simultaneously). +### 5.1 Graph schema -**FanIn** — Collects primitives from multiple upstream nodes and merges them into a single stream. +Graphs are JSON structures with typed nodes (Source, Action, Target) and edges. Each node declares its provider/action reference, parameters, and optional retry/timeout policies. -### 6.3 Registries +### 5.2 Compilation -The runtime uses three registries for resolving node references to implementations: +The compiler validates graph structure: parses JSON against the schema, checks for cycles via topological sort, verifies all node references resolve against the registry, and validates type compatibility between connected nodes. -- **SourceRegistry** — Maps source names to `DataSource` factories. Populated by plugin packages (`nvisy-plugin-sql`, `nvisy-plugin-vector`, etc.) via `engine.register()`. -- **SinkRegistry** — Maps sink names to `DataSink` factories. Same population model. -- **ActionRegistry** — Maps action names to `Action` factories. Pre-loaded with the runtime's built-in generic actions. Extended by plugin packages (`nvisy-plugin-ai`, etc.). +### 5.3 Execution -Separation ensures type safety (each registry returns the correct interface without narrowing), independent testability (mock only what you need), and clear error messages ("unknown source: xyz" vs "unknown action: xyz"). Plugin packages export a `PluginInstance` that the application registers with the engine at startup — the runtime itself has no compile-time dependency on any plugin. +The executor runs nodes in topological order. Data flows between nodes via `tokio::sync::mpsc` channels. Each node runs as a spawned task. The executor tracks per-node progress and aggregates results into a `RunResult`. -### 6.4 Built-in actions +### 5.4 Run management -The runtime owns a set of generic, data-plane actions that operate on primitives without external dependencies: +`RunManager` tracks all in-flight runs with status (pending, running, success, partial failure, failure, cancelled), progress per node, and cancellation tokens. -- **filter** — Drop primitives that don't match a predicate. 
-- **map** — Transform primitive fields (rename, reshape, project). -- **batch** — Group N primitives into batches before forwarding downstream. -- **deduplicate** — Drop duplicates by content hash or user-defined key. -- **validate** — Assert primitives match a schema; route failures to DLQ. -- **convert** — Cast between primitive types (Row → Document, etc.). +### 5.5 Policies -These are pre-registered in the ActionRegistry. Domain-specific actions (embed, chunk, complete, extract) live in their respective packages. +Retry policies (fixed, exponential, jitter backoff) and timeout policies are configurable per node. -### 6.5 Graph compilation - -The compiler pipeline transforms raw JSON into an executable plan in three stages: - -1. **Parse** (`compiler/parse.ts`) — Decode the raw JSON against the Zod graph schema. This produces a typed, validated graph structure or structured errors with JSON paths. - -2. **Validate** (`compiler/validate.ts`) — Structural DAG validation: cycle detection via topological sort, dangling node references, type compatibility between connected nodes (source output types match downstream action input types), and connector/action name resolution against the registries. - -3. **Plan** (`compiler/plan.ts`) — Build the `ExecutionPlan`: resolve all names to concrete implementations via the registries, compute the topological execution order, aggregate concurrency constraints, and wire rate limit policies. The `ExecutionPlan` is an immutable data structure. Compilation throws structured errors on failure. - -### 6.6 Execution model - -The engine executes an `ExecutionPlan` as an Effection task. It spawns each node as a child operation, with dependency tracking ensuring nodes wait for their upstream inputs before executing. Effection's structured concurrency guarantees that halting the top-level task automatically cancels all in-flight nodes. The executor maintains a **done store** of completed results (both successes and terminal failures) and coordinates data flow via inter-node queues. - -### 6.7 Data flow between nodes - -Each edge in the DAG is backed by an Effection queue. Producer nodes push primitives into the queue; consumer nodes pull from it. For action nodes that expect an `AsyncIterable` input, the runtime bridges Effection queues into a `ReadableStream` via a `TransformStream`, allowing actions to consume data with standard `for await...of` iteration. - -Data flows through the system using the native `AsyncIterable` protocol. Sources yield items, actions pipe one `AsyncIterable` to another, and sinks consume items via a write function. No special streaming library is required — the platform relies entirely on JavaScript's built-in async iteration. - -For nodes that require all upstream data before starting (e.g., deduplication across the full dataset), a **materialization barrier** can be configured. The barrier drains the upstream queue into an array before forwarding to the node. +--- -Fan-out nodes push each primitive to multiple downstream queues. Fan-in nodes pull from multiple upstream queues and merge into a single stream. +## 6. Server (`nvisy-server`) -### 6.8 Retry policy +### 6.1 Role -Each node can define a retry policy specifying maximum retries, backoff strategy (fixed, exponential, or jitter), initial and maximum delay, and an optional allowlist of retryable error codes. The runtime implements retries as a generator-based loop using Effection's `sleep` for backoff delays. 
It distinguishes between retryable errors (network timeouts, rate limits, transient API failures) and terminal errors (authentication failures, schema violations, invalid configuration). Terminal errors fail the node immediately. +Short-lived Axum HTTP server. Accepts graph definitions, compiles and executes them, reports status. Designed for containerized deployment. -### 6.9 Rate limiting +### 6.2 REST API -Rate limits are enforced per-connector. When a node issues a request that would exceed the rate limit, the operation is suspended until tokens are available. Rate limits are declared in connector capabilities and can be overridden in graph configuration. +| Method | Path | Description | +|----------|-----------------------------|--------------------------------------| +| `GET` | `/health` | Liveness probe | +| `GET` | `/ready` | Readiness probe | +| `POST` | `/api/v1/graphs/execute` | Submit graph for execution | +| `POST` | `/api/v1/graphs/validate` | Validate graph without executing | +| `GET` | `/api/v1/graphs` | List runs | +| `GET` | `/api/v1/graphs/{runId}` | Get run status | +| `DELETE` | `/api/v1/graphs/{runId}` | Cancel run | +| `POST` | `/api/v1/redact` | Submit redaction request | +| `POST` | `/api/v1/policies` | Create policy | +| `GET` | `/api/v1/policies` | List policies | +| `GET` | `/api/v1/policies/{id}` | Get policy | +| `PUT` | `/api/v1/policies/{id}` | Update policy | +| `DELETE` | `/api/v1/policies/{id}` | Delete policy | +| `GET` | `/api/v1/audit` | Query audit records | +| `GET` | `/api/v1/audit/{runId}` | Get audit records for a run | -### 6.10 Concurrency control +### 6.3 Middleware -Global concurrency is bounded by a configurable limit (default: 10 permits). Per-node concurrency can be set individually. The runtime respects both limits simultaneously. The concurrency pool (`engine/pool.ts`) manages this. +- Request ID injection (`X-Request-Id`) +- Request/response tracing via `tower-http` +- CORS -### 6.11 Runtime observability +### 6.4 Service layer -The runtime emits structured metrics and trace spans for every graph run. Each run is an OpenTelemetry trace; each node execution is a span within that trace. Metrics include run duration, run status (success, partial failure, failure), per-node execution time, primitives processed and failed, connector calls issued, and rate limit wait time. +- `PolicyStore` — in-memory policy CRUD +- `AuditStore` — in-memory audit record storage +- `AppState` — shared state (registry, run manager, stores) +- `ServerConfig` — configuration from environment variables --- -## 7. Server (`nvisy-server`) +## 7. Python Extensions -### 7.1 Role +### 7.1 PyO3 bridge -The Node.js server is a **stateless execution worker**. It accepts graph JSON, compiles and executes it via the runtime, and reports status of in-flight runs. It does not persist graph definitions, run history, or lineage — that responsibility belongs to a separate persistent server (written in Rust). This package is a thin HTTP interface over the runtime engine. +`PythonBridge` manages Python interpreter access via `pyo3`. Functions run on `spawn_blocking` threads to avoid blocking the async runtime. The GIL is acquired per-call. -### 7.2 HTTP layer +### 7.2 AI detection -The HTTP layer is built on **Hono**, a lightweight, edge-compatible web framework. Hono provides routing, middleware composition, and request validation with minimal overhead. The server entry point is `main.ts`, which starts a Node.js HTTP server via `@hono/node-server`. 
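Two short sketches follow for the new documentation above. First, a hypothetical graph in the JSON shape described in §5.1: nodes reference registry entries by `"plugin_id/item_id"` (§3.3) and edges use the `from`/`to` fields visible on `GraphEdge` earlier in this patch. The node field names (`id`, `kind`, `uses`, `params`) and the plugin namespaces are illustrative guesses, not the real schema.

```rust
use serde_json::json;

// Hypothetical detect-and-redact graph; only GraphEdge's from/to fields are
// taken from this patch, everything else is an assumed shape.
fn example_graph() -> serde_json::Value {
    json!({
        "nodes": [
            { "id": "load",   "kind": "source", "uses": "object/s3",              "params": { "bucket": "incoming" } },
            { "id": "detect", "kind": "action", "uses": "detect/detect-regex",    "params": {} },
            { "id": "redact", "kind": "action", "uses": "detect/apply-redaction", "params": { "method": "mask" } }
        ],
        "edges": [
            { "from": "load",   "to": "detect" },
            { "from": "detect", "to": "redact" }
        ]
    })
}
```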
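Second, the call pattern described in §7.1 (blocking thread plus per-call GIL) as a minimal sketch. It assumes the pyo3 Bound API (0.23 or later) and uses placeholder module and function names rather than the real `nvisy-ai` entry points.

```rust
use pyo3::prelude::*;
use pyo3::types::PyModule;

/// Hop onto a blocking thread, acquire the GIL there, call into Python,
/// and surface join errors the same way the handlers above do.
async fn call_python_detect(text: String) -> anyhow::Result<String> {
    tokio::task::spawn_blocking(move || {
        Python::with_gil(|py| {
            let module = PyModule::import(py, "nvisy_ai")?; // placeholder module name
            let result = module.getattr("detect")?.call1((text,))?; // placeholder function
            result.extract::<String>()
        })
        .map_err(anyhow::Error::from)
    })
    .await? // JoinError from the blocking task
}
```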
+The `nvisy-ai` Python package provides LLM-based NER for text and images. Called from Rust via the bridge, it returns entity dicts that are parsed into `Entity` structs. -### 7.3 Middleware +### 7.3 EXIF handling -All requests pass through two middleware layers: - -- **Request ID** — Assigns a unique `X-Request-Id` header to every request for correlation. -- **Request logger** — Emits structured JSON logs (method, path, status, latency, request ID) for every request. - -### 7.4 REST API - -The API surface covers health checks, graph execution, validation, and in-flight run management. All endpoints accept and return JSON. Connectors are defined within the graph JSON schema, not managed separately. - -| Method | Path | Description | -|--------|------|-------------| -| `GET` | `/health` | Liveness probe | -| `GET` | `/ready` | Readiness probe | -| `POST` | `/api/v1/graphs/execute` | Submit a graph for execution; returns `{ runId }` immediately | -| `POST` | `/api/v1/graphs/validate` | Compile and validate a graph without executing | -| `GET` | `/api/v1/graphs` | List in-flight runs | -| `GET` | `/api/v1/graphs/:runId` | Get detailed status of a single in-flight run | -| `DELETE` | `/api/v1/graphs/:runId` | Cancel a running execution | - -### 7.5 Server observability - -The server layer emits HTTP request logs (structured JSON with method, path, status, latency, request ID) and exposes health check and readiness endpoints. +The `nvisy-exif` Python package reads and strips EXIF metadata from images using Pillow. --- ## 8. Error Handling -### 8.1 Error propagation - -When a node encounters an error, the runtime first checks retryability via the error's `retryable` flag. If the error is retryable and retries remain, the node is re-attempted with backoff. If the error is terminal or retries are exhausted, the node is marked as failed. Downstream nodes that depend on the failed node are marked as skipped. Independent branches of the DAG continue executing. The graph run is marked as `partial_failure` or `failure` depending on whether any terminal sink node succeeded. - -Errors are caught at the node executor boundary and recorded in the run result. The structured error hierarchy ensures consistent, machine-readable failure information at every layer. - -### 8.2 Dead letter queue - -Primitives that fail processing can be routed to a dead letter queue (DLQ) instead of failing the entire node. This allows the graph to continue processing valid data while capturing failures for later inspection and replay. +Errors carry an `ErrorKind`, message, optional source component, retryable flag, and optional boxed source error. The runtime distinguishes transient failures (retry with backoff) from terminal failures (fail immediately). Downstream nodes dependent on a failed node are skipped. --- ## 9. Security -### 9.1 Secret management - -Connector credentials are never stored in graph definitions. They are resolved at runtime from environment variables, a pluggable secret provider interface (supporting AWS Secrets Manager, HashiCorp Vault, and similar systems), or `.env` files in development. - -### 9.2 Network and access control - -In server mode, the REST API supports TLS termination, bearer token authentication, IP allowlisting, and CORS configuration via Hono middleware. - -### 9.3 Data handling - -Primitives may contain sensitive data (PII in completions, proprietary embeddings). 
The platform provides configurable data retention policies per graph, primitive redaction hooks for logging, and encryption at rest for server-mode persistence. - ---- - -## 10. Performance Considerations - -### 10.1 Memory management - -Primitives are processed in streaming fashion wherever possible using `AsyncIterable`. Nodes that must materialize full datasets (deduplication, sorting) use configurable memory limits and spill to disk when exceeded. - -### 10.2 Embedding vectors - -Embedding vectors use `Float32Array` for memory efficiency — a 1536-dimensional embedding occupies 6 KB vs. approximately 24 KB as a JSON number array. For large-scale embedding workloads, this 4x reduction is significant. - -### 10.3 Batching - -Connectors declare their optimal batch size. The runtime automatically batches primitives to match, reducing round trips to external systems. Batching respects rate limits — a batch that would exceed the rate limit is delayed, not split. - -### 10.4 Backpressure - -Effection queues between nodes provide natural backpressure. When a downstream node processes slower than its upstream, the connecting queue suspends the upstream operation until the downstream is ready. This prevents memory exhaustion in unbalanced graphs without manual flow control. - ---- - -## 11. Extension Points - -The platform is designed for extension at multiple levels: - -| Extension Point | Mechanism | Example | -|----------------|-----------|---------| -| Custom primitive types | Extend the primitive type union and implement a payload interface | Graph embedding type | -| Custom connectors | Implement Source or Sink from `nvisy-core` | Elasticsearch connector | -| Custom actions | Implement Action from `nvisy-core`, or provide an inline function | Custom chunking strategy | -| Custom secret providers | Implement the SecretProvider interface | Azure Key Vault integration | -| Custom metric exporters | Implement the MetricExporter interface | StatsD exporter | -| Custom storage backends | Implement the StorageBackend interface (server mode) | MongoDB storage | +- Credentials resolved from environment variables, never stored in graph definitions +- TLS termination and CORS via middleware +- Detection patterns configurable per deployment +- Audit trail for all detection and redaction operations diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 17517d0..470dc62 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -1,6 +1,6 @@ # Nvisy Runtime — Development -**Technology choices and development roadmap for the Nvisy Runtime platform.** +**Technology choices and development roadmap.** --- @@ -8,126 +8,57 @@ | Concern | Choice | Rationale | |---------|--------|-----------| -| Language | TypeScript | Type safety for the primitive system, broad ecosystem | -| Module system | ESM only | Modern standard, native in Node.js, tree-shakeable | -| Runtime | Node.js | Async I/O suited for connector-heavy workloads, npm ecosystem | -| Structured concurrency | Effection | Generator-based structured concurrency for DAG execution | -| Validation | Zod | Runtime validation, TypeScript type derivation, structured parse errors | -| Graph library | Graphology | DAG construction, cycle detection, topological sort | -| Package manager | npm workspaces | Monorepo management without additional tooling | -| Build | tsup | Fast TypeScript compilation, ESM output, declaration generation | -| Testing | Vitest | Fast, TypeScript-native, ESM-compatible | -| Linting | Biome | Unified formatter and linter, high 
performance | -| HTTP framework | Hono | Lightweight, edge-compatible, fast routing, middleware composition | -| Cron | croner | Lightweight, timezone-aware scheduling | +| Language | Rust | Performance, memory safety, zero-cost abstractions | +| Python extensions | PyO3 | AI/ML model inference where Python ecosystem dominates | +| Async runtime | Tokio | Industry-standard async I/O for Rust | +| HTTP framework | Axum | Tower-based, ergonomic, high performance | +| Serialization | Serde | De facto standard for Rust serialization | +| Graph library | Petgraph | DAG construction, cycle detection, topological sort | +| OpenAPI | utoipa | Compile-time OpenAPI spec generation | +| JSON Schema | schemars | Derive-based JSON Schema for all types | +| Testing | cargo test | Built-in test framework | +| Linting | clippy | Standard Rust linter | +| Formatting | rustfmt | Standard Rust formatter | +| Build | Cargo workspaces | Monorepo management | +| CI | GitHub Actions | Rust toolchain with cargo check, clippy, test, build | +| Python packaging | uv | Fast Python package management | +| Container | Docker | Multi-stage Rust build with Python runtime | --- ## Development Roadmap -### Phase 1 — Foundation +### Phase 1 — Foundation (complete) -Core infrastructure and proof-of-concept connectors. - -- **`nvisy-core`** - - Primitive type system (embedding, completion, structured_output, tool_call_trace, image, audio, fine_tune_sample, raw) - - Zod-based validation and type derivation - - Error taxonomy with machine-readable tags and retryable flags - - Base Source, Sink, and Action interfaces (AsyncIterable-based) - - Observability primitives (structured logging, metrics, tracing) - - Utility library (ULID generation, content hashing, serialization) - -- **`nvisy-runtime`** - - Graph JSON schema definition (Zod) - - JSON parser and graph validator - - DAG compiler (cycle detection, dependency resolution, execution planning) - - Execution engine with Effection-based structured concurrency - - Retry policies (fixed, exponential, jitter backoff) - - Timeout policies (per-node execution limits) - - Concurrency control (global and per-node limits) - - Built-in generic actions (filter, map, batch, deduplicate, validate, convert) - - Runtime metrics and OpenTelemetry tracing - -- **`nvisy-plugin-object`** - - S3 source and sink (multipart upload, streaming read, prefix listing) - - JSONL source and sink (line-delimited JSON, schema inference) - -- **`nvisy-plugin-vector`** - - Qdrant source and sink (collection management, upsert with metadata, dimensionality validation) +- **`nvisy-core`** — Type system, traits, plugin registry, error handling +- **`nvisy-detect`** — Regex detection, checksum validation, policy evaluation, redaction +- **`nvisy-engine`** — Graph schema, DAG compiler, executor, run management +- **`nvisy-object`** — S3 object storage connector +- **`nvisy-python`** — PyO3 bridge, AI NER actions +- **`nvisy-server`** — Axum server, REST API, middleware +- **`nvisy-ai`** — Python LLM-based NER +- **`nvisy-exif`** — Python EXIF metadata handling ### Phase 2 — Breadth -Expand connector coverage, add domain-specific actions. 
- -- **`nvisy-plugin-vector`** - - Pinecone connector - - Milvus connector - - Weaviate connector - - pgvector connector - -- **`nvisy-plugin-sql`** - - PostgreSQL source and sink (connection pooling, query generation, batch upsert) - - MySQL source and sink - - MSSQL source and sink - -- **`nvisy-plugin-object`** - - GCS source and sink - - Parquet source and sink (columnar read/write, schema mapping) - - CSV source and sink (header detection, type inference, chunked reading) - -- **`nvisy-plugin-ai`** - - Embedding action (multi-provider: OpenAI, Anthropic, Cohere, Gemini) - - Chunking actions (fixed-size, contextual, similarity-based) - - Completion action (structured output extraction) - - Enrichment action (metadata augmentation via LLM) - -- **Runtime additions** - - Dead letter queue support (per-node failure routing) - - Dry-run mode (compile and validate without executing) - - Resumable execution (checkpoint and resume from last successful context) - -### Phase 3 — Server - -HTTP server, scheduling, and operational tooling. - -- **`nvisy-server`** - - REST API (Hono) for graph execution, validation, and run management - - Cron scheduler (croner) for time-based pipeline triggers - - Webhook-based event triggers - - Request logging and structured observability - - Health and readiness endpoints - -- **Storage backends** - - SQLite for development and single-node deployments - - PostgreSQL for production deployments - -- **Web dashboard** - - Run monitoring and status visualization - - Lineage exploration (trace primitives through transformations) - - Failure inspection and replay - -### Phase 4 — Production Hardening - -Performance, security, and operational maturity. +- Additional detection patterns (IBAN, passport, driver's license) +- Image-based detection (face detection, license plates, document OCR) +- Additional storage connectors (GCS, Azure Blob) +- SQL connectors (PostgreSQL, MySQL) for audit persistence +- Webhook-based event triggers -- **Performance** - - Backpressure tuning and memory management - - Disk spill for materialization nodes (deduplication, sorting over large datasets) - - Batching optimization (adaptive batch sizing based on connector feedback) - - Performance benchmarks and profiling +### Phase 3 — Production Hardening -- **Security** - - Secret provider integrations (AWS Secrets Manager, HashiCorp Vault, Azure Key Vault) - - TLS termination and certificate management - - Bearer token authentication and API key management - - IP allowlisting and CORS configuration +- Performance benchmarks and optimization +- Backpressure and memory management +- Graceful shutdown and in-flight run draining +- Secret provider integrations (AWS Secrets Manager, HashiCorp Vault) +- Rate limiting per connector +- Resumable execution with checkpoints -- **Operational** - - Graceful shutdown and in-flight run draining - - Configuration hot-reload - - Structured alerting on pipeline failures +### Phase 4 — Ecosystem -- **Community** - - Plugin SDK documentation and examples - - Connector contribution guide - - Published npm packages with semantic versioning +- Plugin SDK documentation +- Community connector contribution guide +- Published crates on crates.io +- Dashboard UI for run monitoring and audit inspection diff --git a/docs/README.md b/docs/README.md index a7924e8..4bbfcc8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,118 +1,93 @@ # Nvisy Runtime -**An open-source ETL platform purpose-built for LLM and AI data pipelines.** +**A data protection runtime for AI 
pipelines.** --- ## Abstract -The proliferation of large language models and embedding-based retrieval systems has created a new category of data engineering problem. Teams building AI-powered products must continuously move, transform, and validate data across a fragmented ecosystem of vector databases, model APIs, object stores, relational databases, and file formats — each with its own schema conventions, rate limits, and failure modes. +AI-powered products handle sensitive data at every stage — ingestion, transformation, enrichment, and storage. PII in documents, faces in images, credentials in logs, and financial data in spreadsheets all require detection, classification, and redaction before downstream consumption. -Existing ETL platforms were designed for tabular, row-oriented data. They lack first-class support for the primitives that define AI workloads: high-dimensional embeddings, completion traces, structured outputs, tool-call logs, audio and image payloads, and fine-tuning datasets. Engineers are left stitching together ad-hoc scripts, battling impedance mismatches between systems that were never designed to interoperate. - -**Nvisy Runtime** addresses this gap. It is an open-source, TypeScript-native ETL platform that treats AI data as a first-class citizen. It provides a DAG-based execution engine, a typed primitive system for AI workloads, a broad connector ecosystem, and a declarative graph definition language — all designed to make AI data engineering reliable, observable, and composable. +**Nvisy Runtime** is a Rust-native data protection platform that treats sensitive data detection as a first-class pipeline primitive. It provides a DAG-based execution engine, typed data primitives with lineage tracking, regex and AI-powered entity detection, configurable redaction policies, and a pluggable connector system — all designed for throughput, correctness, and auditability. --- ## Problem Statement -### 1. AI data is structurally different from traditional data +### 1. Sensitive data is everywhere in AI pipelines -An embedding is not a row. A completion trace carries metadata (model, temperature, token counts, latency, cost) that has no analog in a traditional ETL schema. Fine-tuning datasets impose strict structural contracts. Tool-call sequences are trees, not tables. Current ETL platforms force these structures into tabular representations, losing semantic information and making transformations error-prone. +Documents, images, API responses, and model outputs all carry PII, PHI, financial data, and credentials. Manual redaction doesn't scale. Teams need automated, configurable detection and redaction that runs inline with their data pipelines. -### 2. The connector ecosystem is fragmented and immature +### 2. Detection requires multiple methods -Vector databases (Pinecone, Qdrant, Milvus, Weaviate, pgvector) expose incompatible APIs for upsert, query, and metadata filtering. Model provider APIs (OpenAI, Anthropic, Cohere, local inference servers) differ in authentication, rate limiting, batching, and response structure. Object stores, relational databases, and file formats each add their own integration surface. Teams rewrite the same connector logic project after project. +Regex patterns catch structured data (SSNs, emails, credit cards). AI-powered NER catches unstructured entities (names, addresses, medical terms). Checksum validation reduces false positives. A production system needs all three, composable in a single pipeline. -### 3. 
Pipeline orchestration for AI workloads has unique requirements +### 3. Redaction must be auditable -AI pipelines are not simple source-to-sink flows. They involve conditional branching (route data based on classification), fan-out (embed the same text with multiple models), rate-limited external calls (respect API quotas), idempotent retries (avoid duplicate embeddings), and cost tracking (monitor spend per pipeline run). General-purpose orchestrators like Airflow or Prefect can model these patterns, but they provide no native abstractions for them. +Compliance (GDPR, HIPAA, PCI-DSS) requires proof of what was detected, what was redacted, and how. Every detection and redaction action must produce an audit trail with full lineage. -### 4. Observability is an afterthought +### 4. Performance matters -When an embedding pipeline fails at 3 AM, engineers need to know: which records failed, at which stage, with what error, and whether retrying is safe. They need lineage — the ability to trace a vector in a database back to the source document, through every transformation it passed through. Current tooling provides none of this out of the box. +Data protection runs on every record. The runtime must handle high throughput without becoming a bottleneck. Rust provides the performance foundation; Python extensions handle AI workloads where model quality matters more than latency. --- ## Design Principles -### AI-native type system +### Typed data primitives -Every data object flowing through a Nvisy graph is a typed AI primitive: `Embedding`, `Completion`, `StructuredOutput`, `ToolCallTrace`, `ImagePayload`, `AudioPayload`, `FineTuneSample`, or a user-defined extension. Primitives carry domain-specific metadata and enforce structural contracts at compile time (TypeScript) and runtime (validation). +Every data object flowing through a graph is typed: `Document`, `Blob`, `Entity`, `Redaction`, `Policy`, `Audit`, `Image`. Primitives carry metadata and enforce structural contracts at compile time (Rust) and runtime (serde validation). ### DAG-based execution -Graphs are directed acyclic graphs of nodes. The runtime resolves dependencies, manages parallelism, handles retries, and tracks execution state. This model supports conditional branching, fan-out/fan-in patterns, and partial re-execution of failed subgraphs — all essential for production AI workloads. - -### Declarative-first, code-escape-hatch - -Common operations (extract, map, filter, chunk, embed, deduplicate, load) are expressed declaratively in JSON graph definitions. For operations that require custom logic, users drop into TypeScript functions that receive and return typed primitives. The declarative layer compiles down to the same execution graph as hand-written code. Because graphs are plain JSON, they are trivially serializable, storable, versionable, and transmittable over the wire. +Graphs are directed acyclic graphs of nodes (sources, actions, targets). The engine resolves dependencies, manages concurrency, handles retries, and tracks execution state. -### Idiomatic modern JavaScript with structured concurrency +### Regex + AI detection -The platform is built on idiomatic modern JavaScript — `async`/`await`, `AsyncIterable`, and generator functions — with Effection providing structured concurrency for the runtime's DAG executor. This keeps the plugin interface simple (standard async code) while giving the execution engine automatic task cancellation, timeout handling, and resource cleanup. 
+Built-in regex patterns detect structured sensitive data. Python-based NER (via PyO3) detects unstructured entities. Both produce the same `Entity` type, composable in a single pipeline. -### Broad, pluggable connectors +### Plugin architecture -Connectors are organized into domain-specific packages — SQL, object storage, and vector databases — each installable independently. All connectors implement a standard source/sink interface defined in `nvisy-core`, making community contributions straightforward. Users install only the connector packages they need. +Connectors, actions, and loaders register through a plugin system. Each plugin bundles its capabilities under a namespace. The engine resolves references at compilation time. -### Library and server modes +### Audit-first -Nvisy Runtime can be embedded as an npm package for programmatic use or deployed as a long-lived server with a REST API, scheduler, and dashboard. The same graph definition works in both modes. +Every detection and redaction produces an `Audit` record. Policies define what to detect and how to redact. The audit trail provides full lineage from source document to redacted output. --- ## Core Concepts -### Primitives +### Entities -A **primitive** is the unit of data in a Nvisy graph. Unlike raw JSON blobs, primitives are typed, validated, and carry metadata relevant to their domain. For example, an `Embedding` primitive contains the vector, its dimensionality, the model that produced it, the source text, and a content hash for deduplication. +An **Entity** is a detected piece of sensitive data: its category (PII, PHI, financial, credentials), type (SSN, email, face), value, confidence score, detection method, and location within the source document or image. -### Graphs +### Policies -A **graph** is a DAG of **nodes**. Each node is one of: +A **Policy** defines detection and redaction rules: which entity categories to scan, minimum confidence thresholds, and per-type redaction methods (mask, replace, hash, encrypt, remove, blur, block, synthesize). -- **Source** — reads data from an external system via a connector -- **Action** — applies a transformation or declarative operation to primitives -- **Sink** — writes data to an external system via a connector -- **Branch** — routes primitives to different downstream nodes based on a condition -- **FanOut / FanIn** — duplicates primitives across parallel subgraphs and merges results +### Graphs -Graphs are defined as JSON structures. This makes them inherently serializable — they can be stored in a database, versioned in source control, transmitted over an API, and reconstructed without loss of fidelity. The programmatic TypeScript API produces the same JSON representation. +A **Graph** is a DAG of nodes. Source nodes read data, action nodes detect/redact/classify, and target nodes write results. Graphs are defined as JSON and compiled into execution plans. ### Connectors -A **connector** is an adapter that knows how to read from or write to an external system. All connectors implement the source and sink interfaces defined in `nvisy-core`, and declare their capabilities (batch size, rate limits, supported primitive types). Connectors are organized into domain-specific packages: `nvisy-plugin-sql` for relational databases, `nvisy-plugin-object` for object stores and file formats, and `nvisy-plugin-vector` for vector databases. - -### Runtime - -The **runtime** is responsible for compiling and executing graphs. 
It parses JSON graph definitions, compiles them into execution plans, resolves node dependencies, manages concurrency limits, enforces rate limits on external calls, retries failed nodes with configurable backoff, and emits execution events for observability.
+Connectors implement the source and target interfaces. The object storage connector (S3) handles file ingestion and output. Additional connectors register through the plugin system.
 
 ---
 
-## Deployment Modes
-
-| Mode | Use Case | Entry Point |
-|------|----------|-------------|
-| **Library** | Embedded in application code | `import { Engine } from "@nvisy/runtime"` |
-| **Server** | Production scheduling, monitoring | `@nvisy/server` |
+## Deployment
 
-The server mode exposes a REST API for graph management, a scheduler for cron and event-triggered execution, and a web dashboard for monitoring runs, inspecting lineage, and debugging failures.
+The server (`nvisy-server`) is a short-lived Axum HTTP server. It accepts graph definitions, executes them, and reports status. Designed for containerized deployment — the main server spins it up, feeds work, waits for completion.
 
 ---
 
 ## Project Status
 
-Nvisy Runtime is in the specification and design phase. This document serves as the product specification. Implementation will proceed according to the architecture defined in [ARCHITECTURE.md](./ARCHITECTURE.md).
+Active development. The Rust runtime, detection engine, and server are implemented. AI-powered detection runs via Python extensions.
 
 ---
 
 ## License
 
 Apache License 2.0. See [LICENSE.txt](../LICENSE.txt).
-
----
-
-## Contributing
-
-Contribution guidelines will be published once the core architecture stabilizes. The project is designed for community-contributed connectors and actions from the outset.
diff --git a/packages/nvisy-exif/pyproject.toml b/packages/nvisy-exif/pyproject.toml
new file mode 100644
index 0000000..0cbec69
--- /dev/null
+++ b/packages/nvisy-exif/pyproject.toml
@@ -0,0 +1,14 @@
+[project]
+name = "nvisy-exif"
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "Pillow>=10.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/nvisy_exif"]
diff --git a/packages/nvisy-exif/src/nvisy_exif/__init__.py b/packages/nvisy-exif/src/nvisy_exif/__init__.py
new file mode 100644
index 0000000..dd6cbd8
--- /dev/null
+++ b/packages/nvisy-exif/src/nvisy_exif/__init__.py
@@ -0,0 +1 @@
+from .exif import read_exif, strip_exif
diff --git a/packages/nvisy-exif/src/nvisy_exif/exif.py b/packages/nvisy-exif/src/nvisy_exif/exif.py
new file mode 100644
index 0000000..b2bd16c
--- /dev/null
+++ b/packages/nvisy-exif/src/nvisy_exif/exif.py
@@ -0,0 +1,58 @@
+"""EXIF metadata reading and stripping for images.
+
+Uses Pillow for EXIF handling. Supports JPEG, PNG, TIFF, and WebP formats.
+These functions are designed to be callable from Rust via PyO3.
+"""
+
+from __future__ import annotations
+
+import io
+
+from PIL import Image
+from PIL.ExifTags import TAGS
+
+
+def read_exif(image_bytes: bytes) -> dict:
+    """Read EXIF metadata from image bytes.
+
+    Args:
+        image_bytes: Raw image bytes (JPEG, PNG, TIFF, or WebP).
+
+    Returns:
+        Dictionary mapping human-readable tag names to their values.
+        Binary or complex values are converted to strings.
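+
+    Example:
+        Illustrative usage; the file name is a placeholder and ``"Model"``
+        is a standard EXIF tag name::
+
+            with open("photo.jpg", "rb") as fh:
+                tags = read_exif(fh.read())
+                camera_model = tags.get("Model")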
+ """ + img = Image.open(io.BytesIO(image_bytes)) + exif_data = img.getexif() + + result: dict[str, object] = {} + for tag_id, value in exif_data.items(): + tag_name = TAGS.get(tag_id, str(tag_id)) + # Convert bytes to hex string for JSON compatibility + if isinstance(value, bytes): + value = value.hex() + result[tag_name] = value + + return result + + +def strip_exif(image_bytes: bytes) -> bytes: + """Remove all EXIF metadata from image bytes. + + Args: + image_bytes: Raw image bytes (JPEG, PNG, TIFF, or WebP). + + Returns: + Image bytes with all EXIF metadata removed, preserving the + original format. + """ + img = Image.open(io.BytesIO(image_bytes)) + fmt = img.format or "JPEG" + + # Create a clean copy without EXIF data + clean = Image.new(img.mode, img.size) + clean.putdata(list(img.getdata())) + + buf = io.BytesIO() + clean.save(buf, format=fmt) + return buf.getvalue() diff --git a/pyproject.toml b/pyproject.toml index c64a2f2..772f2b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,4 +4,4 @@ version = "0.1.0" requires-python = ">=3.11" [tool.uv.workspace] -members = ["packages/nvisy-ai"] +members = ["packages/nvisy-ai", "packages/nvisy-exif"] diff --git a/rust-toolchain.toml b/rust-toolchain.toml deleted file mode 100644 index 73cb934..0000000 --- a/rust-toolchain.toml +++ /dev/null @@ -1,3 +0,0 @@ -[toolchain] -channel = "stable" -components = ["rustfmt", "clippy"] diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..12a6950 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,6 @@ +# https://rust-lang.github.io/rustfmt + +group_imports = "StdExternalCrate" +imports_granularity = "Module" +reorder_impl_items = true +merge_derives = false From 021e95c878515e7ac0e119de69a7e5beaec423c1 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Tue, 10 Feb 2026 13:42:34 +0100 Subject: [PATCH 08/17] refactor: dissolve aggregated modules, remove DataValue, blob-centric pipeline, add docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fully-qualify async_trait attributes across all crates. Dissolve schema/ into compiler/graph.rs and policies/retry.rs. Dissolve types/ into datatypes/ submodules. Merge data/ into datatypes/. Remove DataValue enum — the pipeline is now blob-centric with an artifact registry (add_artifact/get_artifacts) for derived data. Add comprehensive documentation across all 6 crates. 
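A minimal sketch of the new artifact flow (names, values, and the module path are
illustrative, assuming the module layout introduced in this patch):

    use bytes::Bytes;
    use nvisy_core::datatypes::blob::Blob;

    fn example() -> Result<(), serde_json::Error> {
        // A source connector produces a blob; a downstream action attaches
        // derived data under a namespaced key instead of wrapping it in DataValue.
        let mut blob = Blob::new("reports/q3.pdf", Bytes::from_static(b"%PDF-1.7"));
        blob.add_artifact("notes", &String::from("one SSN match"))?;

        // Later actions read the artifacts back in typed form.
        let notes: Vec<String> = blob.get_artifacts("notes")?;
        assert_eq!(notes.len(), 1);
        Ok(())
    }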
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 176 +++--------------- Cargo.toml | 3 +- crates/nvisy-core/Cargo.toml | 3 +- crates/nvisy-core/src/data/mod.rs | 86 --------- crates/nvisy-core/src/datatypes/audit.rs | 42 ++++- crates/nvisy-core/src/datatypes/blob.rs | 54 +++++- crates/nvisy-core/src/datatypes/document.rs | 19 +- crates/nvisy-core/src/datatypes/entity.rs | 65 ++++++- crates/nvisy-core/src/datatypes/image.rs | 18 +- crates/nvisy-core/src/datatypes/mod.rs | 60 ++++++ crates/nvisy-core/src/datatypes/policy.rs | 36 +++- crates/nvisy-core/src/datatypes/redaction.rs | 43 ++++- .../src/datatypes/redaction_context.rs | 18 +- crates/nvisy-core/src/documents/elements.rs | 72 ++++++- crates/nvisy-core/src/documents/mod.rs | 6 + crates/nvisy-core/src/documents/ontology.rs | 73 ++++++-- .../src/{errors/mod.rs => error.rs} | 77 ++++---- crates/nvisy-core/src/lib.rs | 9 +- crates/nvisy-core/src/plugin/mod.rs | 52 ------ crates/nvisy-core/src/prelude.rs | 13 ++ crates/nvisy-core/src/registry/mod.rs | 149 --------------- crates/nvisy-core/src/traits/action.rs | 32 ++-- crates/nvisy-core/src/traits/loader.rs | 22 ++- crates/nvisy-core/src/traits/mod.rs | 5 + crates/nvisy-core/src/traits/provider.rs | 28 ++- crates/nvisy-core/src/traits/stream.rs | 47 +++-- crates/nvisy-core/src/types/mod.rs | 55 ------ .../src/actions/apply_redaction.rs | 151 ++++++++------- crates/nvisy-detect/src/actions/classify.rs | 82 ++++---- .../src/actions/detect_checksum.rs | 71 ++++--- .../nvisy-detect/src/actions/detect_regex.rs | 71 ++++--- crates/nvisy-detect/src/actions/emit_audit.rs | 59 +++--- .../src/actions/evaluate_policy.rs | 74 +++++--- crates/nvisy-detect/src/actions/mod.rs | 11 ++ crates/nvisy-detect/src/lib.rs | 37 ++-- crates/nvisy-detect/src/loaders/csv_loader.rs | 15 +- .../nvisy-detect/src/loaders/json_loader.rs | 17 +- crates/nvisy-detect/src/loaders/mod.rs | 9 + crates/nvisy-detect/src/loaders/plaintext.rs | 15 +- crates/nvisy-detect/src/patterns/mod.rs | 25 ++- .../nvisy-detect/src/patterns/validators.rs | 5 + crates/nvisy-detect/src/prelude.rs | 10 + .../src/{schema/mod.rs => compiler/graph.rs} | 79 ++++---- crates/nvisy-engine/src/compiler/mod.rs | 6 + crates/nvisy-engine/src/compiler/parse.rs | 28 ++- crates/nvisy-engine/src/compiler/plan.rs | 62 +++--- crates/nvisy-engine/src/connections/mod.rs | 13 +- crates/nvisy-engine/src/executor/context.rs | 27 ++- crates/nvisy-engine/src/executor/mod.rs | 6 +- crates/nvisy-engine/src/executor/nodes.rs | 63 ------- crates/nvisy-engine/src/executor/runner.rs | 41 ++-- crates/nvisy-engine/src/lib.rs | 10 +- crates/nvisy-engine/src/policies/mod.rs | 36 ++-- crates/nvisy-engine/src/policies/retry.rs | 52 ++++++ crates/nvisy-engine/src/prelude.rs | 5 + crates/nvisy-engine/src/runs/mod.rs | 42 ++++- crates/nvisy-object/src/client/mod.rs | 31 ++- crates/nvisy-object/src/lib.rs | 19 +- crates/nvisy-object/src/prelude.rs | 4 + crates/nvisy-object/src/providers/mod.rs | 2 + crates/nvisy-object/src/providers/s3.rs | 35 +++- crates/nvisy-object/src/streams/mod.rs | 2 + crates/nvisy-object/src/streams/read.rs | 30 +-- crates/nvisy-object/src/streams/write.rs | 52 +++--- crates/nvisy-python/src/actions/mod.rs | 123 ++++++++---- crates/nvisy-python/src/bridge/mod.rs | 13 +- crates/nvisy-python/src/error/mod.rs | 10 +- crates/nvisy-python/src/lib.rs | 21 +-- crates/nvisy-python/src/ner/mod.rs | 39 ++-- crates/nvisy-python/src/prelude.rs | 4 + crates/nvisy-python/src/provider/mod.rs | 26 ++- crates/nvisy-server/Cargo.toml | 5 +- 
crates/nvisy-server/src/app/mod.rs | 19 +- .../src/service/engine_factory.rs | 16 -- crates/nvisy-server/src/service/mod.rs | 9 +- crates/nvisy-server/src/service/state.rs | 11 +- 76 files changed, 1621 insertions(+), 1235 deletions(-) delete mode 100644 crates/nvisy-core/src/data/mod.rs rename crates/nvisy-core/src/{errors/mod.rs => error.rs} (57%) delete mode 100644 crates/nvisy-core/src/plugin/mod.rs create mode 100644 crates/nvisy-core/src/prelude.rs delete mode 100644 crates/nvisy-core/src/registry/mod.rs delete mode 100644 crates/nvisy-core/src/types/mod.rs create mode 100644 crates/nvisy-detect/src/prelude.rs rename crates/nvisy-engine/src/{schema/mod.rs => compiler/graph.rs} (52%) delete mode 100644 crates/nvisy-engine/src/executor/nodes.rs create mode 100644 crates/nvisy-engine/src/policies/retry.rs create mode 100644 crates/nvisy-engine/src/prelude.rs create mode 100644 crates/nvisy-object/src/prelude.rs create mode 100644 crates/nvisy-python/src/prelude.rs delete mode 100644 crates/nvisy-server/src/service/engine_factory.rs diff --git a/Cargo.lock b/Cargo.lock index b226491..129c0b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -50,15 +50,6 @@ dependencies = [ "backtrace", ] -[[package]] -name = "arbitrary" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" -dependencies = [ - "derive_arbitrary", -] - [[package]] name = "async-trait" version = "0.1.89" @@ -847,14 +838,24 @@ dependencies = [ ] [[package]] -name = "derive_arbitrary" -version = "1.4.2" +name = "derive_more" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a9b99b9cbbe49445b21764dc0625032a89b145a2642e67603e1c936f5458d05" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", "syn", + "unicode-xid", ] [[package]] @@ -973,16 +974,6 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" -[[package]] -name = "flate2" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" -dependencies = [ - "miniz_oxide", - "zlib-rs", -] - [[package]] name = "fnv" version = "1.0.7" @@ -1630,16 +1621,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "mime_guess" -version = "2.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" -dependencies = [ - "mime", - "unicase", -] - [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1647,7 +1628,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", - "simd-adler32", ] [[package]] @@ -1702,6 +1682,7 @@ dependencies = [ "async-trait", "bytes", "chrono", + "derive_more", "infer", "schemars", "serde", @@ -1785,10 +1766,7 @@ dependencies = [ "axum", "chrono", 
"nvisy-core", - "nvisy-detect", "nvisy-engine", - "nvisy-object", - "nvisy-python", "schemars", "serde", "serde_json", @@ -1799,7 +1777,7 @@ dependencies = [ "tracing", "tracing-subscriber", "utoipa", - "utoipa-swagger-ui", + "utoipa-scalar", "uuid", ] @@ -2117,40 +2095,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rust-embed" -version = "8.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04113cb9355a377d83f06ef1f0a45b8ab8cd7d8b1288160717d66df5c7988d27" -dependencies = [ - "rust-embed-impl", - "rust-embed-utils", - "walkdir", -] - -[[package]] -name = "rust-embed-impl" -version = "8.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0902e4c7c8e997159ab384e6d0fc91c221375f6894346ae107f47dd0f3ccaa" -dependencies = [ - "proc-macro2", - "quote", - "rust-embed-utils", - "syn", - "walkdir", -] - -[[package]] -name = "rust-embed-utils" -version = "8.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1" -dependencies = [ - "sha2", - "walkdir", -] - [[package]] name = "rustc-demangle" version = "0.1.27" @@ -2247,15 +2191,6 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - [[package]] name = "schannel" version = "0.1.28" @@ -2480,12 +2415,6 @@ dependencies = [ "rand_core 0.6.4", ] -[[package]] -name = "simd-adler32" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" - [[package]] name = "slab" version = "0.4.12" @@ -2846,18 +2775,18 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" -[[package]] -name = "unicase" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" - [[package]] name = "unicode-ident" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + [[package]] name = "unindent" version = "0.2.4" @@ -2920,21 +2849,15 @@ dependencies = [ ] [[package]] -name = "utoipa-swagger-ui" -version = "9.0.2" +name = "utoipa-scalar" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d047458f1b5b65237c2f6dc6db136945667f40a7668627b3490b9513a3d43a55" +checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719" dependencies = [ "axum", - "base64", - "mime_guess", - "regex", - "rust-embed", "serde", "serde_json", - "url", "utoipa", - "zip", ] [[package]] @@ -2967,16 +2890,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" -[[package]] -name = "walkdir" -version = "2.5.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - [[package]] name = "want" version = "0.3.1" @@ -3046,15 +2959,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "winapi-util" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "windows-core" version = "0.62.2" @@ -3391,40 +3295,8 @@ dependencies = [ "syn", ] -[[package]] -name = "zip" -version = "3.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12598812502ed0105f607f941c386f43d441e00148fce9dec3ca5ffb0bde9308" -dependencies = [ - "arbitrary", - "crc32fast", - "flate2", - "indexmap", - "memchr", - "zopfli", -] - -[[package]] -name = "zlib-rs" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" - [[package]] name = "zmij" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" - -[[package]] -name = "zopfli" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" -dependencies = [ - "bumpalo", - "crc32fast", - "log", - "simd-adler32", -] diff --git a/Cargo.toml b/Cargo.toml index c883094..9ae4d49 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,7 +50,7 @@ tower-http = { version = "0.6", features = [] } # OpenAPI / Documentation utoipa = { version = "5", features = ["uuid"] } -utoipa-swagger-ui = { version = "9", features = [] } +utoipa-scalar = { version = "0.3", features = [] } # Observability tracing = { version = "0.1", features = [] } @@ -63,6 +63,7 @@ serde_json = { version = "1.0", features = [] } # Error handling thiserror = { version = "2.0", features = [] } anyhow = { version = "1.0", features = [] } +derive_more = { version = "1", features = ["display"] } # Primitive datatypes uuid = { version = "1", features = ["serde", "v4"] } diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 8c51dbe..617f75b 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "nvisy-core" -description = "Domain types, traits, errors, and plugin registry for the Nvisy platform" +description = "Domain types, traits, and errors for the Nvisy platform" keywords = ["nvisy", "core", "domain", "types"] categories = ["data-structures"] @@ -47,6 +47,7 @@ infer = { workspace = true, features = [] } # Error handling thiserror = { workspace = true, features = [] } anyhow = { workspace = true, features = [] } +derive_more = { workspace = true, features = ["display"] } # Observability tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-core/src/data/mod.rs b/crates/nvisy-core/src/data/mod.rs deleted file mode 100644 index 440cb32..0000000 --- a/crates/nvisy-core/src/data/mod.rs +++ /dev/null @@ -1,86 +0,0 @@ -use serde::{Deserialize, Serialize}; -use uuid::Uuid; -use crate::types::Metadata; -use crate::datatypes::{ - entity::Entity, redaction::Redaction, policy::Policy, audit::Audit, - document::Document, blob::Blob, image::ImageData, -}; - -/// Common fields shared by all domain 
data items. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct DataItem { - pub id: Uuid, - #[serde(skip_serializing_if = "Option::is_none")] - pub parent_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option<Metadata>, -} - -impl DataItem { - pub fn new() -> Self { - Self { - id: Uuid::new_v4(), - parent_id: None, - metadata: None, - } - } - - pub fn with_metadata(mut self, metadata: Metadata) -> Self { - self.metadata = Some(metadata); - self - } - - pub fn derive_from(mut self, parent: &DataItem) -> Self { - self.parent_id = Some(parent.id); - self - } -} - -impl Default for DataItem { - fn default() -> Self { - Self::new() - } -} - -/// Discriminated union of all data types that flow through DAG channels. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(tag = "_type", rename_all = "snake_case")] -pub enum DataValue { - Document(Document), - Blob(Blob), - Entity(Entity), - Redaction(Redaction), - Policy(Policy), - Audit(Audit), - Image(ImageData), -} - -impl DataValue { - /// Get the type name of this data value. - pub fn type_name(&self) -> &'static str { - match self { - DataValue::Document(_) => "document", - DataValue::Blob(_) => "blob", - DataValue::Entity(_) => "entity", - DataValue::Redaction(_) => "redaction", - DataValue::Policy(_) => "policy", - DataValue::Audit(_) => "audit", - DataValue::Image(_) => "image", - } - } - - /// Get the underlying DataItem common fields. - pub fn data_item(&self) -> &DataItem { - match self { - DataValue::Document(d) => &d.data, - DataValue::Blob(b) => &b.data, - DataValue::Entity(e) => &e.data, - DataValue::Redaction(r) => &r.data, - DataValue::Policy(p) => &p.data, - DataValue::Audit(a) => &a.data, - DataValue::Image(i) => &i.data, - } - } -} diff --git a/crates/nvisy-core/src/datatypes/audit.rs b/crates/nvisy-core/src/datatypes/audit.rs index e4ae60f..5ef13c1 100644 --- a/crates/nvisy-core/src/datatypes/audit.rs +++ b/crates/nvisy-core/src/datatypes/audit.rs @@ -1,34 +1,67 @@ +//! Audit trail records for data protection events. + use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::data::DataItem; -use crate::types::{AuditAction, Metadata}; +use super::DataItem; +use crate::datatypes::Metadata; + +/// Kind of auditable action recorded in an [`Audit`] entry. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum AuditAction { + /// A sensitive entity was detected. + Detection, + /// A redaction was applied to an entity. + Redaction, + /// A policy was evaluated against detected entities. + PolicyEval, + /// A blob or document was accessed. + Access, + /// Processed content was exported to an external system. + Export, +} /// An immutable audit record tracking a data protection event. +/// +/// Audit entries are emitted by pipeline actions and form a tamper-evident +/// log of all detection, redaction, and policy decisions. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Audit { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// The kind of event this audit entry records. pub action: AuditAction, + /// UTC timestamp when the event occurred. 
pub timestamp: DateTime<Utc>, + /// Identifier of the related entity, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub entity_id: Option<Uuid>, + /// Identifier of the related redaction, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub redaction_id: Option<Uuid>, + /// Identifier of the policy that was evaluated, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub policy_id: Option<Uuid>, + /// Identifier of the source blob or document. #[serde(skip_serializing_if = "Option::is_none")] pub source_id: Option<Uuid>, + /// Identifier of the pipeline run that produced this entry. #[serde(skip_serializing_if = "Option::is_none")] pub run_id: Option<Uuid>, + /// Human or service account that triggered the event. #[serde(skip_serializing_if = "Option::is_none")] pub actor: Option<String>, + /// Additional unstructured details about the event. #[serde(skip_serializing_if = "Option::is_none")] pub details: Option<Metadata>, } impl Audit { + /// Create a new audit record for the given action, timestamped to now. pub fn new(action: AuditAction) -> Self { Self { data: DataItem::new(), @@ -44,26 +77,31 @@ impl Audit { } } + /// Associate this audit entry with a detected entity. pub fn with_entity_id(mut self, id: Uuid) -> Self { self.entity_id = Some(id); self } + /// Associate this audit entry with a redaction. pub fn with_redaction_id(mut self, id: Uuid) -> Self { self.redaction_id = Some(id); self } + /// Associate this audit entry with a pipeline run. pub fn with_run_id(mut self, id: Uuid) -> Self { self.run_id = Some(id); self } + /// Record the human or service account that triggered the event. pub fn with_actor(mut self, actor: impl Into<String>) -> Self { self.actor = Some(actor.into()); self } + /// Attach additional unstructured details to this audit entry. pub fn with_details(mut self, details: Metadata) -> Self { self.details = Some(details); self diff --git a/crates/nvisy-core/src/datatypes/blob.rs b/crates/nvisy-core/src/datatypes/blob.rs index 7dec1a9..dfdf54d 100644 --- a/crates/nvisy-core/src/datatypes/blob.rs +++ b/crates/nvisy-core/src/datatypes/blob.rs @@ -1,8 +1,17 @@ +//! Binary large object type and helpers. + +use std::collections::HashMap; + use bytes::Bytes; +use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; -use crate::data::DataItem; +use super::DataItem; /// Content type information for a blob. +/// +/// Tracks both the caller-supplied MIME type and the type detected +/// from the file's magic bytes so consumers can choose the most +/// reliable value. #[derive(Debug, Clone, Default, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct BlobContentInfo { @@ -14,20 +23,39 @@ pub struct BlobContentInfo { pub detected_mime: Option<String>, } -/// A binary object from storage (file content + path + content type). +/// A binary large object flowing through the pipeline. +/// +/// Blobs carry raw byte content along with an artifact registry +/// for derived data produced during pipeline processing. Each +/// pipeline action may attach artifacts (entities, documents, +/// redactions, etc.) to the blob as it passes through. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Blob { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Storage path or key identifying this blob's origin. pub path: String, + /// Raw byte content of the blob. 
#[serde(with = "bytes_serde")] #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] pub content: Bytes, + /// Caller-supplied and auto-detected MIME type information. pub provided: BlobContentInfo, + /// Artifacts derived from this blob during pipeline processing. + /// + /// Keys are artifact type names (e.g. `"documents"`, `"entities"`, `"redactions"`). + /// Values are lists of JSON-serialized artifacts. Use [`add_artifact`] and + /// [`get_artifacts`] for type-safe access. + #[serde(default, skip_serializing_if = "HashMap::is_empty")] + pub artifacts: HashMap<String, Vec<serde_json::Value>>, } impl Blob { + /// Create a new blob from a storage path and raw content bytes. + /// + /// The MIME type is auto-detected from magic bytes when possible. pub fn new(path: impl Into<String>, content: impl Into<Bytes>) -> Self { let content = content.into(); let detected_mime = infer::get(&content).map(|t| t.mime_type().to_string()); @@ -39,9 +67,11 @@ impl Blob { mime: None, detected_mime, }, + artifacts: HashMap::new(), } } + /// Set the caller-provided MIME type (builder pattern). pub fn with_content_type(mut self, mime: impl Into<String>) -> Self { self.provided.mime = Some(mime.into()); self @@ -59,6 +89,26 @@ impl Blob { pub fn extension(&self) -> Option<&str> { self.path.rsplit('.').next() } + + /// Store a serializable artifact under the given key. + pub fn add_artifact<T: Serialize>(&mut self, key: &str, value: &T) -> Result<(), serde_json::Error> { + let json = serde_json::to_value(value)?; + self.artifacts.entry(key.to_string()).or_default().push(json); + Ok(()) + } + + /// Retrieve all artifacts under the given key, deserializing into `T`. + pub fn get_artifacts<T: DeserializeOwned>(&self, key: &str) -> Result<Vec<T>, serde_json::Error> { + match self.artifacts.get(key) { + Some(values) => values.iter().map(|v| serde_json::from_value(v.clone())).collect(), + None => Ok(Vec::new()), + } + } + + /// Check if any artifacts exist under the given key. + pub fn has_artifacts(&self, key: &str) -> bool { + self.artifacts.get(key).is_some_and(|v| !v.is_empty()) + } } pub(crate) mod bytes_serde { diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-core/src/datatypes/document.rs index 8906456..c91b613 100644 --- a/crates/nvisy-core/src/datatypes/document.rs +++ b/crates/nvisy-core/src/datatypes/document.rs @@ -1,25 +1,38 @@ +//! Parsed document representation. + use serde::{Deserialize, Serialize}; -use crate::data::DataItem; +use super::DataItem; use crate::documents::elements::Element; /// A parsed human-readable text representation of a document. +/// +/// Documents are produced by loaders from raw blobs and contain the +/// extracted text along with optional structural elements, title, and +/// source format metadata. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Document { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Full text content of the document. pub content: String, + /// Document title, if one was extracted. #[serde(skip_serializing_if = "Option::is_none")] pub title: Option<String>, + /// Structural elements (paragraphs, tables, images, etc.) parsed from the document. #[serde(skip_serializing_if = "Option::is_none")] pub elements: Option<Vec<Element>>, + /// Original file format (e.g. `"pdf"`, `"docx"`, `"html"`). 
#[serde(skip_serializing_if = "Option::is_none")] pub source_format: Option<String>, + /// Total number of pages, if the source format is paginated. #[serde(skip_serializing_if = "Option::is_none")] pub page_count: Option<u32>, } impl Document { + /// Create a new document from raw text content. pub fn new(content: impl Into<String>) -> Self { Self { data: DataItem::new(), @@ -31,21 +44,25 @@ impl Document { } } + /// Set the document title (builder pattern). pub fn with_title(mut self, title: impl Into<String>) -> Self { self.title = Some(title.into()); self } + /// Attach parsed structural elements to this document. pub fn with_elements(mut self, elements: Vec<Element>) -> Self { self.elements = Some(elements); self } + /// Record the original file format (e.g. `"pdf"`, `"docx"`). pub fn with_source_format(mut self, format: impl Into<String>) -> Self { self.source_format = Some(format.into()); self } + /// Set the total page count for paginated source formats. pub fn with_page_count(mut self, count: u32) -> Self { self.page_count = Some(count); self diff --git a/crates/nvisy-core/src/datatypes/entity.rs b/crates/nvisy-core/src/datatypes/entity.rs index 15fd76d..c10a3db 100644 --- a/crates/nvisy-core/src/datatypes/entity.rs +++ b/crates/nvisy-core/src/datatypes/entity.rs @@ -1,49 +1,105 @@ +//! Sensitive-data entity types and detection metadata. + use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::data::DataItem; -use crate::types::{DetectionMethod, EntityCategory}; +use super::DataItem; + +/// Category of sensitive data an entity belongs to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum EntityCategory { + /// Personally Identifiable Information (names, SSNs, addresses, etc.). + Pii, + /// Protected Health Information (HIPAA-regulated data). + Phi, + /// Financial data (credit card numbers, bank accounts, etc.). + Financial, + /// Secrets and credentials (API keys, passwords, tokens). + Credentials, + /// User-defined or plugin-specific category. + Custom, +} + +/// Method used to detect a sensitive entity. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum DetectionMethod { + /// Regular expression pattern matching. + Regex, + /// Named-entity recognition via AI model. + AiNer, + /// Lookup in a known-value dictionary. + Dictionary, + /// Checksum or Luhn-algorithm validation. + Checksum, + /// Multiple methods combined to produce a single detection. + Composite, +} -/// Bounding box for image-based entity locations. +/// Axis-aligned bounding box for image-based entity locations. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct BoundingBox { + /// Horizontal offset of the top-left corner (pixels or normalized). pub x: f64, + /// Vertical offset of the top-left corner (pixels or normalized). pub y: f64, + /// Width of the bounding box. pub width: f64, + /// Height of the bounding box. pub height: f64, } -/// Location of an entity within its source document. +/// Location of an entity within its source document or image. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct EntityLocation { + /// Byte or character offset where the entity starts in the text. 
pub start_offset: usize, + /// Byte or character offset where the entity ends in the text. pub end_offset: usize, + /// Identifier of the document element containing this entity. #[serde(skip_serializing_if = "Option::is_none")] pub element_id: Option<String>, + /// 1-based page number where the entity was found. #[serde(skip_serializing_if = "Option::is_none")] pub page_number: Option<u32>, + /// Bounding box for image-based detections. #[serde(skip_serializing_if = "Option::is_none")] pub bounding_box: Option<BoundingBox>, } /// A detected sensitive data occurrence within a document. +/// +/// Entities are produced by detection actions (regex, NER, checksum, etc.) +/// and later consumed by redaction and audit actions. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Entity { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Broad classification of the sensitive data. pub category: EntityCategory, + /// Specific type label (e.g. `"ssn"`, `"email"`, `"credit_card"`). pub entity_type: String, + /// The matched text or value. pub value: String, + /// How this entity was detected. pub detection_method: DetectionMethod, + /// Detection confidence score in the range `[0.0, 1.0]`. pub confidence: f64, + /// Where this entity was found in the source document. pub location: EntityLocation, + /// Identifier of the source blob or document this entity came from. #[serde(skip_serializing_if = "Option::is_none")] pub source_id: Option<Uuid>, } impl Entity { + /// Create a new entity with the given detection details. pub fn new( category: EntityCategory, entity_type: impl Into<String>, @@ -64,6 +120,7 @@ impl Entity { } } + /// Link this entity to the blob or document it was extracted from. pub fn with_source_id(mut self, source_id: Uuid) -> Self { self.source_id = Some(source_id); self diff --git a/crates/nvisy-core/src/datatypes/image.rs b/crates/nvisy-core/src/datatypes/image.rs index 6da4815..02067e3 100644 --- a/crates/nvisy-core/src/datatypes/image.rs +++ b/crates/nvisy-core/src/datatypes/image.rs @@ -1,28 +1,41 @@ +//! Image data extracted from documents or provided directly. + use bytes::Bytes; use serde::{Deserialize, Serialize}; -use crate::data::DataItem; +use super::DataItem; /// An image extracted from a document or provided directly. +/// +/// Carries the raw pixel data, MIME type, optional dimensions, and +/// provenance information linking back to its source. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct ImageData { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Raw image bytes (PNG, JPEG, etc.). #[serde(with = "crate::datatypes::blob::bytes_serde")] #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] pub image_data: Bytes, + /// MIME type of the image (e.g. `"image/png"`). pub mime_type: String, + /// Width of the image in pixels, if known. #[serde(skip_serializing_if = "Option::is_none")] pub width: Option<u32>, + /// Height of the image in pixels, if known. #[serde(skip_serializing_if = "Option::is_none")] pub height: Option<u32>, + /// File path or URL the image was loaded from, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub source_path: Option<String>, + /// 1-based page number the image was extracted from, if applicable. 
#[serde(skip_serializing_if = "Option::is_none")] pub page_number: Option<u32>, } impl ImageData { + /// Create a new image from raw bytes and a MIME type. pub fn new(image_data: impl Into<Bytes>, mime_type: impl Into<String>) -> Self { Self { data: DataItem::new(), @@ -35,17 +48,20 @@ impl ImageData { } } + /// Set the pixel dimensions of the image. pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { self.width = Some(width); self.height = Some(height); self } + /// Record the file path or URL the image originated from. pub fn with_source_path(mut self, path: impl Into<String>) -> Self { self.source_path = Some(path.into()); self } + /// Set the page number this image was extracted from. pub fn with_page_number(mut self, page: u32) -> Self { self.page_number = Some(page); self diff --git a/crates/nvisy-core/src/datatypes/mod.rs b/crates/nvisy-core/src/datatypes/mod.rs index eaac1ef..078c1a8 100644 --- a/crates/nvisy-core/src/datatypes/mod.rs +++ b/crates/nvisy-core/src/datatypes/mod.rs @@ -1,3 +1,12 @@ +//! Domain data types for the nvisy pipeline. +//! +//! This module defines the core data structures that flow through the nvisy +//! processing pipeline: blobs, documents, entities, redactions, audits, +//! policies, and images. + +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + pub mod audit; pub mod blob; pub mod document; @@ -6,3 +15,54 @@ pub mod image; pub mod policy; pub mod redaction; pub mod redaction_context; + +/// General-purpose metadata map. +pub type Metadata = serde_json::Map<String, serde_json::Value>; + +/// Common fields shared by all domain data items. +/// +/// Every first-class object in the pipeline (blobs, documents, entities, etc.) +/// embeds a `DataItem` to carry a unique identifier, an optional parent +/// lineage link, and arbitrary metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct DataItem { + /// Unique identifier for this item, generated as a v4 UUID on creation. + pub id: Uuid, + /// Identifier of the item this was derived from, if any. + #[serde(skip_serializing_if = "Option::is_none")] + pub parent_id: Option<Uuid>, + /// Arbitrary key-value metadata associated with this item. + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Metadata>, +} + +impl DataItem { + /// Create a new `DataItem` with a freshly generated UUID and no parent or metadata. + pub fn new() -> Self { + Self { + id: Uuid::new_v4(), + parent_id: None, + metadata: None, + } + } + + /// Attach metadata to this item (builder pattern). + pub fn with_metadata(mut self, metadata: Metadata) -> Self { + self.metadata = Some(metadata); + self + } + + /// Set `parent_id` to the id of `parent`, establishing lineage. + pub fn derive_from(mut self, parent: &DataItem) -> Self { + self.parent_id = Some(parent.id); + self + } +} + +impl Default for DataItem { + fn default() -> Self { + Self::new() + } +} + diff --git a/crates/nvisy-core/src/datatypes/policy.rs b/crates/nvisy-core/src/datatypes/policy.rs index c36e44c..fb1b474 100644 --- a/crates/nvisy-core/src/datatypes/policy.rs +++ b/crates/nvisy-core/src/datatypes/policy.rs @@ -1,35 +1,61 @@ +//! Redaction policies and rules. + use serde::{Deserialize, Serialize}; -use crate::data::DataItem; -use crate::types::{EntityCategory, RedactionMethod}; +use super::DataItem; +use crate::datatypes::entity::EntityCategory; +use crate::datatypes::redaction::RedactionMethod; -/// A single rule within a redaction policy. 
+/// A single rule within a redaction [`Policy`]. +/// +/// Rules specify which entity categories and types they match, the minimum +/// confidence threshold, and the redaction method to apply. Rules are +/// evaluated in ascending priority order. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct PolicyRule { + /// Unique identifier for this rule within its policy. pub id: String, + /// Human-readable name for display purposes. pub name: String, + /// Entity categories this rule applies to. Empty means all categories. pub categories: Vec<EntityCategory>, + /// Specific entity type names this rule applies to. Empty means all types. pub entity_types: Vec<String>, + /// Minimum detection confidence required for this rule to trigger. pub confidence_threshold: f64, + /// Redaction strategy to apply when this rule matches. pub method: RedactionMethod, + /// Template string for the replacement value (e.g. `"[REDACTED]"`). pub replacement_template: String, + /// Whether this rule is active. Disabled rules are skipped during evaluation. pub enabled: bool, + /// Evaluation priority (lower numbers are evaluated first). pub priority: i32, } -/// A redaction policy containing rules. +/// A named redaction policy containing an ordered set of rules. +/// +/// Policies are evaluated by [`find_matching_rule`](Policy::find_matching_rule) +/// which returns the first matching enabled rule sorted by priority. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Policy { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Human-readable policy name. pub name: String, + /// Ordered list of redaction rules. pub rules: Vec<PolicyRule>, + /// Fallback redaction method when no rule matches. pub default_method: RedactionMethod, + /// Fallback confidence threshold when no rule matches. pub default_confidence_threshold: f64, } impl Policy { + /// Create a new policy with the given name and rules, using default + /// fallback method ([`Mask`](RedactionMethod::Mask)) and threshold (0.5). pub fn new(name: impl Into<String>, rules: Vec<PolicyRule>) -> Self { Self { data: DataItem::new(), @@ -40,11 +66,13 @@ impl Policy { } } + /// Override the fallback redaction method. pub fn with_default_method(mut self, method: RedactionMethod) -> Self { self.default_method = method; self } + /// Override the fallback confidence threshold. pub fn with_default_confidence_threshold(mut self, threshold: f64) -> Self { self.default_confidence_threshold = threshold; self diff --git a/crates/nvisy-core/src/datatypes/redaction.rs b/crates/nvisy-core/src/datatypes/redaction.rs index 6bde50e..faaa649 100644 --- a/crates/nvisy-core/src/datatypes/redaction.rs +++ b/crates/nvisy-core/src/datatypes/redaction.rs @@ -1,25 +1,60 @@ +//! Redaction methods and records. + use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::data::DataItem; -use crate::types::RedactionMethod; +use super::DataItem; + +/// Strategy used to redact or obfuscate a detected entity. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RedactionMethod { + /// Replace characters with a mask character (e.g. `***-**-1234`). + Mask, + /// Substitute with a fixed placeholder string. + Replace, + /// Replace with a one-way hash of the original value. 
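To make the policy/rule relationship concrete, here is a hypothetical single-rule policy built with the types above. Every field value is invented for illustration, and `EntityCategory::Pii` is assumed to survive the move into `datatypes::entity` unchanged.

```rust
use nvisy_core::datatypes::entity::EntityCategory;
use nvisy_core::datatypes::policy::{Policy, PolicyRule};
use nvisy_core::datatypes::redaction::RedactionMethod;

fn ssn_policy_sketch() -> Policy {
    let ssn_rule = PolicyRule {
        id: "rule-ssn".into(),
        name: "Mask social security numbers".into(),
        categories: vec![EntityCategory::Pii],
        entity_types: vec!["ssn".into()],
        confidence_threshold: 0.8,
        method: RedactionMethod::Mask,
        replacement_template: "***-**-XXXX".into(),
        enabled: true,
        priority: 10,
    };

    // The fallback method and threshold apply when no rule matches.
    Policy::new("default-pii", vec![ssn_rule])
        .with_default_method(RedactionMethod::Replace)
        .with_default_confidence_threshold(0.6)
}
```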
+ Hash, + /// Encrypt the value so it can be recovered later with a key. + Encrypt, + /// Remove the value entirely from the output. + Remove, + /// Blur a region in an image. + Blur, + /// Overlay an opaque block over a region in an image. + Block, + /// Replace with a synthetically generated realistic value. + Synthesize, +} -/// A redaction decision for a detected entity. +/// A redaction decision recording how a specific entity was (or will be) redacted. +/// +/// Each `Redaction` is linked to exactly one [`Entity`](super::entity::Entity) +/// via `entity_id`. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Redaction { + /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] pub data: DataItem, + /// Identifier of the entity being redacted. pub entity_id: Uuid, + /// Redaction strategy applied to the entity. pub method: RedactionMethod, + /// The string that replaces the original value in the output. pub replacement_value: String, + /// The original sensitive value, retained for audit purposes. #[serde(skip_serializing_if = "Option::is_none")] pub original_value: Option<String>, + /// Identifier of the policy rule that triggered this redaction. #[serde(skip_serializing_if = "Option::is_none")] pub policy_rule_id: Option<String>, + /// Whether the redaction has been applied to the output content. pub applied: bool, } impl Redaction { + /// Create a new pending redaction for the given entity. pub fn new( entity_id: Uuid, method: RedactionMethod, @@ -36,11 +71,13 @@ impl Redaction { } } + /// Record the original sensitive value for audit trail purposes. pub fn with_original_value(mut self, value: impl Into<String>) -> Self { self.original_value = Some(value.into()); self } + /// Associate this redaction with the policy rule that triggered it. pub fn with_policy_rule_id(mut self, id: impl Into<String>) -> Self { self.policy_rule_id = Some(id.into()); self diff --git a/crates/nvisy-core/src/datatypes/redaction_context.rs b/crates/nvisy-core/src/datatypes/redaction_context.rs index 80906a3..78f70df 100644 --- a/crates/nvisy-core/src/datatypes/redaction_context.rs +++ b/crates/nvisy-core/src/datatypes/redaction_context.rs @@ -1,12 +1,21 @@ +//! Request-scoped redaction context for per-invocation control. + use serde::{Deserialize, Serialize}; -use crate::types::{EntityCategory, RedactionMethod}; +use crate::datatypes::entity::EntityCategory; +use crate::datatypes::redaction::RedactionMethod; /// Per-entity-type override for the redaction method. +/// +/// When included in a [`RedactionContext`], this rule overrides the +/// default redaction method for a specific entity type. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct EntityRedactionRule { + /// The entity type this override applies to (e.g. `"ssn"`, `"email"`). pub entity_type: String, + /// Redaction strategy to use for this entity type. pub method: RedactionMethod, + /// Optional custom replacement string for this entity type. #[serde(skip_serializing_if = "Option::is_none")] pub replacement: Option<String>, } @@ -65,35 +74,42 @@ impl Default for RedactionContext { } impl RedactionContext { + /// Create a new context with default settings (mask method, 0.5 min confidence). pub fn new() -> Self { Self::default() } + /// Restrict processing to the given entity categories. 
pub fn with_categories(mut self, categories: Vec<EntityCategory>) -> Self { self.categories = categories; self } + /// Restrict processing to the given entity type names. pub fn with_entity_types(mut self, entity_types: Vec<String>) -> Self { self.entity_types = entity_types; self } + /// Add a per-entity-type redaction method override. pub fn with_rule(mut self, rule: EntityRedactionRule) -> Self { self.rules.push(rule); self } + /// Set the fallback redaction method when no per-type rule matches. pub fn with_default_method(mut self, method: RedactionMethod) -> Self { self.default_method = method; self } + /// Set the minimum confidence threshold. Entities below this are ignored. pub fn with_min_confidence(mut self, confidence: f64) -> Self { self.min_confidence = confidence; self } + /// Enable or disable image-based detection (faces, license plates, etc.). pub fn with_detect_images(mut self, detect: bool) -> Self { self.detect_images = detect; self diff --git a/crates/nvisy-core/src/documents/elements.rs b/crates/nvisy-core/src/documents/elements.rs index 7218ea5..58f1fdf 100644 --- a/crates/nvisy-core/src/documents/elements.rs +++ b/crates/nvisy-core/src/documents/elements.rs @@ -1,15 +1,20 @@ +//! Structural elements extracted from parsed documents. + use serde::{Deserialize, Serialize}; use uuid::Uuid; use crate::documents::ontology::ElementType; -use crate::types::Metadata; +use crate::datatypes::Metadata; /// An inline hyperlink within element text. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Link { + /// Display text of the hyperlink. pub text: String, + /// Target URL of the hyperlink. pub url: String, + /// Character offset where the link text begins in the parent element. pub start_index: usize, } @@ -17,7 +22,9 @@ pub struct Link { #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct EmphasizedText { + /// The emphasized text content. pub text: String, + /// HTML tag name describing the emphasis (e.g. `"b"`, `"i"`, `"em"`). pub tag: String, } @@ -25,34 +32,48 @@ pub struct EmphasizedText { #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct TableCellData { + /// Zero-based row index. pub row: usize, + /// Zero-based column index. pub column: usize, + /// Text content of the cell. pub text: String, + /// Whether this cell is a header cell. #[serde(skip_serializing_if = "Option::is_none")] pub is_header: Option<bool>, } -/// Extraction / OCR provenance data. +/// Extraction or OCR provenance data for an element. +/// +/// Records how an element was detected and any extraction +/// confidence metadata. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct ElementProvenance { + /// Confidence score of the extraction (0.0 to 1.0). #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option<f64>, + /// Name of the extraction engine or model that produced this element. #[serde(skip_serializing_if = "Option::is_none")] pub detection_origin: Option<String>, + /// Whether this element continues from a previous element split across pages. #[serde(skip_serializing_if = "Option::is_none")] pub is_continuation: Option<bool>, + /// Type of header or footer (e.g. `"primary"`, `"footnote"`), if applicable. 
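Circling back to the request-scoped `RedactionContext` above, a sketch of how a caller might configure a per-request override before kicking off a run. The entity type, replacement string, and thresholds are illustrative, and `EntityCategory::Pii` is again assumed.

```rust
use nvisy_core::datatypes::entity::EntityCategory;
use nvisy_core::datatypes::redaction::RedactionMethod;
use nvisy_core::datatypes::redaction_context::{EntityRedactionRule, RedactionContext};

fn request_context_sketch() -> RedactionContext {
    RedactionContext::new()
        .with_categories(vec![EntityCategory::Pii])
        .with_min_confidence(0.75)
        .with_detect_images(false)
        // Override only the email handling; everything else uses the default method.
        .with_rule(EntityRedactionRule {
            entity_type: "email".into(),
            method: RedactionMethod::Replace,
            replacement: Some("[EMAIL]".into()),
        })
}
```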
#[serde(skip_serializing_if = "Option::is_none")] pub header_footer_type: Option<String>, } -/// Structured key-value pair from a form. +/// Structured key-value pair extracted from a form. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct FormKeyValuePair { + /// Form field label or key. pub key: String, + /// Form field value, if one was extracted. #[serde(skip_serializing_if = "Option::is_none")] pub value: Option<String>, + /// Extraction confidence for this key-value pair. #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option<f64>, } @@ -64,74 +85,108 @@ pub struct FormKeyValuePair { #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Element { + /// Unique identifier for this element. pub id: Uuid, + /// The structural type of this element. #[serde(rename = "type")] pub element_type: ElementType, + /// Plain-text content of the element. pub text: String, + /// Identifier of the parent element (for nested structures). #[serde(skip_serializing_if = "Option::is_none")] pub parent_id: Option<Uuid>, + /// 1-based page number where this element appears. #[serde(skip_serializing_if = "Option::is_none")] pub page_number: Option<u32>, + /// Named page or sheet label (e.g. worksheet name in a spreadsheet). #[serde(skip_serializing_if = "Option::is_none")] pub page_name: Option<String>, + /// Heading level (1-6) for title or header elements. #[serde(skip_serializing_if = "Option::is_none")] pub level: Option<u32>, + /// BCP-47 language tags detected in this element. #[serde(skip_serializing_if = "Option::is_none")] pub languages: Option<Vec<String>>, + /// Arbitrary metadata associated with this element. #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option<Metadata>, + /// Tag identifying the extraction source or pipeline stage. #[serde(skip_serializing_if = "Option::is_none")] pub source_tag: Option<String>, + /// HTML representation of the element's text with inline formatting. #[serde(skip_serializing_if = "Option::is_none")] pub text_as_html: Option<String>, + /// Inline hyperlinks found within this element's text. #[serde(skip_serializing_if = "Option::is_none")] pub links: Option<Vec<Link>>, + /// Inline formatting spans (bold, italic, etc.) within this element. #[serde(skip_serializing_if = "Option::is_none")] pub emphasized_texts: Option<Vec<EmphasizedText>>, + /// Extraction or OCR provenance information. #[serde(skip_serializing_if = "Option::is_none")] pub provenance: Option<ElementProvenance>, - // Image-specific fields (when element_type is Image) + // -- Image-specific fields (when element_type is Image) -- + + /// Base64-encoded image data. #[serde(skip_serializing_if = "Option::is_none")] pub image_base64: Option<String>, + /// MIME type of the embedded image. #[serde(skip_serializing_if = "Option::is_none")] pub image_mime_type: Option<String>, + /// Remote URL of the image. #[serde(skip_serializing_if = "Option::is_none")] pub image_url: Option<String>, + /// Local file path of the image. #[serde(skip_serializing_if = "Option::is_none")] pub image_path: Option<String>, - // Table-specific fields (when element_type is Table) + // -- Table-specific fields (when element_type is Table) -- + + /// Individual table cells with row/column coordinates. 
#[serde(skip_serializing_if = "Option::is_none")] pub cells: Option<Vec<TableCellData>>, - // Form-specific fields (when element_type is Checkbox/FormKeysValues) + // -- Form-specific fields (when element_type is Checkbox/FormKeysValues) -- + + /// Whether a checkbox is checked. #[serde(skip_serializing_if = "Option::is_none")] pub checked: Option<bool>, + /// Value of a form field. #[serde(skip_serializing_if = "Option::is_none")] pub value: Option<String>, + /// Structured key-value pairs extracted from a form. #[serde(skip_serializing_if = "Option::is_none")] pub key_value_pairs: Option<Vec<FormKeyValuePair>>, - // Email-specific fields (when element_type is EmailMessage) + // -- Email-specific fields (when element_type is EmailMessage) -- + + /// Sender addresses. #[serde(skip_serializing_if = "Option::is_none")] pub sent_from: Option<Vec<String>>, + /// Primary recipient addresses. #[serde(skip_serializing_if = "Option::is_none")] pub sent_to: Option<Vec<String>>, + /// CC recipient addresses. #[serde(skip_serializing_if = "Option::is_none")] pub cc_recipient: Option<Vec<String>>, + /// BCC recipient addresses. #[serde(skip_serializing_if = "Option::is_none")] pub bcc_recipient: Option<Vec<String>>, + /// Email subject line. #[serde(skip_serializing_if = "Option::is_none")] pub subject: Option<String>, + /// Email signature block. #[serde(skip_serializing_if = "Option::is_none")] pub signature: Option<String>, + /// RFC 2822 Message-ID of the email. #[serde(skip_serializing_if = "Option::is_none")] pub email_message_id: Option<String>, } impl Element { + /// Create a new element with the given type and text content. pub fn new(element_type: ElementType, text: impl Into<String>) -> Self { Self { id: Uuid::new_v4(), @@ -166,16 +221,19 @@ impl Element { } } + /// Set the 1-based page number for this element. pub fn with_page_number(mut self, page: u32) -> Self { self.page_number = Some(page); self } + /// Set the heading level (1-6) for title or header elements. pub fn with_level(mut self, level: u32) -> Self { self.level = Some(level); self } + /// Set BCP-47 language tags detected in this element. pub fn with_languages(mut self, langs: Vec<String>) -> Self { self.languages = Some(langs); self diff --git a/crates/nvisy-core/src/documents/mod.rs b/crates/nvisy-core/src/documents/mod.rs index 45c93da..2db6a42 100644 --- a/crates/nvisy-core/src/documents/mod.rs +++ b/crates/nvisy-core/src/documents/mod.rs @@ -1,2 +1,8 @@ +//! Document structure and element ontology. +//! +//! This module provides the structural representation of parsed documents, +//! including individual elements (paragraphs, tables, images, etc.) and +//! the ontology that classifies them. + pub mod elements; pub mod ontology; diff --git a/crates/nvisy-core/src/documents/ontology.rs b/crates/nvisy-core/src/documents/ontology.rs index dae5dda..cd2ed78 100644 --- a/crates/nvisy-core/src/documents/ontology.rs +++ b/crates/nvisy-core/src/documents/ontology.rs @@ -1,49 +1,98 @@ +//! Element type ontology and category classification. + use serde::{Deserialize, Serialize}; -/// Element category — broad grouping of element types. +/// Broad grouping of element types. +/// +/// Every [`ElementType`] belongs to exactly one category, providing +/// a coarse filter for pipeline actions that only operate on certain +/// kinds of content. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum ElementCategory { + /// Narrative text, headings, list items, captions, and addresses. Text, + /// Tabular data. Table, + /// Images and other media content. Media, + /// Source code fragments. Code, + /// Mathematical formulae. Math, + /// Form elements such as checkboxes and key-value fields. Form, + /// Layout markers like page breaks and page numbers. Layout, + /// Email message content. Email, } -/// All element types across all categories. +/// Specific structural element type extracted from a document. +/// +/// Each variant maps to a single [`ElementCategory`] via +/// [`ElementType::category`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "kebab-case")] pub enum ElementType { - // Text + // -- Text -- + + /// A document title or section heading. Title, + /// A block of narrative prose. NarrativeText, + /// An item within a bulleted or numbered list. ListItem, + /// A page or section header. Header, + /// A page or section footer. Footer, + /// Caption text associated with a figure. FigureCaption, + /// A physical or mailing address. Address, + /// Text that does not fit any other text category. UncategorizedText, - // Table + + // -- Table -- + + /// A data table with rows and columns. Table, - // Media + + // -- Media -- + + /// An embedded image. Image, - // Code + + // -- Code -- + + /// A source code snippet or block. CodeSnippet, - // Math + + // -- Math -- + + /// A mathematical formula or equation. Formula, - // Form + + // -- Form -- + + /// A checkbox form control. Checkbox, + /// A set of key-value pairs extracted from a form. FormKeysValues, - // Layout + + // -- Layout -- + + /// A page break marker. PageBreak, + /// A page number indicator. PageNumber, - // Email + + // -- Email -- + + /// An email message body and headers. EmailMessage, } @@ -70,7 +119,9 @@ impl ElementType { } } -/// Return the category for a given element type string. +/// Parse an element type string and return its category. +/// +/// Returns `None` if the string does not match any known [`ElementType`]. pub fn category_of(type_str: &str) -> Option<ElementCategory> { let et: ElementType = serde_json::from_value(serde_json::Value::String(type_str.to_string())).ok()?; diff --git a/crates/nvisy-core/src/errors/mod.rs b/crates/nvisy-core/src/error.rs similarity index 57% rename from crates/nvisy-core/src/errors/mod.rs rename to crates/nvisy-core/src/error.rs index 500dc26..14195e5 100644 --- a/crates/nvisy-core/src/errors/mod.rs +++ b/crates/nvisy-core/src/error.rs @@ -1,56 +1,58 @@ -use std::fmt; +//! Unified error types for the nvisy platform. +//! +//! All crates in the nvisy workspace use [`Error`] as their primary error +//! type and [`ErrorKind`] to classify failures. + +use derive_more::Display; /// Classification of error kinds. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// +/// Used to tag every [`Error`] so callers can programmatically decide +/// how to handle a failure (e.g. retry on `Timeout`, surface to user +/// on `Validation`). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Display)] pub enum ErrorKind { + /// Input or configuration failed validation checks. Validation, + /// Could not connect to an external service. Connection, + /// An operation exceeded its time limit. 
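As a quick illustration of the ontology helpers documented above, a sketch of resolving a serialized type name to its category. It assumes `NarrativeText` maps to the `Text` category, as the category docs describe, and that the kebab-case serde names are what appear on the wire.

```rust
use nvisy_core::documents::ontology::{category_of, ElementCategory};

fn category_lookup_sketch() {
    // "narrative-text" is the kebab-case serde name of ElementType::NarrativeText.
    assert_eq!(category_of("narrative-text"), Some(ElementCategory::Text));
    // Unknown type names simply yield None rather than an error.
    assert_eq!(category_of("not-a-real-type"), None);
}
```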
Timeout, + /// The operation was explicitly cancelled. + #[display("Cancelled")] Cancellation, + /// A policy rule was violated. Policy, + /// An internal runtime error occurred. Runtime, + /// An error originating from the embedded Python bridge. Python, + /// An error that does not fit any other category. Other, } -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::Validation => write!(f, "Validation"), - Self::Connection => write!(f, "Connection"), - Self::Timeout => write!(f, "Timeout"), - Self::Cancellation => write!(f, "Cancelled"), - Self::Policy => write!(f, "Policy"), - Self::Runtime => write!(f, "Runtime"), - Self::Python => write!(f, "Python"), - Self::Other => write!(f, "Other"), - } - } -} - -/// Unified error type for the Nvisy platform. -#[derive(Debug)] +/// Unified error type for the nvisy platform. +/// +/// Carries a [`kind`](ErrorKind), a human-readable message, an optional +/// source component name, a retryable flag, and an optional wrapped cause. +#[derive(Debug, thiserror::Error)] +#[error("{kind}: {message}")] pub struct Error { + /// Classification of the error. pub kind: ErrorKind, + /// Human-readable description of what went wrong. pub message: String, + /// Name of the component that produced this error (e.g. `"s3-read"`, `"detect-regex"`). pub source_component: Option<String>, + /// Whether the operation that failed can be safely retried. pub retryable: bool, + /// The underlying cause, if any. + #[source] pub source: Option<Box<dyn std::error::Error + Send + Sync>>, } -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}: {}", self.kind, self.message) - } -} - -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - self.source.as_ref().map(|e| e.as_ref() as &(dyn std::error::Error + 'static)) - } -} - impl Error { + /// Create a new error with the given kind and message. pub fn new(kind: ErrorKind, message: impl Into<String>) -> Self { Self { kind, @@ -61,25 +63,30 @@ impl Error { } } + /// Attach an underlying cause to this error. pub fn with_source(mut self, source: impl std::error::Error + Send + Sync + 'static) -> Self { self.source = Some(Box::new(source)); self } + /// Tag this error with the name of the component that produced it. pub fn with_component(mut self, component: impl Into<String>) -> Self { self.source_component = Some(component.into()); self } + /// Mark whether this error is safe to retry. pub fn with_retryable(mut self, retryable: bool) -> Self { self.retryable = retryable; self } + /// Shorthand for a validation error with a source component. pub fn validation(message: impl Into<String>, source: impl Into<String>) -> Self { Self::new(ErrorKind::Validation, message).with_component(source) } + /// Shorthand for a connection error with a source component and retryable flag. pub fn connection( message: impl Into<String>, source: impl Into<String>, @@ -90,18 +97,22 @@ impl Error { .with_retryable(retryable) } + /// Shorthand for a timeout error (always retryable). pub fn timeout(message: impl Into<String>) -> Self { Self::new(ErrorKind::Timeout, message).with_retryable(true) } + /// Shorthand for a cancellation error. pub fn cancellation(message: impl Into<String>) -> Self { Self::new(ErrorKind::Cancellation, message) } + /// Shorthand for a policy violation error. 
pub fn policy(message: impl Into<String>) -> Self { Self::new(ErrorKind::Policy, message) } + /// Shorthand for a runtime error with a source component and retryable flag. pub fn runtime( message: impl Into<String>, source: impl Into<String>, @@ -112,6 +123,7 @@ impl Error { .with_retryable(retryable) } + /// Shorthand for a Python bridge error. pub fn python(message: impl Into<String>) -> Self { Self::new(ErrorKind::Python, message) } @@ -132,6 +144,3 @@ impl From<anyhow::Error> for Error { /// Convenience type alias for results using the Nvisy error type. pub type Result<T> = std::result::Result<T, Error>; - -// Keep backward compatibility: NvisyError is an alias for Error. -pub type NvisyError = Error; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 8561f7e..7fdb276 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -2,11 +2,10 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub mod data; pub mod datatypes; pub mod documents; -pub mod errors; -pub mod plugin; -pub mod registry; +pub mod error; pub mod traits; -pub mod types; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-core/src/plugin/mod.rs b/crates/nvisy-core/src/plugin/mod.rs deleted file mode 100644 index a66434d..0000000 --- a/crates/nvisy-core/src/plugin/mod.rs +++ /dev/null @@ -1,52 +0,0 @@ -use crate::traits::action::Action; -use crate::traits::loader::Loader; -use crate::traits::provider::ProviderFactory; -use crate::traits::stream::{StreamSource, StreamTarget}; - -/// Describes a plugin that bundles actions, providers, streams, and loaders. -pub struct PluginDescriptor { - pub id: String, - pub actions: Vec<Box<dyn Action>>, - pub providers: Vec<Box<dyn ProviderFactory>>, - pub sources: Vec<Box<dyn StreamSource>>, - pub targets: Vec<Box<dyn StreamTarget>>, - pub loaders: Vec<Box<dyn Loader>>, -} - -impl PluginDescriptor { - pub fn new(id: impl Into<String>) -> Self { - Self { - id: id.into(), - actions: Vec::new(), - providers: Vec::new(), - sources: Vec::new(), - targets: Vec::new(), - loaders: Vec::new(), - } - } - - pub fn with_action(mut self, action: impl Action) -> Self { - self.actions.push(Box::new(action)); - self - } - - pub fn with_provider(mut self, provider: impl ProviderFactory) -> Self { - self.providers.push(Box::new(provider)); - self - } - - pub fn with_source(mut self, source: impl StreamSource) -> Self { - self.sources.push(Box::new(source)); - self - } - - pub fn with_target(mut self, target: impl StreamTarget) -> Self { - self.targets.push(Box::new(target)); - self - } - - pub fn with_loader(mut self, loader: impl Loader) -> Self { - self.loaders.push(Box::new(loader)); - self - } -} diff --git a/crates/nvisy-core/src/prelude.rs b/crates/nvisy-core/src/prelude.rs new file mode 100644 index 0000000..5e157a1 --- /dev/null +++ b/crates/nvisy-core/src/prelude.rs @@ -0,0 +1,13 @@ +//! Convenience re-exports for common nvisy-core types. +//! +//! Import everything from this module to get the most commonly used +//! types without individual `use` statements. 
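The shorthand constructors above keep call sites terse and make the kind/retryable split easy to act on. A sketch of how a caller might branch on them; the component and message strings are invented.

```rust
use nvisy_core::error::{Error, ErrorKind};

fn handling_strategy(err: &Error) -> &'static str {
    match err.kind {
        ErrorKind::Timeout | ErrorKind::Connection if err.retryable => "retry",
        ErrorKind::Validation | ErrorKind::Policy => "reject",
        _ => "fail",
    }
}

fn error_sketch() {
    let err = Error::connection("bucket unreachable", "s3-read", true);
    assert_eq!(handling_strategy(&err), "retry");
    // Display comes from the thiserror attribute: "Connection: bucket unreachable".
    println!("{err}");
}
```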
+pub use crate::datatypes::blob::Blob; +pub use crate::datatypes::DataItem; +pub use crate::error::{Error, ErrorKind, Result}; +pub use crate::traits::action::Action; +pub use crate::traits::loader::Loader; +pub use crate::traits::provider::{ConnectedInstance, ProviderFactory}; +pub use crate::traits::stream::{StreamSource, StreamTarget}; +pub use crate::datatypes::entity::{DetectionMethod, EntityCategory}; +pub use crate::datatypes::redaction::RedactionMethod; diff --git a/crates/nvisy-core/src/registry/mod.rs b/crates/nvisy-core/src/registry/mod.rs deleted file mode 100644 index 52233a9..0000000 --- a/crates/nvisy-core/src/registry/mod.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::collections::HashMap; - -use crate::datatypes::blob::Blob; -use crate::errors::NvisyError; -use crate::plugin::PluginDescriptor; -use crate::traits::action::Action; -use crate::traits::loader::Loader; -use crate::traits::provider::ProviderFactory; -use crate::traits::stream::{StreamSource, StreamTarget}; - -/// Registry of all actions, providers, streams, and loaders. -/// -/// Items are keyed by "plugin_id/item_id" (e.g. "detect/detect-regex"). -pub struct Registry { - actions: HashMap<String, Box<dyn Action>>, - providers: HashMap<String, Box<dyn ProviderFactory>>, - sources: HashMap<String, Box<dyn StreamSource>>, - targets: HashMap<String, Box<dyn StreamTarget>>, - loaders: Vec<Box<dyn Loader>>, -} - -impl Registry { - pub fn new() -> Self { - Self { - actions: HashMap::new(), - providers: HashMap::new(), - sources: HashMap::new(), - targets: HashMap::new(), - loaders: Vec::new(), - } - } - - /// Load a plugin, registering all its items under "plugin_id/item_id" keys. - pub fn load(&mut self, plugin: PluginDescriptor) -> Result<(), NvisyError> { - let prefix = &plugin.id; - - for action in plugin.actions { - let key = format!("{}/{}", prefix, action.id()); - if self.actions.contains_key(&key) { - return Err(NvisyError::validation( - format!("Duplicate action: {}", key), - "registry", - )); - } - self.actions.insert(key, action); - } - - for provider in plugin.providers { - let key = format!("{}/{}", prefix, provider.id()); - if self.providers.contains_key(&key) { - return Err(NvisyError::validation( - format!("Duplicate provider: {}", key), - "registry", - )); - } - self.providers.insert(key, provider); - } - - for source in plugin.sources { - let key = format!("{}/{}", prefix, source.id()); - if self.sources.contains_key(&key) { - return Err(NvisyError::validation( - format!("Duplicate source: {}", key), - "registry", - )); - } - self.sources.insert(key, source); - } - - for target in plugin.targets { - let key = format!("{}/{}", prefix, target.id()); - if self.targets.contains_key(&key) { - return Err(NvisyError::validation( - format!("Duplicate target: {}", key), - "registry", - )); - } - self.targets.insert(key, target); - } - - for loader in plugin.loaders { - self.loaders.push(loader); - } - - Ok(()) - } - - pub fn get_action(&self, key: &str) -> Option<&dyn Action> { - self.actions.get(key).map(|a| a.as_ref()) - } - - pub fn get_provider(&self, key: &str) -> Option<&dyn ProviderFactory> { - self.providers.get(key).map(|p| p.as_ref()) - } - - pub fn get_source(&self, key: &str) -> Option<&dyn StreamSource> { - self.sources.get(key).map(|s| s.as_ref()) - } - - pub fn get_target(&self, key: &str) -> Option<&dyn StreamTarget> { - self.targets.get(key).map(|t| t.as_ref()) - } - - /// Find a loader that matches a blob's extension or content type. 
- pub fn find_loader_for_blob(&self, blob: &Blob) -> Option<&dyn Loader> { - let ext = blob.extension(); - let ct = blob.content_type(); - - for loader in &self.loaders { - if let Some(ext) = ext { - if loader.extensions().contains(&ext) { - return Some(loader.as_ref()); - } - } - if let Some(ct) = ct { - if loader.content_types().contains(&ct) { - return Some(loader.as_ref()); - } - } - } - None - } - - pub fn action_keys(&self) -> Vec<&str> { - self.actions.keys().map(|s| s.as_str()).collect() - } - - pub fn provider_keys(&self) -> Vec<&str> { - self.providers.keys().map(|s| s.as_str()).collect() - } - - pub fn source_keys(&self) -> Vec<&str> { - self.sources.keys().map(|s| s.as_str()).collect() - } - - pub fn target_keys(&self) -> Vec<&str> { - self.targets.keys().map(|s| s.as_str()).collect() - } - - pub fn loader_ids(&self) -> Vec<&str> { - self.loaders.iter().map(|l| l.id()).collect() - } -} - -impl Default for Registry { - fn default() -> Self { - Self::new() - } -} diff --git a/crates/nvisy-core/src/traits/action.rs b/crates/nvisy-core/src/traits/action.rs index c1fe2a3..824a276 100644 --- a/crates/nvisy-core/src/traits/action.rs +++ b/crates/nvisy-core/src/traits/action.rs @@ -1,23 +1,23 @@ +//! The `Action` trait -- the fundamental processing unit in a pipeline. + use std::any::Any; -use async_trait::async_trait; use tokio::sync::mpsc; -use crate::data::DataValue; -use crate::errors::NvisyError; +use crate::datatypes::blob::Blob; +use crate::error::Error; -/// Type-erased action that consumes from an input channel and produces to an output channel. -#[async_trait] +/// A processing step that consumes blobs from an input channel and +/// produces blobs to an output channel. +/// +/// Actions are the primary unit of work in a pipeline. Each action +/// receives blobs via an async MPSC channel, transforms them (possibly +/// attaching artifacts), and forwards results to the next stage. +#[async_trait::async_trait] pub trait Action: Send + Sync + 'static { /// Unique identifier for this action (e.g. "detect-regex"). fn id(&self) -> &str; - /// Expected input data type name (e.g. "document"). - fn input_type(&self) -> &str; - - /// Output data type name (e.g. "entity"). - fn output_type(&self) -> &str; - /// Whether this action requires a provider client. fn requires_client(&self) -> bool { false @@ -29,15 +29,15 @@ pub trait Action: Send + Sync + 'static { } /// Validate action parameters. - fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; - /// Execute the action, consuming items from input and sending results to output. + /// Execute the action, consuming blobs from input and sending results to output. /// Returns the number of items processed. async fn execute( &self, - input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError>; + ) -> Result<u64, Error>; } diff --git a/crates/nvisy-core/src/traits/loader.rs b/crates/nvisy-core/src/traits/loader.rs index ed85bd2..6cf660b 100644 --- a/crates/nvisy-core/src/traits/loader.rs +++ b/crates/nvisy-core/src/traits/loader.rs @@ -1,26 +1,36 @@ -use async_trait::async_trait; +//! The `Loader` trait for converting raw blobs into structured documents or images. 
use crate::datatypes::blob::Blob; use crate::datatypes::document::Document; use crate::datatypes::image::ImageData; -use crate::errors::NvisyError; +use crate::error::Error; -/// Output of a loader: either a Document or an ImageData. +/// Output of a loader -- either a parsed document or an extracted image. pub enum LoaderOutput { + /// A successfully parsed text document. Document(Document), + /// An extracted or decoded image. Image(ImageData), } -/// A loader transforms Blobs into Documents or Images. -#[async_trait] +/// Converts raw [`Blob`] content into structured [`Document`]s or [`ImageData`]. +/// +/// Loaders declare which file extensions and MIME types they support. +/// The engine selects the appropriate loader based on the blob's +/// content type and extension. +#[async_trait::async_trait] pub trait Loader: Send + Sync + 'static { + /// Unique identifier for this loader (e.g. `"csv"`, `"pdf"`). fn id(&self) -> &str; + /// File extensions this loader handles (e.g. `["csv", "tsv"]`). fn extensions(&self) -> &[&str]; + /// MIME types this loader handles (e.g. `["text/csv"]`). fn content_types(&self) -> &[&str]; + /// Parse the blob and return one or more documents or images. async fn load( &self, blob: &Blob, params: &serde_json::Value, - ) -> Result<Vec<LoaderOutput>, NvisyError>; + ) -> Result<Vec<LoaderOutput>, Error>; } diff --git a/crates/nvisy-core/src/traits/mod.rs b/crates/nvisy-core/src/traits/mod.rs index 125eb5a..816673d 100644 --- a/crates/nvisy-core/src/traits/mod.rs +++ b/crates/nvisy-core/src/traits/mod.rs @@ -1,3 +1,8 @@ +//! Core traits defining the pipeline extension points. +//! +//! Actions, loaders, stream sources/targets, and provider factories +//! are the primary interfaces that plugins implement. + pub mod action; pub mod loader; pub mod provider; diff --git a/crates/nvisy-core/src/traits/provider.rs b/crates/nvisy-core/src/traits/provider.rs index 216568a..5fde2ad 100644 --- a/crates/nvisy-core/src/traits/provider.rs +++ b/crates/nvisy-core/src/traits/provider.rs @@ -1,29 +1,39 @@ +//! Provider factory trait for creating authenticated client connections. + use std::any::Any; use std::future::Future; use std::pin::Pin; -use async_trait::async_trait; - -use crate::errors::NvisyError; +use crate::error::Error; -/// A connected provider instance with an opaque client and optional disconnect callback. +/// A connected provider instance holding an opaque client and an +/// optional async disconnect callback. +/// +/// The `client` is type-erased so that different providers (S3, OpenAI, +/// databases, etc.) can return their own client types without requiring +/// a common interface. pub struct ConnectedInstance { + /// Type-erased client handle, downcast by consumers to the concrete type. pub client: Box<dyn Any + Send>, + /// Optional cleanup function called when the connection is no longer needed. pub disconnect: Option<Box<dyn FnOnce() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send>>, } -/// Factory for creating connected provider instances. -#[async_trait] +/// Factory for creating authenticated connections to an external service. +/// +/// Implementations handle credential validation, connectivity verification, +/// and client construction for a specific provider (e.g. S3, OpenAI). +#[async_trait::async_trait] pub trait ProviderFactory: Send + Sync + 'static { /// Unique identifier (e.g. "s3", "openai"). fn id(&self) -> &str; /// Validate credentials shape without connecting. 
- fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError>; + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error>; /// Verify credentials by attempting a lightweight connection. - async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError>; + async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error>; /// Create a connected instance. - async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError>; + async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, Error>; } diff --git a/crates/nvisy-core/src/traits/stream.rs b/crates/nvisy-core/src/traits/stream.rs index 9f841e9..435d99b 100644 --- a/crates/nvisy-core/src/traits/stream.rs +++ b/crates/nvisy-core/src/traits/stream.rs @@ -1,39 +1,56 @@ +//! Stream source and target traits for external I/O. + use std::any::Any; -use async_trait::async_trait; use tokio::sync::mpsc; -use crate::data::DataValue; -use crate::errors::NvisyError; +use crate::datatypes::blob::Blob; +use crate::error::Error; -/// A source stream that reads data from an external system into the pipeline. -#[async_trait] +/// A source stream that reads blobs from an external system into the pipeline. +/// +/// Implementations connect to a storage backend (e.g. S3, local filesystem) +/// and emit blobs into the pipeline's input channel. +#[async_trait::async_trait] pub trait StreamSource: Send + Sync + 'static { + /// Unique identifier for this stream source (e.g. `"s3-read"`). fn id(&self) -> &str; - fn output_type(&self) -> &str; + /// The provider this stream requires (e.g. `"s3"`). fn required_provider_id(&self) -> &str; - fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + /// Validate source parameters before execution. + fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; + /// Read blobs from the external system and send them to `output`. + /// + /// Returns the number of blobs read. async fn read( &self, - output: mpsc::Sender<DataValue>, + output: mpsc::Sender<Blob>, params: serde_json::Value, client: Box<dyn Any + Send>, - ) -> Result<u64, NvisyError>; + ) -> Result<u64, Error>; } -/// A target stream that writes pipeline data to an external system. -#[async_trait] +/// A target stream that writes blobs from the pipeline to an external system. +/// +/// Implementations receive processed blobs from the pipeline and persist +/// them to a storage backend. +#[async_trait::async_trait] pub trait StreamTarget: Send + Sync + 'static { + /// Unique identifier for this stream target (e.g. `"s3-write"`). fn id(&self) -> &str; - fn input_type(&self) -> &str; + /// The provider this stream requires (e.g. `"s3"`). fn required_provider_id(&self) -> &str; - fn validate_params(&self, params: &serde_json::Value) -> Result<(), NvisyError>; + /// Validate target parameters before execution. + fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; + /// Receive blobs from `input` and write them to the external system. + /// + /// Returns the number of blobs written. 
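Because `ConnectedInstance` hands back a type-erased client, consumers downcast it to the concrete type the provider is known to produce. A sketch under the assumption of a hypothetical `S3Client` handle; the error handling is illustrative.

```rust
use nvisy_core::error::{Error, ErrorKind};
use nvisy_core::traits::provider::ConnectedInstance;

// Stand-in for whatever client type a real provider would return.
struct S3Client;

async fn use_connection(instance: ConnectedInstance) -> Result<(), Error> {
    let ConnectedInstance { client, disconnect } = instance;

    // The client is stored as `Box<dyn Any + Send>`; downcast to the expected type.
    let _client: Box<S3Client> = client
        .downcast::<S3Client>()
        .map_err(|_| Error::new(ErrorKind::Runtime, "unexpected client type"))?;

    // ... perform provider work with the client here ...

    // Run the optional async disconnect callback once the work is finished.
    if let Some(disconnect) = disconnect {
        disconnect().await;
    }
    Ok(())
}
```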
async fn write( &self, - input: mpsc::Receiver<DataValue>, + input: mpsc::Receiver<Blob>, params: serde_json::Value, client: Box<dyn Any + Send>, - ) -> Result<u64, NvisyError>; + ) -> Result<u64, Error>; } diff --git a/crates/nvisy-core/src/types/mod.rs b/crates/nvisy-core/src/types/mod.rs deleted file mode 100644 index 7c2100c..0000000 --- a/crates/nvisy-core/src/types/mod.rs +++ /dev/null @@ -1,55 +0,0 @@ -use serde::{Deserialize, Serialize}; - -/// Category of sensitive data. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum EntityCategory { - Pii, - Phi, - Financial, - Credentials, - Custom, -} - -/// How the entity was detected. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum DetectionMethod { - Regex, - AiNer, - Dictionary, - Checksum, - Composite, -} - -/// Method used to redact sensitive data. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum RedactionMethod { - Mask, - Replace, - Hash, - Encrypt, - Remove, - Blur, - Block, - Synthesize, -} - -/// Type of auditable action. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum AuditAction { - Detection, - Redaction, - PolicyEval, - Access, - Export, -} - -/// General-purpose metadata map. -pub type Metadata = serde_json::Map<String, serde_json::Value>; diff --git a/crates/nvisy-detect/src/actions/apply_redaction.rs b/crates/nvisy-detect/src/actions/apply_redaction.rs index 299eb83..3a70b07 100644 --- a/crates/nvisy-detect/src/actions/apply_redaction.rs +++ b/crates/nvisy-detect/src/actions/apply_redaction.rs @@ -1,112 +1,119 @@ -use async_trait::async_trait; +//! Action that applies pending redactions to document text. + use std::any::Any; use std::collections::HashMap; use tokio::sync::mpsc; use uuid::Uuid; -use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::datatypes::entity::Entity; use nvisy_core::datatypes::redaction::Redaction; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; +/// Applies pending [`Redaction`] artifacts to document content. +/// +/// The action correlates entities with their redactions, locates the +/// corresponding text spans inside each document, and replaces them with +/// the computed replacement values. The resulting redacted documents are +/// re-emitted as `"documents"` artifacts. pub struct ApplyRedactionAction; +/// A single text replacement that has been resolved but not yet applied. struct PendingRedaction { + /// Byte offset where the redaction starts in the original text. start_offset: usize, + /// Byte offset where the redaction ends (exclusive) in the original text. end_offset: usize, + /// The string that will replace the original span. 
replacement_value: String, } -#[async_trait] +#[async_trait::async_trait] impl Action for ApplyRedactionAction { fn id(&self) -> &str { "apply-redaction" } - fn input_type(&self) -> &str { - "document" - } - - fn output_type(&self) -> &str { - "document" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, _params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { - let mut entities: HashMap<Uuid, Entity> = HashMap::new(); - let mut redactions: HashMap<Uuid, Redaction> = HashMap::new(); - let mut documents: Vec<Document> = Vec::new(); - - // Collect all items first - while let Some(item) = input.recv().await { - match item { - DataValue::Entity(e) => { - entities.insert(e.data.id, e); - } - DataValue::Redaction(r) => { - redactions.insert(r.entity_id, r); - } - DataValue::Document(d) => { - documents.push(d); - } - _ => {} - } - } - + ) -> Result<u64, Error> { let mut count = 0u64; - for doc in documents { - let mut pending: Vec<PendingRedaction> = Vec::new(); - - for (entity_id, redaction) in &redactions { - let entity = match entities.get(entity_id) { - Some(e) => e, - None => continue, - }; + while let Some(mut blob) = input.recv().await { + let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) + })?; + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) + })?; + let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read redactions artifact: {e}")) + })?; + + let entity_map: HashMap<Uuid, &Entity> = + entities.iter().map(|e| (e.data.id, e)).collect(); + let redaction_map: HashMap<Uuid, &Redaction> = + redactions.iter().map(|r| (r.entity_id, r)).collect(); + + // Clear existing documents -- we will re-add the (possibly redacted) versions + blob.artifacts.remove("documents"); + + for doc in &documents { + let mut pending: Vec<PendingRedaction> = Vec::new(); + + for (entity_id, redaction) in &redaction_map { + let entity = match entity_map.get(entity_id) { + Some(e) => e, + None => continue, + }; + + // Check entity belongs to this document + let belongs = entity.data.parent_id == Some(doc.data.id) + || entity.source_id == Some(doc.data.id); + if !belongs { + continue; + } + + pending.push(PendingRedaction { + start_offset: entity.location.start_offset, + end_offset: entity.location.end_offset, + replacement_value: redaction.replacement_value.clone(), + }); + } - // Check entity belongs to this document - let belongs = entity.data.parent_id == Some(doc.data.id) - || entity.source_id == Some(doc.data.id); - if !belongs { + if pending.is_empty() { + blob.add_artifact("documents", doc).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add document artifact: {e}")) + })?; + count += 1; continue; } - pending.push(PendingRedaction { - start_offset: entity.location.start_offset, - end_offset: entity.location.end_offset, - replacement_value: redaction.replacement_value.clone(), - }); - } - - if pending.is_empty() { + let redacted_content = apply_redactions(&doc.content, 
&mut pending); + let mut result = Document::new(redacted_content); + result.title = doc.title.clone(); + result.elements = doc.elements.clone(); + result.source_format = doc.source_format.clone(); + result.page_count = doc.page_count; + result.data.parent_id = Some(doc.data.id); + + blob.add_artifact("documents", &result).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add document artifact: {e}")) + })?; count += 1; - if output.send(DataValue::Document(doc)).await.is_err() { - return Ok(count); - } - continue; } - let redacted_content = apply_redactions(&doc.content, &mut pending); - let mut result = Document::new(redacted_content); - result.title = doc.title.clone(); - result.elements = doc.elements.clone(); - result.source_format = doc.source_format.clone(); - result.page_count = doc.page_count; - result.data.parent_id = Some(doc.data.id); - - count += 1; - if output.send(DataValue::Document(result)).await.is_err() { + if output.send(blob).await.is_err() { return Ok(count); } } @@ -115,6 +122,10 @@ impl Action for ApplyRedactionAction { } } +/// Applies a set of pending redactions to `text`, returning the redacted result. +/// +/// Replacements are applied right-to-left (descending start offset) so that +/// earlier byte offsets remain valid after each substitution. fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { // Sort by start offset descending (right-to-left) to preserve positions pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); diff --git a/crates/nvisy-detect/src/actions/classify.rs b/crates/nvisy-detect/src/actions/classify.rs index d72aa85..ed5d5e7 100644 --- a/crates/nvisy-detect/src/actions/classify.rs +++ b/crates/nvisy-detect/src/actions/classify.rs @@ -1,74 +1,48 @@ -use async_trait::async_trait; +//! Sensitivity classification action. + use std::any::Any; -use std::collections::HashMap; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; -use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::entity::Entity; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; +/// Assigns a sensitivity level to each blob based on its detected entities. +/// +/// The action inspects the `"entities"` artifact, computes a sensitivity level +/// (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`), and writes it +/// into the blob metadata as `"sensitivityLevel"`. It also records the +/// `"totalEntities"` count. 
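The right-to-left ordering documented for `apply_redactions` above is what keeps byte offsets valid while the text is being rewritten. A small self-contained sketch of the same idea (the phone spans and replacement token are invented):

```rust
fn right_to_left_sketch() {
    let mut text = String::from("call 555-1234 or 555-9876");

    // Spans sorted by start offset, descending -- rightmost replacement first.
    let pending = [(17usize, 25usize, "[PHONE]"), (5, 13, "[PHONE]")];

    for (start, end, replacement) in pending {
        // Editing the rightmost span leaves every earlier offset untouched.
        text.replace_range(start..end, replacement);
    }

    assert_eq!(text, "call [PHONE] or [PHONE]");
}
```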
pub struct ClassifyAction; -#[async_trait] +#[async_trait::async_trait] impl Action for ClassifyAction { fn id(&self) -> &str { "classify" } - fn input_type(&self) -> &str { - "document" - } - - fn output_type(&self) -> &str { - "document" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, _params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { - let mut entities_by_source: HashMap<uuid::Uuid, Vec<Entity>> = HashMap::new(); - let mut documents: Vec<Document> = Vec::new(); - - while let Some(item) = input.recv().await { - match item { - DataValue::Entity(e) => { - let source_id = e.data.parent_id.unwrap_or(uuid::Uuid::nil()); - entities_by_source.entry(source_id).or_default().push(e); - } - DataValue::Document(d) => { - documents.push(d); - } - _ => {} - } - } - + ) -> Result<u64, Error> { let mut count = 0u64; - for doc in documents { - let entities = entities_by_source - .get(&doc.data.id) - .map(|v| v.as_slice()) - .unwrap_or(&[]); - let sensitivity_level = compute_sensitivity_level(entities); + while let Some(mut blob) = input.recv().await { + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) + })?; - let mut result = Document::new(&doc.content); - result.title = doc.title.clone(); - result.elements = doc.elements.clone(); - result.source_format = doc.source_format.clone(); - result.page_count = doc.page_count; - result.data.parent_id = Some(doc.data.id); + let sensitivity_level = compute_sensitivity_level(&entities); - let mut meta = doc.data.metadata.clone().unwrap_or_default(); + let mut meta = blob.data.metadata.clone().unwrap_or_default(); meta.insert( "sensitivityLevel".to_string(), serde_json::Value::String(sensitivity_level), @@ -77,10 +51,10 @@ impl Action for ClassifyAction { "totalEntities".to_string(), serde_json::Value::Number(entities.len().into()), ); - result.data.metadata = Some(meta); + blob.data.metadata = Some(meta); count += 1; - if output.send(DataValue::Document(result)).await.is_err() { + if output.send(blob).await.is_err() { return Ok(count); } } @@ -89,6 +63,14 @@ impl Action for ClassifyAction { } } +/// Computes a sensitivity level string from a set of detected entities. +/// +/// The heuristic is: +/// - `"none"` -- no entities. +/// - `"critical"` -- at least one high-confidence (>= 0.9) credential, SSN, or credit card. +/// - `"high"` -- any critical type present, or more than 10 entities total. +/// - `"medium"` -- more than 3 entities. +/// - `"low"` -- 1-3 non-critical entities. 
fn compute_sensitivity_level(entities: &[Entity]) -> String { if entities.is_empty() { return "none".to_string(); @@ -96,7 +78,7 @@ fn compute_sensitivity_level(entities: &[Entity]) -> String { let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); let has_critical_types = entities.iter().any(|e| { - matches!(e.category, nvisy_core::types::EntityCategory::Credentials) + matches!(e.category, nvisy_core::datatypes::entity::EntityCategory::Credentials) || e.entity_type == "ssn" || e.entity_type == "credit_card" }); diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs index ae4641a..1c87711 100644 --- a/crates/nvisy-detect/src/actions/detect_checksum.rs +++ b/crates/nvisy-detect/src/actions/detect_checksum.rs @@ -1,42 +1,48 @@ -use async_trait::async_trait; +//! Checksum-based entity validation action. + use std::any::Any; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::entity::Entity; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; -use nvisy_core::types::DetectionMethod; +use nvisy_core::datatypes::entity::DetectionMethod; use crate::patterns::validators::luhn_check; +/// Validates previously detected entities using checksum algorithms. +/// +/// Entities whose type has a registered validator (e.g. Luhn for credit cards) +/// are verified. Valid matches receive a confidence boost and are re-emitted +/// with [`DetectionMethod::Checksum`]. Invalid matches can optionally be +/// dropped from the pipeline. +/// +/// # Parameters (JSON) +/// +/// | Key | Type | Default | Description | +/// |-------------------|--------|---------|------------------------------------------------------| +/// | `dropInvalid` | `bool` | `true` | Whether to discard entities that fail validation. | +/// | `confidenceBoost` | `f64` | `0.05` | Amount added to confidence on successful validation. 
| pub struct DetectChecksumAction; -#[async_trait] +#[async_trait::async_trait] impl Action for DetectChecksumAction { fn id(&self) -> &str { "detect-checksum" } - fn input_type(&self) -> &str { - "entity" - } - - fn output_type(&self) -> &str { - "entity" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let drop_invalid = params .get("dropInvalid") .and_then(|v| v.as_bool()) @@ -48,8 +54,15 @@ impl Action for DetectChecksumAction { let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Entity(entity) = item { + while let Some(mut blob) = input.recv().await { + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) + })?; + + // Clear existing entities -- we will re-add validated ones + blob.artifacts.remove("entities"); + + for entity in entities { let validator = get_validator(&entity.entity_type); if let Some(validate) = validator { @@ -71,19 +84,24 @@ impl Action for DetectChecksumAction { boosted.data.parent_id = entity.data.parent_id; boosted.source_id = entity.source_id; + blob.add_artifact("entities", &boosted).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) + })?; + count += 1; - if output.send(DataValue::Entity(boosted)).await.is_err() { - return Ok(count); - } continue; } } - // No validator or not valid but not dropping — pass through + // No validator or not valid but not dropping -- pass through + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) + })?; count += 1; - if output.send(DataValue::Entity(entity)).await.is_err() { - return Ok(count); - } + } + + if output.send(blob).await.is_err() { + return Ok(count); } } @@ -91,6 +109,7 @@ impl Action for DetectChecksumAction { } } +/// Returns the checksum validator function for a given entity type, if one exists. fn get_validator(entity_type: &str) -> Option<fn(&str) -> bool> { match entity_type { "credit_card" => Some(luhn_check), diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs index 1b91bd9..72a0597 100644 --- a/crates/nvisy-detect/src/actions/detect_regex.rs +++ b/crates/nvisy-detect/src/actions/detect_regex.rs @@ -1,43 +1,48 @@ -use async_trait::async_trait; +//! Regex-based PII/PHI entity detection action. + use regex::Regex; use std::any::Any; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; -use nvisy_core::datatypes::entity::{Entity, EntityLocation}; -use nvisy_core::errors::NvisyError; +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; -use nvisy_core::types::DetectionMethod; use crate::patterns::{self, PatternDefinition}; +/// Scans document text against compiled regex patterns to detect PII/PHI entities. 
+/// +/// For each blob the action reads the `"documents"` artifact (or falls back to +/// the raw blob content), runs every active pattern, optionally validates +/// matches, and appends resulting [`Entity`] artifacts. +/// +/// # Parameters (JSON) +/// +/// | Key | Type | Default | Description | +/// |----------------------|------------|---------|------------------------------------------| +/// | `confidenceThreshold`| `f64` | `0.0` | Minimum pattern confidence to emit. | +/// | `patterns` | `[String]` | all | Subset of built-in pattern names to use. | pub struct DetectRegexAction; -#[async_trait] +#[async_trait::async_trait] impl Action for DetectRegexAction { fn id(&self) -> &str { "detect-regex" } - fn input_type(&self) -> &str { - "document" - } - - fn output_type(&self) -> &str { - "entity" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let confidence_threshold: f64 = params .get("confidenceThreshold") .and_then(|v| v.as_f64()) @@ -58,8 +63,20 @@ impl Action for DetectRegexAction { let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Document(doc) = &item { + while let Some(mut blob) = input.recv().await { + let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) + })?; + + let docs = if documents.is_empty() { + // No documents artifact -- treat blob content as plain text + let text = String::from_utf8_lossy(&blob.content).into_owned(); + vec![Document::new(text)] + } else { + documents + }; + + for doc in &docs { for (pattern, regex) in &compiled { for mat in regex.find_iter(&doc.content) { let value = mat.as_str(); @@ -91,19 +108,27 @@ impl Action for DetectRegexAction { entity.source_id = Some(doc.data.id); entity.data.parent_id = Some(doc.data.id); + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) + })?; + count += 1; - if output.send(DataValue::Entity(entity)).await.is_err() { - return Ok(count); - } } } } + + if output.send(blob).await.is_err() { + return Ok(count); + } } Ok(count) } } +/// Resolves the set of active patterns from an optional list of requested names. +/// +/// When `requested` is `None` or empty, all built-in patterns are returned. fn resolve_patterns(requested: &Option<Vec<String>>) -> Vec<&'static PatternDefinition> { match requested { Some(names) if !names.is_empty() => names diff --git a/crates/nvisy-detect/src/actions/emit_audit.rs b/crates/nvisy-detect/src/actions/emit_audit.rs index 0d2759e..9b64821 100644 --- a/crates/nvisy-detect/src/actions/emit_audit.rs +++ b/crates/nvisy-detect/src/actions/emit_audit.rs @@ -1,40 +1,46 @@ -use async_trait::async_trait; +//! Audit trail emission action. 
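The scanning loop inside `DetectRegexAction` boils down to: compile the active patterns once, run `find_iter` over each document's text, and tag every match with the pattern's base confidence. A simplified, self-contained sketch of that shape (the `RawMatch` struct and the per-pattern threshold check are illustrative; the real action builds `Entity` values with locations and lineage):

```rust
use regex::Regex;

/// Simplified match record; the real action emits `Entity` artifacts instead.
struct RawMatch {
    entity_type: &'static str,
    value: String,
    start: usize,
    end: usize,
    confidence: f64,
}

fn scan(text: &str, patterns: &[(&'static str, f64, Regex)], threshold: f64) -> Vec<RawMatch> {
    let mut found = Vec::new();
    for (entity_type, confidence, regex) in patterns {
        if *confidence < threshold {
            continue; // pattern's base confidence is below the configured floor
        }
        for m in regex.find_iter(text) {
            found.push(RawMatch {
                entity_type: *entity_type,
                value: m.as_str().to_string(),
                start: m.start(),
                end: m.end(),
                confidence: *confidence,
            });
        }
    }
    found
}

fn main() {
    let patterns = vec![("ssn", 0.85, Regex::new(r"\b\d{3}-\d{2}-\d{4}\b").unwrap())];
    let matches = scan("patient SSN 123-45-6789 on file", &patterns, 0.5);
    assert_eq!(matches.len(), 1);
    assert_eq!(matches[0].entity_type, "ssn");
    assert_eq!(matches[0].value, "123-45-6789");
    assert_eq!((matches[0].start, matches[0].end), (12, 23));
}
```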
+ use std::any::Any; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::audit::Audit; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; -use nvisy_core::types::AuditAction; +use nvisy_core::datatypes::audit::AuditAction; +use nvisy_core::datatypes::redaction::Redaction; +/// Emits an [`Audit`] record for every [`Redaction`] found in the blob. +/// +/// Each audit entry captures the redaction method, replacement value, and +/// (when available) the originating policy rule ID. Optional `runId` and +/// `actor` parameters are attached to every emitted audit. +/// +/// # Parameters (JSON) +/// +/// | Key | Type | Default | Description | +/// |---------|----------|---------|-------------------------------------| +/// | `runId` | `UUID` | `None` | Pipeline run identifier to attach. | +/// | `actor` | `String` | `None` | Human or service identity to record.| pub struct EmitAuditAction; -#[async_trait] +#[async_trait::async_trait] impl Action for EmitAuditAction { fn id(&self) -> &str { "emit-audit" } - fn input_type(&self) -> &str { - "redaction" - } - - fn output_type(&self) -> &str { - "audit" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let run_id: Option<uuid::Uuid> = params .get("runId") .and_then(|v| v.as_str()) @@ -46,8 +52,12 @@ impl Action for EmitAuditAction { let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Redaction(redaction) = item { + while let Some(mut blob) = input.recv().await { + let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read redactions artifact: {e}")) + })?; + + for redaction in &redactions { let mut audit = Audit::new(AuditAction::Redaction) .with_entity_id(redaction.entity_id) .with_redaction_id(redaction.data.id); @@ -78,10 +88,15 @@ impl Action for EmitAuditAction { audit.data.parent_id = Some(redaction.data.id); + blob.add_artifact("audits", &audit).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add audit artifact: {e}")) + })?; + count += 1; - if output.send(DataValue::Audit(audit)).await.is_err() { - return Ok(count); - } + } + + if output.send(blob).await.is_err() { + return Ok(count); } } diff --git a/crates/nvisy-detect/src/actions/evaluate_policy.rs b/crates/nvisy-detect/src/actions/evaluate_policy.rs index 9de8f0a..7b2aea7 100644 --- a/crates/nvisy-detect/src/actions/evaluate_policy.rs +++ b/crates/nvisy-detect/src/actions/evaluate_policy.rs @@ -1,42 +1,49 @@ -use async_trait::async_trait; +//! Policy evaluation action that maps detected entities to redaction instructions. 
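A hypothetical `emit-audit` parameter payload and the extraction of its two optional keys, mirroring the `serde_json` accessors used above (the UUID and actor values are placeholders; assumes the `uuid` and `serde_json` crates):

```rust
fn main() {
    // Both keys are optional; these values are made up for illustration.
    let params = serde_json::json!({
        "runId": "00000000-0000-4000-8000-000000000000",
        "actor": "redaction-worker"
    });

    let run_id: Option<uuid::Uuid> = params
        .get("runId")
        .and_then(|v| v.as_str())
        .and_then(|s| s.parse().ok());
    let actor: Option<String> = params
        .get("actor")
        .and_then(|v| v.as_str())
        .map(String::from);

    assert!(run_id.is_some());
    assert_eq!(actor.as_deref(), Some("redaction-worker"));
}
```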
+ use std::any::Any; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::entity::Entity; use nvisy_core::datatypes::policy::PolicyRule; use nvisy_core::datatypes::redaction::Redaction; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::traits::action::Action; -use nvisy_core::types::RedactionMethod; - +use nvisy_core::datatypes::redaction::RedactionMethod; + +/// Evaluates policy rules against detected entities and emits [`Redaction`] artifacts. +/// +/// For each entity the action finds the first matching rule (sorted by priority), +/// applies its redaction method and replacement template, and writes a +/// `"redactions"` artifact to the blob. Entities that fall below the confidence +/// threshold are skipped. +/// +/// # Parameters (JSON) +/// +/// | Key | Type | Default | Description | +/// |------------------------------|-----------------------|----------|----------------------------------------------| +/// | `rules` | `[PolicyRule]` | `[]` | Ordered policy rules to evaluate. | +/// | `defaultMethod` | `RedactionMethod` | `Mask` | Fallback redaction method when no rule matches.| +/// | `defaultConfidenceThreshold` | `f64` | `0.5` | Fallback confidence threshold. | pub struct EvaluatePolicyAction; -#[async_trait] +#[async_trait::async_trait] impl Action for EvaluatePolicyAction { fn id(&self) -> &str { "evaluate-policy" } - fn input_type(&self) -> &str { - "entity" - } - - fn output_type(&self) -> &str { - "redaction" - } - - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, _client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let rules: Vec<PolicyRule> = params .get("rules") .and_then(|v| serde_json::from_value(v.clone()).ok()) @@ -55,9 +62,13 @@ impl Action for EvaluatePolicyAction { let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Entity(entity) = item { - let rule = find_matching_rule(&entity, &sorted_rules); + while let Some(mut blob) = input.recv().await { + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) + })?; + + for entity in &entities { + let rule = find_matching_rule(entity, &sorted_rules); let method = rule.map(|r| r.method).unwrap_or(default_method); let threshold = rule .map(|r| r.confidence_threshold) @@ -68,9 +79,9 @@ impl Action for EvaluatePolicyAction { } let replacement_value = if let Some(r) = rule { - apply_template(&r.replacement_template, &entity) + apply_template(&r.replacement_template, entity) } else { - apply_default_mask(&entity, default_method) + apply_default_mask(entity, default_method) }; let mut redaction = @@ -81,10 +92,15 @@ impl Action for EvaluatePolicyAction { } redaction.data.parent_id = Some(entity.data.id); + blob.add_artifact("redactions", &redaction).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add redaction artifact: {e}")) + })?; + count += 1; - if output.send(DataValue::Redaction(redaction)).await.is_err() { - return Ok(count); - } + } + + if output.send(blob).await.is_err() { + return Ok(count); } } @@ 
-92,6 +108,8 @@ impl Action for EvaluatePolicyAction { } } +/// Returns the first enabled rule whose category/entity-type filters and confidence +/// threshold match the given entity, or `None` if no rule applies. fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&'a PolicyRule> { for rule in rules { if !rule.enabled { @@ -113,6 +131,9 @@ fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&' None } +/// Expands a replacement template using entity metadata. +/// +/// Supported placeholders: `{entityType}`, `{category}`, `{value}`. fn apply_template(template: &str, entity: &Entity) -> String { template .replace("{entityType}", &entity.entity_type) @@ -123,6 +144,7 @@ fn apply_template(template: &str, entity: &Entity) -> String { .replace("{value}", &entity.value) } +/// Generates a replacement string for an entity using the given default redaction method. fn apply_default_mask(entity: &Entity, method: RedactionMethod) -> String { match method { RedactionMethod::Mask => "*".repeat(entity.value.len()), diff --git a/crates/nvisy-detect/src/actions/mod.rs b/crates/nvisy-detect/src/actions/mod.rs index 3dfdc36..a9a3168 100644 --- a/crates/nvisy-detect/src/actions/mod.rs +++ b/crates/nvisy-detect/src/actions/mod.rs @@ -1,6 +1,17 @@ +//! Pipeline actions for the detection and redaction workflow. +//! +//! Each sub-module exposes a single [`Action`](nvisy_core::traits::action::Action) +//! implementation that can be wired into an nvisy execution plan. + +/// Applies pending redactions to document content. pub mod apply_redaction; +/// Computes a sensitivity classification for each blob based on detected entities. pub mod classify; +/// Validates detected entities using checksum algorithms (e.g. Luhn). pub mod detect_checksum; +/// Scans document text with compiled regex patterns to detect PII/PHI entities. pub mod detect_regex; +/// Emits audit trail records for every applied redaction. pub mod emit_audit; +/// Evaluates policy rules against detected entities and produces redaction instructions. pub mod evaluate_policy; diff --git a/crates/nvisy-detect/src/lib.rs b/crates/nvisy-detect/src/lib.rs index ebe1863..87ff6c9 100644 --- a/crates/nvisy-detect/src/lib.rs +++ b/crates/nvisy-detect/src/lib.rs @@ -1,33 +1,20 @@ +//! PII/PHI detection actions and loaders for the nvisy pipeline. +//! +//! This crate provides the detection, classification, policy evaluation, +//! redaction, and audit-trail stages used by the nvisy runtime. It also +//! ships format-specific loaders (CSV, JSON, plaintext) and a built-in +//! set of regex patterns compiled from `assets/patterns.json`. + #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] +/// Pipeline actions for detection, classification, policy, redaction, and audit. pub mod actions; +/// Format-specific blob loaders (CSV, JSON, plaintext). pub mod loaders; +/// Built-in regex pattern definitions and validation helpers. 
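A tiny standalone illustration of the documented placeholder expansion and of the default `Mask` behaviour (the real `apply_template` reads these values from an `Entity` rather than raw strings):

```rust
/// Sketch of the `{entityType}` / `{category}` / `{value}` expansion described above.
fn expand(template: &str, entity_type: &str, category: &str, value: &str) -> String {
    template
        .replace("{entityType}", entity_type)
        .replace("{category}", category)
        .replace("{value}", value)
}

fn main() {
    let redacted = expand("[REDACTED {category}/{entityType}]", "ssn", "pii", "123-45-6789");
    assert_eq!(redacted, "[REDACTED pii/ssn]");

    // The default Mask method instead replaces the value with same-length asterisks.
    assert_eq!("*".repeat("123-45-6789".len()), "***********");
}
```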
pub mod patterns; -use nvisy_core::plugin::PluginDescriptor; - -use crate::actions::apply_redaction::ApplyRedactionAction; -use crate::actions::classify::ClassifyAction; -use crate::actions::detect_checksum::DetectChecksumAction; -use crate::actions::detect_regex::DetectRegexAction; -use crate::actions::emit_audit::EmitAuditAction; -use crate::actions::evaluate_policy::EvaluatePolicyAction; -use crate::loaders::csv_loader::CsvLoader; -use crate::loaders::json_loader::JsonLoader; -use crate::loaders::plaintext::PlaintextLoader; - -/// Create the detect plugin descriptor. -pub fn detect_plugin() -> PluginDescriptor { - PluginDescriptor::new("detect") - .with_action(DetectRegexAction) - .with_action(DetectChecksumAction) - .with_action(EvaluatePolicyAction) - .with_action(ApplyRedactionAction) - .with_action(ClassifyAction) - .with_action(EmitAuditAction) - .with_loader(PlaintextLoader) - .with_loader(CsvLoader) - .with_loader(JsonLoader) -} +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-detect/src/loaders/csv_loader.rs b/crates/nvisy-detect/src/loaders/csv_loader.rs index 74b8606..6148e6c 100644 --- a/crates/nvisy-detect/src/loaders/csv_loader.rs +++ b/crates/nvisy-detect/src/loaders/csv_loader.rs @@ -1,13 +1,18 @@ -use async_trait::async_trait; +//! CSV file loader. use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use nvisy_core::traits::loader::{Loader, LoaderOutput}; +/// Loads CSV blobs into a single [`Document`] containing the raw CSV text. +/// +/// The loader validates that the blob content is valid UTF-8 and tags the +/// resulting document with `source_format = "csv"`. It handles the `text/csv` +/// content type and `.csv` file extension. pub struct CsvLoader; -#[async_trait] +#[async_trait::async_trait] impl Loader for CsvLoader { fn id(&self) -> &str { "csv" @@ -25,9 +30,9 @@ impl Loader for CsvLoader { &self, blob: &Blob, _params: &serde_json::Value, - ) -> Result<Vec<LoaderOutput>, NvisyError> { + ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - NvisyError::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") + Error::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") })?; let mut doc = Document::new(content); doc.source_format = Some("csv".to_string()); diff --git a/crates/nvisy-detect/src/loaders/json_loader.rs b/crates/nvisy-detect/src/loaders/json_loader.rs index 6f542d0..b93ce96 100644 --- a/crates/nvisy-detect/src/loaders/json_loader.rs +++ b/crates/nvisy-detect/src/loaders/json_loader.rs @@ -1,13 +1,18 @@ -use async_trait::async_trait; +//! JSON file loader. use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use nvisy_core::traits::loader::{Loader, LoaderOutput}; +/// Loads JSON blobs into a single [`Document`] containing the raw JSON text. +/// +/// The loader validates that the blob content is valid UTF-8 **and** valid JSON +/// before producing the document. It handles the `application/json` content type +/// and `.json` file extension. 
pub struct JsonLoader; -#[async_trait] +#[async_trait::async_trait] impl Loader for JsonLoader { fn id(&self) -> &str { "json" @@ -25,13 +30,13 @@ impl Loader for JsonLoader { &self, blob: &Blob, _params: &serde_json::Value, - ) -> Result<Vec<LoaderOutput>, NvisyError> { + ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - NvisyError::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") + Error::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") })?; // Validate it's valid JSON let _: serde_json::Value = serde_json::from_str(&content).map_err(|e| { - NvisyError::validation(format!("Invalid JSON: {}", e), "json-loader") + Error::validation(format!("Invalid JSON: {}", e), "json-loader") })?; let mut doc = Document::new(content); doc.source_format = Some("json".to_string()); diff --git a/crates/nvisy-detect/src/loaders/mod.rs b/crates/nvisy-detect/src/loaders/mod.rs index b961145..aaa34a5 100644 --- a/crates/nvisy-detect/src/loaders/mod.rs +++ b/crates/nvisy-detect/src/loaders/mod.rs @@ -1,3 +1,12 @@ +//! Format-specific blob loaders. +//! +//! Each loader converts raw [`Blob`](nvisy_core::datatypes::blob::Blob) bytes +//! into one or more [`Document`](nvisy_core::datatypes::document::Document)s +//! that downstream actions can process. + +/// Loader for CSV files. pub mod csv_loader; +/// Loader for JSON files. pub mod json_loader; +/// Loader for plain-text files. pub mod plaintext; diff --git a/crates/nvisy-detect/src/loaders/plaintext.rs b/crates/nvisy-detect/src/loaders/plaintext.rs index 2056fb4..b4242de 100644 --- a/crates/nvisy-detect/src/loaders/plaintext.rs +++ b/crates/nvisy-detect/src/loaders/plaintext.rs @@ -1,13 +1,18 @@ -use async_trait::async_trait; +//! Plain-text file loader. use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use nvisy_core::traits::loader::{Loader, LoaderOutput}; +/// Loads plain-text blobs into a single [`Document`]. +/// +/// The loader validates that the blob content is valid UTF-8 and tags the +/// resulting document with `source_format = "txt"`. It handles the `text/plain` +/// content type and `.txt` / `.text` file extensions. pub struct PlaintextLoader; -#[async_trait] +#[async_trait::async_trait] impl Loader for PlaintextLoader { fn id(&self) -> &str { "plaintext" @@ -25,9 +30,9 @@ impl Loader for PlaintextLoader { &self, blob: &Blob, _params: &serde_json::Value, - ) -> Result<Vec<LoaderOutput>, NvisyError> { + ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - NvisyError::validation( + Error::validation( format!("Invalid UTF-8 in plaintext: {}", e), "plaintext-loader", ) diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-detect/src/patterns/mod.rs index d3bd8e0..226a71c 100644 --- a/crates/nvisy-detect/src/patterns/mod.rs +++ b/crates/nvisy-detect/src/patterns/mod.rs @@ -1,32 +1,51 @@ +//! Built-in regex pattern definitions and validation helpers. +//! +//! Patterns are loaded at startup from the embedded `assets/patterns.json` +//! file and compiled into a static registry keyed by pattern name. + +/// Checksum and format validators used by pattern definitions. pub mod validators; use std::collections::HashMap; use std::sync::LazyLock; -use nvisy_core::types::EntityCategory; +use nvisy_core::datatypes::entity::EntityCategory; /// JSON representation of a pattern loaded from disk. 
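All three loaders share the same strict UTF-8 gate: bytes that do not decode cleanly are rejected with a validation error rather than lossily converted (contrast with `detect-regex`, which falls back to `from_utf8_lossy` on raw blob content). A minimal sketch of that gate:

```rust
/// Mirrors the `String::from_utf8(...).map_err(...)` pattern used by the loaders.
fn decode_strict(bytes: &[u8]) -> Result<String, String> {
    String::from_utf8(bytes.to_vec()).map_err(|e| format!("Invalid UTF-8: {e}"))
}

fn main() {
    assert!(decode_strict(b"name,email\nada,ada@example.org").is_ok());
    assert!(decode_strict(&[0xff, 0xfe, 0x00]).is_err()); // 0xFF can never appear in UTF-8
}
```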
#[derive(Debug, Clone, serde::Deserialize)] struct PatternJson { + /// Human-readable pattern name (used as the registry key). name: String, + /// Category string (e.g. `"pii"`, `"phi"`, `"financial"`). category: String, + /// The entity type tag emitted when this pattern matches. entity_type: String, + /// The regex pattern string. pattern: String, + /// Base confidence score assigned to matches. confidence: f64, + /// Optional validator name resolved at load time (e.g. `"ssn"`, `"luhn"`). #[serde(default)] validator: Option<String>, } -/// Definition of a regex-based detection pattern. +/// A compiled regex-based detection pattern with optional post-match validation. pub struct PatternDefinition { + /// Unique name identifying this pattern in the registry. pub name: String, + /// The entity category (PII, PHI, Financial, etc.). pub category: EntityCategory, + /// The entity type tag emitted on match (e.g. `"ssn"`, `"credit_card"`). pub entity_type: String, + /// The raw regex pattern string. pub pattern_str: String, + /// Base confidence score assigned to matches of this pattern. pub confidence: f64, + /// Optional validation function applied after a regex match succeeds. pub validate: Option<fn(&str) -> bool>, } +/// Maps a category string from `patterns.json` to its [`EntityCategory`] variant. fn parse_category(s: &str) -> EntityCategory { match s { "pii" => EntityCategory::Pii, @@ -37,6 +56,7 @@ fn parse_category(s: &str) -> EntityCategory { } } +/// Resolves a validator name string to its corresponding validation function. fn resolve_validator(name: &str) -> Option<fn(&str) -> bool> { match name { "ssn" => Some(validators::validate_ssn), @@ -45,6 +65,7 @@ fn resolve_validator(name: &str) -> Option<fn(&str) -> bool> { } } +/// Deserializes and compiles all patterns from the embedded `patterns.json` asset. fn load_patterns() -> Vec<PatternDefinition> { let json_bytes = include_bytes!("../../assets/patterns.json"); let raw: Vec<PatternJson> = diff --git a/crates/nvisy-detect/src/patterns/validators.rs b/crates/nvisy-detect/src/patterns/validators.rs index 8903f64..842c3ca 100644 --- a/crates/nvisy-detect/src/patterns/validators.rs +++ b/crates/nvisy-detect/src/patterns/validators.rs @@ -1,3 +1,8 @@ +//! Checksum and format validators for detected entity values. +//! +//! These functions are referenced by pattern definitions in `patterns.json` +//! and are also used directly by [`DetectChecksumAction`](crate::actions::detect_checksum::DetectChecksumAction). + /// Validate a US Social Security Number. pub fn validate_ssn(value: &str) -> bool { let parts: Vec<&str> = value.split('-').collect(); diff --git a/crates/nvisy-detect/src/prelude.rs b/crates/nvisy-detect/src/prelude.rs new file mode 100644 index 0000000..87f870a --- /dev/null +++ b/crates/nvisy-detect/src/prelude.rs @@ -0,0 +1,10 @@ +//! Convenience re-exports. 
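Based on the `PatternJson` fields above, a single entry in `assets/patterns.json` is expected to look roughly like the following (the struct is re-declared locally so the sketch is self-contained; the concrete name, regex, and confidence are illustrative, not copied from the real asset):

```rust
#[derive(Debug, serde::Deserialize)]
struct PatternJson {
    name: String,
    category: String,
    entity_type: String,
    pattern: String,
    confidence: f64,
    #[serde(default)]
    validator: Option<String>,
}

fn main() {
    let raw = r#"{
        "name": "us_ssn",
        "category": "pii",
        "entity_type": "ssn",
        "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b",
        "confidence": 0.85,
        "validator": "ssn"
    }"#;
    let parsed: PatternJson = serde_json::from_str(raw).expect("valid pattern entry");
    assert_eq!(parsed.entity_type, "ssn");
    assert_eq!(parsed.validator.as_deref(), Some("ssn"));
    assert!(parsed.confidence > 0.0 && !parsed.name.is_empty() && !parsed.pattern.is_empty());
    assert_eq!(parsed.category, "pii");
}
```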
+pub use crate::actions::apply_redaction::ApplyRedactionAction; +pub use crate::actions::classify::ClassifyAction; +pub use crate::actions::detect_checksum::DetectChecksumAction; +pub use crate::actions::detect_regex::DetectRegexAction; +pub use crate::actions::emit_audit::EmitAuditAction; +pub use crate::actions::evaluate_policy::EvaluatePolicyAction; +pub use crate::loaders::csv_loader::CsvLoader; +pub use crate::loaders::json_loader::JsonLoader; +pub use crate::loaders::plaintext::PlaintextLoader; diff --git a/crates/nvisy-engine/src/schema/mod.rs b/crates/nvisy-engine/src/compiler/graph.rs similarity index 52% rename from crates/nvisy-engine/src/schema/mod.rs rename to crates/nvisy-engine/src/compiler/graph.rs index 5b14cc7..7cc9c6c 100644 --- a/crates/nvisy-engine/src/schema/mod.rs +++ b/crates/nvisy-engine/src/compiler/graph.rs @@ -1,80 +1,76 @@ -use serde::{Deserialize, Serialize}; - -/// Retry policy for a node. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct RetryPolicy { - #[serde(default = "default_max_retries")] - pub max_retries: u32, - #[serde(default = "default_delay_ms")] - pub delay_ms: u64, - #[serde(default)] - pub backoff: BackoffStrategy, -} - -fn default_max_retries() -> u32 { 3 } -fn default_delay_ms() -> u64 { 1000 } +//! Graph data model for pipeline definitions. +//! +//! A pipeline is represented as a set of [`GraphNode`]s connected by +//! [`GraphEdge`]s, collected into a [`Graph`]. -impl Default for RetryPolicy { - fn default() -> Self { - Self { - max_retries: 3, - delay_ms: 1000, - backoff: BackoffStrategy::default(), - } - } -} +use serde::{Deserialize, Serialize}; -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum BackoffStrategy { - #[default] - Fixed, - Exponential, - Jitter, -} +use crate::policies::retry::RetryPolicy; -/// A node in the graph definition. +/// A node in the pipeline graph, tagged by its role. +/// +/// Nodes are serialized with a `"type"` discriminator so JSON definitions +/// can specify `"source"`, `"action"`, or `"target"`. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(tag = "type", rename_all = "snake_case")] pub enum GraphNode { + /// A data source that reads from an external provider via a named stream. Source { + /// Unique identifier for this node within the graph. id: String, + /// Provider name used to resolve the connection (e.g. `"s3"`). provider: String, + /// Stream name on the provider (e.g. `"read"`). stream: String, + /// Arbitrary provider-specific parameters. #[serde(default)] params: serde_json::Value, + /// Optional retry policy applied to this node's execution. #[serde(skip_serializing_if = "Option::is_none")] retry: Option<RetryPolicy>, + /// Optional per-node timeout in milliseconds. #[serde(skip_serializing_if = "Option::is_none")] timeout_ms: Option<u64>, }, + /// A transformation or detection step applied to data flowing through the pipeline. Action { + /// Unique identifier for this node within the graph. id: String, + /// Registered action name (e.g. `"detect_regex"`, `"classify"`). action: String, + /// Arbitrary action-specific parameters. #[serde(default)] params: serde_json::Value, + /// Optional retry policy applied to this node's execution. 
#[serde(skip_serializing_if = "Option::is_none")] retry: Option<RetryPolicy>, + /// Optional per-node timeout in milliseconds. #[serde(skip_serializing_if = "Option::is_none")] timeout_ms: Option<u64>, }, + /// A data sink that writes to an external provider via a named stream. Target { + /// Unique identifier for this node within the graph. id: String, + /// Provider name used to resolve the connection (e.g. `"s3"`). provider: String, + /// Stream name on the provider (e.g. `"write"`). stream: String, + /// Arbitrary provider-specific parameters. #[serde(default)] params: serde_json::Value, + /// Optional retry policy applied to this node's execution. #[serde(skip_serializing_if = "Option::is_none")] retry: Option<RetryPolicy>, + /// Optional per-node timeout in milliseconds. #[serde(skip_serializing_if = "Option::is_none")] timeout_ms: Option<u64>, }, } impl GraphNode { + /// Returns the unique identifier shared by all node variants. pub fn id(&self) -> &str { match self { GraphNode::Source { id, .. } => id, @@ -83,6 +79,7 @@ impl GraphNode { } } + /// Returns the parameters value for this node. pub fn params(&self) -> &serde_json::Value { match self { GraphNode::Source { params, .. } => params, @@ -91,6 +88,7 @@ impl GraphNode { } } + /// Returns the retry policy, if one is configured. pub fn retry(&self) -> Option<&RetryPolicy> { match self { GraphNode::Source { retry, .. } => retry.as_ref(), @@ -99,6 +97,7 @@ impl GraphNode { } } + /// Returns the per-node timeout in milliseconds, if one is configured. pub fn timeout_ms(&self) -> Option<u64> { match self { GraphNode::Source { timeout_ms, .. } => *timeout_ms, @@ -108,18 +107,24 @@ impl GraphNode { } } -/// An edge connecting two nodes. +/// A directed edge connecting two nodes by their IDs. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct GraphEdge { + /// ID of the upstream (source) node. pub from: String, + /// ID of the downstream (destination) node. pub to: String, } -/// A complete graph definition. +/// A complete pipeline graph definition containing nodes and edges. +/// +/// The graph must be a valid DAG (directed acyclic graph) with unique node IDs. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Graph { + /// All nodes in the pipeline. pub nodes: Vec<GraphNode>, + /// Directed edges describing data flow between nodes. pub edges: Vec<GraphEdge>, } diff --git a/crates/nvisy-engine/src/compiler/mod.rs b/crates/nvisy-engine/src/compiler/mod.rs index 1f31250..9b21fc8 100644 --- a/crates/nvisy-engine/src/compiler/mod.rs +++ b/crates/nvisy-engine/src/compiler/mod.rs @@ -1,3 +1,9 @@ +//! Pipeline compilation: parsing, graph construction, and execution planning. +//! +//! The compiler takes a JSON pipeline definition, validates it, builds a +//! directed graph, and produces a topologically-sorted execution plan. + +pub mod graph; pub mod parse; pub mod plan; diff --git a/crates/nvisy-engine/src/compiler/parse.rs b/crates/nvisy-engine/src/compiler/parse.rs index e22b906..68b53c0 100644 --- a/crates/nvisy-engine/src/compiler/parse.rs +++ b/crates/nvisy-engine/src/compiler/parse.rs @@ -1,22 +1,32 @@ -use crate::schema::Graph; -use nvisy_core::errors::NvisyError; +//! JSON parsing and validation for pipeline graph definitions. +//! +//! Deserializes a [`serde_json::Value`] into a [`Graph`] and validates +//! structural invariants (non-empty, unique IDs, valid edge references). 
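Putting the `Graph`/`GraphNode`/`GraphEdge` serde attributes together, a minimal pipeline definition in the JSON shape `parse_graph` expects might look like this (provider, stream, and action names are the illustrative ones from the field docs, not a guaranteed-valid configuration):

```rust
fn main() {
    let definition = serde_json::json!({
        "nodes": [
            { "type": "source", "id": "in",  "provider": "s3", "stream": "read",
              "params": { "prefix": "incoming/" } },
            { "type": "action", "id": "scan", "action": "detect_regex",
              "params": { "confidenceThreshold": 0.5 } },
            { "type": "target", "id": "out", "provider": "s3", "stream": "write",
              "params": {} }
        ],
        "edges": [
            { "from": "in",   "to": "scan" },
            { "from": "scan", "to": "out" }
        ]
    });
    // parse_graph(&definition) then checks: non-empty nodes, unique IDs, known edge endpoints.
    println!("{definition}");
}
```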
-/// Parse a graph from a JSON value. -pub fn parse_graph(value: &serde_json::Value) -> Result<Graph, NvisyError> { +use crate::compiler::graph::Graph; +use nvisy_core::error::Error; + +/// Parses and validates a [`Graph`] from a JSON value. +/// +/// Performs the following validations: +/// - The graph must contain at least one node. +/// - All node IDs must be unique. +/// - All edge endpoints must reference existing node IDs. +pub fn parse_graph(value: &serde_json::Value) -> Result<Graph, Error> { let graph: Graph = serde_json::from_value(value.clone()).map_err(|e| { - NvisyError::validation(format!("Invalid graph definition: {}", e), "compiler") + Error::validation(format!("Invalid graph definition: {}", e), "compiler") })?; // Validate: must have at least one node if graph.nodes.is_empty() { - return Err(NvisyError::validation("Graph must have at least one node", "compiler")); + return Err(Error::validation("Graph must have at least one node", "compiler")); } // Validate: no duplicate node IDs let mut seen = std::collections::HashSet::new(); for node in &graph.nodes { if !seen.insert(node.id()) { - return Err(NvisyError::validation( + return Err(Error::validation( format!("Duplicate node ID: {}", node.id()), "compiler", )); @@ -27,13 +37,13 @@ pub fn parse_graph(value: &serde_json::Value) -> Result<Graph, NvisyError> { let node_ids: std::collections::HashSet<&str> = graph.nodes.iter().map(|n| n.id()).collect(); for edge in &graph.edges { if !node_ids.contains(edge.from.as_str()) { - return Err(NvisyError::validation( + return Err(Error::validation( format!("Edge references unknown source node: {}", edge.from), "compiler", )); } if !node_ids.contains(edge.to.as_str()) { - return Err(NvisyError::validation( + return Err(Error::validation( format!("Edge references unknown target node: {}", edge.to), "compiler", )); diff --git a/crates/nvisy-engine/src/compiler/plan.rs b/crates/nvisy-engine/src/compiler/plan.rs index 3120d99..44be2d0 100644 --- a/crates/nvisy-engine/src/compiler/plan.rs +++ b/crates/nvisy-engine/src/compiler/plan.rs @@ -1,27 +1,45 @@ +//! Execution planning via topological sort. +//! +//! Converts a validated [`Graph`] into an [`ExecutionPlan`] by performing +//! cycle detection and topological sorting using `petgraph`. + use std::collections::HashMap; use petgraph::algo::{is_cyclic_directed, toposort}; use petgraph::graph::{DiGraph, NodeIndex}; -use crate::schema::{Graph, GraphNode}; -use nvisy_core::errors::NvisyError; -use nvisy_core::registry::Registry; +use crate::compiler::graph::{Graph, GraphNode}; +use nvisy_core::error::Error; -/// A node resolved against the registry. +/// A graph node enriched with topological ordering and adjacency information. #[derive(Debug, Clone)] pub struct ResolvedNode { + /// The original graph node definition. pub node: GraphNode, + /// Zero-based position in the topological ordering. pub topo_order: usize, + /// IDs of nodes that feed data into this node. pub upstream_ids: Vec<String>, + /// IDs of nodes that receive data from this node. pub downstream_ids: Vec<String>, } /// A compiled execution plan ready for the executor. +/// +/// Contains all nodes in topological order along with their adjacency +/// information so the executor can wire channels and schedule tasks. pub struct ExecutionPlan { + /// Resolved nodes sorted in topological order. pub nodes: Vec<ResolvedNode>, + /// Node IDs in topological order. pub topo_order: Vec<String>, } -/// Build an execution plan from a parsed graph and registry. 
-pub fn build_plan(graph: &Graph, registry: &Registry) -> Result<ExecutionPlan, NvisyError> { +/// Builds an execution plan from a parsed [`Graph`]. +/// +/// Validates that the graph is acyclic, performs a topological sort, and +/// computes upstream/downstream adjacency lists for each node. +/// +/// Returns an error if the graph contains a cycle or references unknown nodes. +pub fn build_plan(graph: &Graph) -> Result<ExecutionPlan, Error> { // Build petgraph let mut pg: DiGraph<&str, ()> = DiGraph::new(); let mut index_map: HashMap<&str, NodeIndex> = HashMap::new(); @@ -33,50 +51,26 @@ pub fn build_plan(graph: &Graph, registry: &Registry) -> Result<ExecutionPlan, N for edge in &graph.edges { let from = index_map.get(edge.from.as_str()).ok_or_else(|| { - NvisyError::validation(format!("Unknown edge source: {}", edge.from), "compiler") + Error::validation(format!("Unknown edge source: {}", edge.from), "compiler") })?; let to = index_map.get(edge.to.as_str()).ok_or_else(|| { - NvisyError::validation(format!("Unknown edge target: {}", edge.to), "compiler") + Error::validation(format!("Unknown edge target: {}", edge.to), "compiler") })?; pg.add_edge(*from, *to, ()); } // Cycle detection if is_cyclic_directed(&pg) { - return Err(NvisyError::validation("Graph contains a cycle", "compiler")); + return Err(Error::validation("Graph contains a cycle", "compiler")); } // Topological sort let topo = toposort(&pg, None).map_err(|_| { - NvisyError::validation("Graph contains a cycle", "compiler") + Error::validation("Graph contains a cycle", "compiler") })?; let topo_order: Vec<String> = topo.iter().map(|idx| pg[*idx].to_string()).collect(); - // Resolve nodes against registry - for node in &graph.nodes { - match node { - GraphNode::Action { action, params, .. } => { - let _a = registry.get_action(action).ok_or_else(|| { - NvisyError::validation(format!("Unknown action: {}", action), "compiler") - })?; - _a.validate_params(params)?; - } - GraphNode::Source { provider, stream, .. } => { - let source_key = format!("{}/{}", provider, stream); - let _s = registry.get_source(&source_key).ok_or_else(|| { - NvisyError::validation(format!("Unknown source: {}", source_key), "compiler") - })?; - } - GraphNode::Target { provider, stream, .. } => { - let target_key = format!("{}/{}", provider, stream); - let _t = registry.get_target(&target_key).ok_or_else(|| { - NvisyError::validation(format!("Unknown target: {}", target_key), "compiler") - })?; - } - } - } - // Build resolved nodes with adjacency info let node_map: HashMap<&str, &GraphNode> = graph.nodes.iter().map(|n| (n.id(), n)).collect(); let mut resolved = Vec::new(); diff --git a/crates/nvisy-engine/src/connections/mod.rs b/crates/nvisy-engine/src/connections/mod.rs index 98384bf..73793a1 100644 --- a/crates/nvisy-engine/src/connections/mod.rs +++ b/crates/nvisy-engine/src/connections/mod.rs @@ -1,16 +1,25 @@ +//! External service connection definitions. +//! +//! A [`Connection`] holds the provider type, credentials, and optional context +//! needed to interact with an external service (e.g. S3, a database). +//! [`Connections`] is a type alias mapping connection IDs to their definitions. + use std::collections::HashMap; use serde::{Deserialize, Serialize}; -/// A validated connection to an external service. +/// A validated connection to an external service such as S3 or a database. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct Connection { + /// Provider type identifier (e.g. 
`"s3"`, `"postgres"`). #[serde(rename = "type")] pub provider_type: String, + /// Opaque credentials payload specific to the provider. pub credentials: serde_json::Value, + /// Optional provider-specific context (e.g. region, endpoint overrides). #[serde(default)] pub context: serde_json::Value, } -/// Map of connection_id -> Connection +/// Map of connection IDs to their [`Connection`] definitions. pub type Connections = HashMap<String, Connection>; diff --git a/crates/nvisy-engine/src/executor/context.rs b/crates/nvisy-engine/src/executor/context.rs index 2731e0c..0189e88 100644 --- a/crates/nvisy-engine/src/executor/context.rs +++ b/crates/nvisy-engine/src/executor/context.rs @@ -1,13 +1,21 @@ +//! Channel primitives used to wire data flow between pipeline nodes. +//! +//! [`EdgeChannel`] carries [`Blob`] items along a graph edge, while +//! [`NodeSignal`] broadcasts node completion. + use tokio::sync::{mpsc, watch}; -use nvisy_core::data::DataValue; +use nvisy_core::datatypes::blob::Blob; -/// Buffer size for inter-node channels. +/// Default buffer size for bounded inter-node MPSC channels. pub const CHANNEL_BUFFER_SIZE: usize = 256; -/// Wiring for a single edge: sender + receiver pair. +/// A bounded MPSC channel pair used to transfer [`Blob`] items along a +/// single graph edge from an upstream node to a downstream node. pub struct EdgeChannel { - pub sender: mpsc::Sender<DataValue>, - pub receiver: mpsc::Receiver<DataValue>, + /// Sending half, held by the upstream node. + pub sender: mpsc::Sender<Blob>, + /// Receiving half, held by the downstream node. + pub receiver: mpsc::Receiver<Blob>, } impl Default for EdgeChannel { @@ -17,15 +25,21 @@ impl Default for EdgeChannel { } impl EdgeChannel { + /// Creates a new edge channel with [`CHANNEL_BUFFER_SIZE`] capacity. pub fn new() -> Self { let (sender, receiver) = mpsc::channel(CHANNEL_BUFFER_SIZE); Self { sender, receiver } } } -/// Signals that a node has completed. +/// A watch channel pair used to signal that a node has completed execution. +/// +/// The sender broadcasts `true` when the node finishes, and downstream nodes +/// wait on the receiver before starting. pub struct NodeSignal { + /// Sending half; set to `true` when the node completes. pub sender: watch::Sender<bool>, + /// Receiving half; downstream tasks call `wait_for(|&done| done)`. pub receiver: watch::Receiver<bool>, } @@ -36,6 +50,7 @@ impl Default for NodeSignal { } impl NodeSignal { + /// Creates a new node signal initialized to `false` (not completed). pub fn new() -> Self { let (sender, receiver) = watch::channel(false); Self { sender, receiver } diff --git a/crates/nvisy-engine/src/executor/mod.rs b/crates/nvisy-engine/src/executor/mod.rs index 31905ca..7669b01 100644 --- a/crates/nvisy-engine/src/executor/mod.rs +++ b/crates/nvisy-engine/src/executor/mod.rs @@ -1,5 +1,9 @@ +//! Pipeline execution runtime. +//! +//! Spawns concurrent Tokio tasks for each node in topological order, +//! wires inter-node channels, and collects per-node results. 
+ pub mod context; -pub mod nodes; pub mod runner; pub use runner::run_graph; diff --git a/crates/nvisy-engine/src/executor/nodes.rs b/crates/nvisy-engine/src/executor/nodes.rs deleted file mode 100644 index 4c97921..0000000 --- a/crates/nvisy-engine/src/executor/nodes.rs +++ /dev/null @@ -1,63 +0,0 @@ -use std::any::Any; -use tokio::sync::mpsc; -use nvisy_core::data::DataValue; -use nvisy_core::errors::NvisyError; -use nvisy_core::registry::Registry; -use crate::schema::GraphNode; - -/// Execute a source node: read from external system into output channel. -pub async fn execute_source( - node: &GraphNode, - output: mpsc::Sender<DataValue>, - registry: &Registry, - client: Box<dyn Any + Send>, -) -> Result<u64, NvisyError> { - match node { - GraphNode::Source { provider, stream, params, .. } => { - let source_key = format!("{}/{}", provider, stream); - let source = registry.get_source(&source_key).ok_or_else(|| { - NvisyError::runtime(format!("Source not found: {}", source_key), "executor", false) - })?; - source.read(output, params.clone(), client).await - } - _ => Err(NvisyError::runtime("Expected source node", "executor", false)), - } -} - -/// Execute an action node: consume from input, produce to output. -pub async fn execute_action( - node: &GraphNode, - input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, - registry: &Registry, - client: Option<Box<dyn Any + Send>>, -) -> Result<u64, NvisyError> { - match node { - GraphNode::Action { action, params, .. } => { - let act = registry.get_action(action).ok_or_else(|| { - NvisyError::runtime(format!("Action not found: {}", action), "executor", false) - })?; - act.execute(input, output, params.clone(), client).await - } - _ => Err(NvisyError::runtime("Expected action node", "executor", false)), - } -} - -/// Execute a target node: consume from input, write to external system. -pub async fn execute_target( - node: &GraphNode, - input: mpsc::Receiver<DataValue>, - registry: &Registry, - client: Box<dyn Any + Send>, -) -> Result<u64, NvisyError> { - match node { - GraphNode::Target { provider, stream, params, .. } => { - let target_key = format!("{}/{}", provider, stream); - let target = registry.get_target(&target_key).ok_or_else(|| { - NvisyError::runtime(format!("Target not found: {}", target_key), "executor", false) - })?; - target.write(input, params.clone(), client).await - } - _ => Err(NvisyError::runtime("Expected target node", "executor", false)), - } -} diff --git a/crates/nvisy-engine/src/executor/runner.rs b/crates/nvisy-engine/src/executor/runner.rs index 838b886..fcfdbf3 100644 --- a/crates/nvisy-engine/src/executor/runner.rs +++ b/crates/nvisy-engine/src/executor/runner.rs @@ -1,45 +1,56 @@ +//! Graph runner that executes a compiled [`ExecutionPlan`]. +//! +//! Each node is spawned as a concurrent Tokio task. Data flows between nodes +//! via bounded MPSC channels, and upstream completion is signalled via watch +//! channels so downstream tasks wait before starting. + use std::collections::HashMap; use tokio::sync::{mpsc, watch}; use tokio::task::JoinSet; use uuid::Uuid; -use nvisy_core::data::DataValue; -use nvisy_core::errors::NvisyError; -use nvisy_core::registry::Registry; +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::error::Error; use crate::compiler::plan::ExecutionPlan; use crate::connections::Connections; use crate::executor::context::CHANNEL_BUFFER_SIZE; -use crate::schema::GraphNode; +use crate::compiler::graph::GraphNode; -/// Result of a single node execution. 
+/// Outcome of executing a single node in the pipeline. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct NodeResult { + /// ID of the node that produced this result. pub node_id: String, + /// Number of data items processed by this node. pub items_processed: u64, + /// Error message if the node failed, or `None` on success. pub error: Option<String>, } -/// Result of an entire graph execution. +/// Aggregate outcome of executing an entire pipeline graph. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunResult { + /// Unique identifier for this execution run. pub run_id: Uuid, + /// Per-node results in completion order. pub node_results: Vec<NodeResult>, + /// `true` if all nodes completed without error. pub success: bool, } -/// Execute a compiled graph plan. +/// Executes a compiled [`ExecutionPlan`] by spawning concurrent tasks for each node. +/// +/// Returns a [`RunResult`] containing per-node outcomes and an overall success flag. pub async fn run_graph( plan: &ExecutionPlan, _connections: &Connections, - _registry: &Registry, -) -> Result<RunResult, NvisyError> { +) -> Result<RunResult, Error> { let run_id = Uuid::new_v4(); // Create channels for each edge - // Key: "from_id -> to_id", value: (sender, receiver) - let mut senders: HashMap<String, Vec<mpsc::Sender<DataValue>>> = HashMap::new(); - let mut receivers: HashMap<String, Vec<mpsc::Receiver<DataValue>>> = HashMap::new(); + let mut senders: HashMap<String, Vec<mpsc::Sender<Blob>>> = HashMap::new(); + let mut receivers: HashMap<String, Vec<mpsc::Receiver<Blob>>> = HashMap::new(); for node in &plan.nodes { let node_id = node.node.id(); @@ -131,9 +142,9 @@ pub async fn run_graph( /// Execute a single node with its channels (simplified -- does not use registry directly). async fn execute_node( _node: &GraphNode, - senders: Vec<mpsc::Sender<DataValue>>, - mut receivers: Vec<mpsc::Receiver<DataValue>>, -) -> Result<u64, NvisyError> { + senders: Vec<mpsc::Sender<Blob>>, + mut receivers: Vec<mpsc::Receiver<Blob>>, +) -> Result<u64, Error> { // For now, forward items from receivers to senders (passthrough behavior). // The actual registry-based dispatch happens via the Engine wrapper. let mut count = 0u64; diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index 465f82e..c096c86 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -1,3 +1,9 @@ +//! DAG execution engine for nvisy pipelines. +//! +//! This crate compiles pipeline definitions into directed acyclic graphs (DAGs), +//! plans topologically-ordered execution, and runs nodes concurrently with +//! retry and timeout policies. + #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] @@ -7,4 +13,6 @@ pub mod connections; pub mod executor; pub mod policies; pub mod runs; -pub mod schema; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-engine/src/policies/mod.rs b/crates/nvisy-engine/src/policies/mod.rs index 04d3a94..eed231b 100644 --- a/crates/nvisy-engine/src/policies/mod.rs +++ b/crates/nvisy-engine/src/policies/mod.rs @@ -1,9 +1,18 @@ +//! Retry and timeout policies for pipeline execution. +//! +//! Provides [`compute_delay`] for backoff calculation, [`with_retry`] for +//! automatic retry of fallible futures, and [`with_timeout`] for deadline +//! enforcement. 
+ use std::time::Duration; use tokio::time; -use nvisy_core::errors::NvisyError; -use crate::schema::{BackoffStrategy, RetryPolicy}; +use nvisy_core::error::Error; +pub mod retry; + +use crate::policies::retry::{BackoffStrategy, RetryPolicy}; -/// Compute delay for a retry attempt. +/// Computes the sleep duration before a retry attempt based on the policy's +/// [`BackoffStrategy`] and the zero-based attempt number. pub fn compute_delay(policy: &RetryPolicy, attempt: u32) -> Duration { let base = Duration::from_millis(policy.delay_ms); match policy.backoff { @@ -18,14 +27,18 @@ pub fn compute_delay(policy: &RetryPolicy, attempt: u32) -> Duration { } } -/// Execute a future with retry logic. +/// Executes a fallible async closure with automatic retry according to the +/// given [`RetryPolicy`]. +/// +/// The closure is invoked up to `max_retries + 1` times. Non-retryable errors +/// (as determined by [`Error::is_retryable`]) are returned immediately. pub async fn with_retry<F, Fut, T>( policy: &RetryPolicy, mut f: F, -) -> Result<T, NvisyError> +) -> Result<T, Error> where F: FnMut() -> Fut, - Fut: std::future::Future<Output = Result<T, NvisyError>>, + Fut: std::future::Future<Output = Result<T, Error>>, { let mut last_err = None; for attempt in 0..=policy.max_retries { @@ -41,20 +54,21 @@ where } } } - Err(last_err.unwrap_or_else(|| NvisyError::runtime("Retry exhausted", "policies", false))) + Err(last_err.unwrap_or_else(|| Error::runtime("Retry exhausted", "policies", false))) } -/// Execute a future with a timeout. +/// Wraps a future with a deadline, returning an [`Error::timeout`] if it +/// does not complete within `timeout_ms` milliseconds. pub async fn with_timeout<F, T>( timeout_ms: u64, f: F, -) -> Result<T, NvisyError> +) -> Result<T, Error> where - F: std::future::Future<Output = Result<T, NvisyError>>, + F: std::future::Future<Output = Result<T, Error>>, { match time::timeout(Duration::from_millis(timeout_ms), f).await { Ok(result) => result, - Err(_) => Err(NvisyError::timeout(format!( + Err(_) => Err(Error::timeout(format!( "Operation timed out after {}ms", timeout_ms ))), diff --git a/crates/nvisy-engine/src/policies/retry.rs b/crates/nvisy-engine/src/policies/retry.rs new file mode 100644 index 0000000..62b2337 --- /dev/null +++ b/crates/nvisy-engine/src/policies/retry.rs @@ -0,0 +1,52 @@ +//! Retry policy types and backoff strategies. +//! +//! [`RetryPolicy`] configures how many times a failed node should be retried, +//! the base delay between attempts, and the [`BackoffStrategy`] to use. + +use serde::{Deserialize, Serialize}; + +/// Retry policy attached to a pipeline node. +/// +/// Defaults to 3 retries with a 1 000 ms fixed delay. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct RetryPolicy { + /// Maximum number of retry attempts after the initial failure. + #[serde(default = "default_max_retries")] + pub max_retries: u32, + /// Base delay in milliseconds between retry attempts. + #[serde(default = "default_delay_ms")] + pub delay_ms: u64, + /// Strategy used to compute the delay between successive retries. + #[serde(default)] + pub backoff: BackoffStrategy, +} + +/// Returns the default maximum retry count (3). +fn default_max_retries() -> u32 { 3 } +/// Returns the default base delay in milliseconds (1 000). 
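The retry behaviour documented for `with_retry` (up to `max_retries + 1` attempts, immediate return on a non-retryable error) reduces to the loop below; this synchronous sketch uses a local error type in place of `nvisy_core::error::Error`:

```rust
#[derive(Debug)]
struct SketchError {
    retryable: bool,
}

/// Attempts the closure up to `max_retries + 1` times, bailing out early on a
/// non-retryable error and returning the last error once attempts are exhausted.
fn with_retry_sketch<T>(
    max_retries: u32,
    mut f: impl FnMut(u32) -> Result<T, SketchError>,
) -> Result<T, SketchError> {
    let mut last_err = None;
    for attempt in 0..=max_retries {
        match f(attempt) {
            Ok(v) => return Ok(v),
            Err(e) if !e.retryable => return Err(e), // non-retryable: give up immediately
            Err(e) => last_err = Some(e),
        }
    }
    Err(last_err.expect("at least one attempt was made"))
}

fn main() {
    // Succeeds on the third attempt (zero-based attempt index 2).
    let result = with_retry_sketch(3, |attempt| {
        if attempt < 2 { Err(SketchError { retryable: true }) } else { Ok(attempt) }
    });
    assert_eq!(result.unwrap(), 2);
}
```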
+fn default_delay_ms() -> u64 { 1000 } + +impl Default for RetryPolicy { + fn default() -> Self { + Self { + max_retries: 3, + delay_ms: 1000, + backoff: BackoffStrategy::default(), + } + } +} + +/// Strategy for computing the delay between retry attempts. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum BackoffStrategy { + /// Constant delay equal to `delay_ms` on every attempt. + #[default] + Fixed, + /// Delay doubles with each attempt: `delay_ms * 2^attempt`. + Exponential, + /// Exponential backoff with an added random jitter to prevent thundering herd. + Jitter, +} diff --git a/crates/nvisy-engine/src/prelude.rs b/crates/nvisy-engine/src/prelude.rs new file mode 100644 index 0000000..f0da8ad --- /dev/null +++ b/crates/nvisy-engine/src/prelude.rs @@ -0,0 +1,5 @@ +//! Convenience re-exports. +pub use crate::compiler::plan::{build_plan, ExecutionPlan, ResolvedNode}; +pub use crate::executor::runner::{run_graph, RunResult}; +pub use crate::runs::{RunManager, RunState, RunStatus, RunSummary}; +pub use crate::compiler::graph::{Graph, GraphEdge, GraphNode}; diff --git a/crates/nvisy-engine/src/runs/mod.rs b/crates/nvisy-engine/src/runs/mod.rs index d45d7d0..2423efa 100644 --- a/crates/nvisy-engine/src/runs/mod.rs +++ b/crates/nvisy-engine/src/runs/mod.rs @@ -1,3 +1,9 @@ +//! Pipeline run lifecycle management. +//! +//! Tracks the status of every pipeline execution from creation through +//! completion or cancellation. Provides [`RunManager`] for concurrent +//! read/write access to run state. + use std::collections::HashMap; use std::sync::Arc; use chrono::{DateTime, Utc}; @@ -6,62 +12,88 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use crate::executor::runner::RunResult; -/// Status of a pipeline run. +/// Lifecycle status of a pipeline run. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum RunStatus { + /// The run has been created but not yet started. Pending, + /// The run is actively executing nodes. Running, + /// All nodes completed without error. Success, + /// Some nodes succeeded while others failed. PartialFailure, + /// All nodes failed. Failure, + /// The run was cancelled by the caller. Cancelled, } -/// Progress of a single node within a run. +/// Execution progress of a single node within a run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct NodeProgress { + /// ID of the node this progress belongs to. pub node_id: String, + /// Current status of this node. pub status: RunStatus, + /// Number of data items processed so far. pub items_processed: u64, + /// Error message if the node failed. #[serde(skip_serializing_if = "Option::is_none")] pub error: Option<String>, } -/// Full state of a run. +/// Complete mutable state of a pipeline run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunState { + /// Unique run identifier. pub id: Uuid, + /// Current overall status. pub status: RunStatus, + /// Timestamp when the run was created. pub created_at: DateTime<Utc>, + /// Timestamp when the run finished, if applicable. 
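For orientation, the exponential backoff arithmetic documented on `BackoffStrategy::Exponential`, plus an example policy payload spelled with `RetryPolicy`'s field names and the snake_case backoff variants (the shift cap below is only a sketch-level overflow guard, not taken from the crate):

```rust
use std::time::Duration;

/// `delay_ms * 2^attempt` for a zero-based attempt counter (jitter not shown).
fn exponential_delay(delay_ms: u64, attempt: u32) -> Duration {
    Duration::from_millis(delay_ms.saturating_mul(1u64 << attempt.min(20)))
}

fn main() {
    // With the default delay_ms of 1000: 1s, 2s, 4s, 8s, ...
    assert_eq!(exponential_delay(1000, 0), Duration::from_millis(1000));
    assert_eq!(exponential_delay(1000, 3), Duration::from_millis(8000));

    // A policy overriding the documented defaults (3 retries, 1000 ms, fixed).
    let policy = serde_json::json!({
        "max_retries": 5,
        "delay_ms": 250,
        "backoff": "exponential"
    });
    println!("{policy}");
}
```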
#[serde(skip_serializing_if = "Option::is_none")] pub completed_at: Option<DateTime<Utc>>, + /// Per-node progress keyed by node ID. pub node_progress: HashMap<String, NodeProgress>, + /// Final result after the run completes. #[serde(skip_serializing_if = "Option::is_none")] pub result: Option<RunResult>, } -/// Summary of a run for listing. +/// Lightweight summary of a run for listing endpoints. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] pub struct RunSummary { + /// Unique run identifier. pub id: Uuid, + /// Current overall status. pub status: RunStatus, + /// Timestamp when the run was created. pub created_at: DateTime<Utc>, + /// Timestamp when the run finished, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub completed_at: Option<DateTime<Utc>>, } -/// Manages all tracked runs. +/// Thread-safe manager that tracks all pipeline runs. +/// +/// Internally uses [`RwLock`]-protected maps so multiple readers can inspect +/// run state concurrently while writes are serialized. pub struct RunManager { + /// All known runs keyed by their UUID. runs: Arc<RwLock<HashMap<Uuid, RunState>>>, + /// Cancellation tokens for runs that are still in progress. cancel_tokens: Arc<RwLock<HashMap<Uuid, CancellationToken>>>, } impl RunManager { + /// Creates a new, empty run manager. pub fn new() -> Self { Self { runs: Arc::new(RwLock::new(HashMap::new())), diff --git a/crates/nvisy-object/src/client/mod.rs b/crates/nvisy-object/src/client/mod.rs index 7479f4e..df925d2 100644 --- a/crates/nvisy-object/src/client/mod.rs +++ b/crates/nvisy-object/src/client/mod.rs @@ -1,31 +1,52 @@ -use async_trait::async_trait; +//! Abstract object-store client trait and helper types. +//! +//! The [`ObjectStoreClient`] trait defines the CRUD surface that every backend +//! (S3, GCS, local filesystem, etc.) must implement. [`ObjectStoreBox`] wraps +//! a concrete client so it can be passed through the engine as `Box<dyn Any + Send>`. + use bytes::Bytes; -/// Result of a list operation. +/// Result returned by [`ObjectStoreClient::list`]. pub struct ListResult { + /// Object keys matching the requested prefix. pub keys: Vec<String>, + /// Opaque pagination cursor; `None` when there are no more pages. pub next_cursor: Option<String>, } /// Abstract client for object storage operations. -#[async_trait] +/// +/// Implementations provide list, get, put, and delete over a single bucket +/// or container. +#[async_trait::async_trait] pub trait ObjectStoreClient: Send + Sync + 'static { + /// List object keys under `prefix`, optionally continuing from `cursor`. async fn list(&self, prefix: &str, cursor: Option<&str>) -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>>; + /// Retrieve the object stored at `key`. async fn get(&self, key: &str) -> Result<GetResult, Box<dyn std::error::Error + Send + Sync>>; + /// Upload `data` to `key`, optionally setting the content-type header. async fn put(&self, key: &str, data: Bytes, content_type: Option<&str>) -> Result<(), Box<dyn std::error::Error + Send + Sync>>; + /// Delete the object at `key`. async fn delete(&self, key: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>>; } -/// Result of a get operation. +/// Result returned by [`ObjectStoreClient::get`]. pub struct GetResult { + /// Raw bytes of the retrieved object. pub data: Bytes, + /// MIME content-type, if the backend provides one. 
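To make the `ObjectStoreClient` surface concrete, an illustrative in-memory backend such as one might use in tests (pagination is not modelled, error handling is deliberately simple, and the imports from this crate's client module are assumed):

```rust
use std::collections::BTreeMap;
use std::sync::Mutex;

use bytes::Bytes;

// Assumes the trait and result types from the client module shown above.
use crate::client::{GetResult, ListResult, ObjectStoreClient};

/// In-memory store keyed by object name; `list` ignores cursors entirely.
#[derive(Default)]
struct MemoryStore {
    objects: Mutex<BTreeMap<String, (Bytes, Option<String>)>>,
}

#[async_trait::async_trait]
impl ObjectStoreClient for MemoryStore {
    async fn list(&self, prefix: &str, _cursor: Option<&str>)
        -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>> {
        let keys: Vec<String> = self.objects.lock().unwrap().keys()
            .filter(|k| k.starts_with(prefix))
            .cloned()
            .collect();
        Ok(ListResult { keys, next_cursor: None })
    }

    async fn get(&self, key: &str)
        -> Result<GetResult, Box<dyn std::error::Error + Send + Sync>> {
        let (data, content_type) = self.objects.lock().unwrap()
            .get(key)
            .cloned()
            .ok_or("object not found")?;
        Ok(GetResult { data, content_type })
    }

    async fn put(&self, key: &str, data: Bytes, content_type: Option<&str>)
        -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        self.objects.lock().unwrap()
            .insert(key.to_string(), (data, content_type.map(String::from)));
        Ok(())
    }

    async fn delete(&self, key: &str)
        -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        self.objects.lock().unwrap().remove(key);
        Ok(())
    }
}
```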
pub content_type: Option<String>, } -/// A sized wrapper around a boxed ObjectStoreClient, usable with `Box<dyn Any + Send>`. +/// Type-erased wrapper around a boxed [`ObjectStoreClient`]. +/// +/// This allows the client to be stored as `Box<dyn Any + Send>` inside the +/// engine's `ConnectedInstance` while still being downcasted back to a usable +/// object-store client. pub struct ObjectStoreBox(pub Box<dyn ObjectStoreClient>); impl ObjectStoreBox { + /// Wrap a concrete [`ObjectStoreClient`] implementation. pub fn new(client: impl ObjectStoreClient) -> Self { Self(Box::new(client)) } diff --git a/crates/nvisy-object/src/lib.rs b/crates/nvisy-object/src/lib.rs index 4d091cd..806462b 100644 --- a/crates/nvisy-object/src/lib.rs +++ b/crates/nvisy-object/src/lib.rs @@ -1,3 +1,8 @@ +//! Object storage providers and streams for the nvisy pipeline. +//! +//! This crate provides an abstraction layer over cloud object stores (currently S3) +//! and exposes streaming read/write interfaces that plug into the nvisy engine. + #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] @@ -6,15 +11,5 @@ pub mod client; pub mod providers; pub mod streams; -use nvisy_core::plugin::PluginDescriptor; -use crate::providers::s3::S3ProviderFactory; -use crate::streams::read::ObjectReadStream; -use crate::streams::write::ObjectWriteStream; - -/// Create the object store plugin descriptor. -pub fn object_plugin() -> PluginDescriptor { - PluginDescriptor::new("object") - .with_provider(S3ProviderFactory) - .with_source(ObjectReadStream) - .with_target(ObjectWriteStream) -} +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-object/src/prelude.rs b/crates/nvisy-object/src/prelude.rs new file mode 100644 index 0000000..38fde55 --- /dev/null +++ b/crates/nvisy-object/src/prelude.rs @@ -0,0 +1,4 @@ +//! Convenience re-exports. +pub use crate::providers::s3::S3ProviderFactory; +pub use crate::streams::read::ObjectReadStream; +pub use crate::streams::write::ObjectWriteStream; diff --git a/crates/nvisy-object/src/providers/mod.rs b/crates/nvisy-object/src/providers/mod.rs index 7dce405..17b9082 100644 --- a/crates/nvisy-object/src/providers/mod.rs +++ b/crates/nvisy-object/src/providers/mod.rs @@ -1 +1,3 @@ +//! Object storage provider factories. + pub mod s3; diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs index 556cda1..364a8ad 100644 --- a/crates/nvisy-object/src/providers/s3.rs +++ b/crates/nvisy-object/src/providers/s3.rs @@ -1,25 +1,34 @@ -use async_trait::async_trait; +//! AWS S3 (and S3-compatible) provider implementation. +//! +//! Provides [`S3ObjectStoreClient`] which implements [`ObjectStoreClient`] and +//! [`S3ProviderFactory`] which plugs into the engine's provider system. + use aws_config::BehaviorVersion; use aws_sdk_s3::Client as S3Client; use bytes::Bytes; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; use crate::client::{GetResult, ListResult, ObjectStoreClient}; /// S3-compatible object store client. +/// +/// Wraps the AWS SDK [`S3Client`] and scopes all operations to a single bucket. pub struct S3ObjectStoreClient { + /// Underlying AWS SDK client. client: S3Client, + /// Target S3 bucket name. bucket: String, } impl S3ObjectStoreClient { + /// Create a new client bound to the given `bucket`. 
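+    ///
+    /// A minimal construction sketch; `sdk_client` is assumed to be an
+    /// already-configured AWS SDK [`S3Client`]:
+    ///
+    /// ```ignore
+    /// let store = S3ObjectStoreClient::new(sdk_client, "my-bucket".to_string());
+    /// // Type-erase it so the engine can carry it as `Box<dyn Any + Send>`:
+    /// let boxed = ObjectStoreBox::new(store);
+    /// ```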
pub fn new(client: S3Client, bucket: String) -> Self { Self { client, bucket } } } -#[async_trait] +#[async_trait::async_trait] impl ObjectStoreClient for S3ObjectStoreClient { async fn list(&self, prefix: &str, cursor: Option<&str>) -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>> { let mut req = self.client @@ -85,31 +94,37 @@ impl ObjectStoreClient for S3ObjectStoreClient { } } -/// S3 provider factory. +/// Factory that creates [`S3ObjectStoreClient`] instances from JSON credentials. +/// +/// Expected credential keys: +/// - `bucket` (required) -- S3 bucket name. +/// - `region` (optional, defaults to `us-east-1`). +/// - `endpoint` (optional) -- custom endpoint URL for S3-compatible services. +/// - `accessKeyId` / `secretAccessKey` / `sessionToken` (optional) -- static credentials. pub struct S3ProviderFactory; -#[async_trait] +#[async_trait::async_trait] impl ProviderFactory for S3ProviderFactory { fn id(&self) -> &str { "s3" } - fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error> { let bucket = creds.get("bucket").and_then(|v| v.as_str()); if bucket.is_none() { - return Err(NvisyError::validation("Missing 'bucket' in S3 credentials", "s3")); + return Err(Error::validation("Missing 'bucket' in S3 credentials", "s3")); } Ok(()) } - async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error> { self.validate_credentials(creds)?; // Could do a HeadBucket call here for verification Ok(()) } - async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError> { + async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, Error> { let bucket = creds.get("bucket") .and_then(|v| v.as_str()) - .ok_or_else(|| NvisyError::validation("Missing 'bucket'", "s3"))? + .ok_or_else(|| Error::validation("Missing 'bucket'", "s3"))? .to_string(); let region = creds.get("region") diff --git a/crates/nvisy-object/src/streams/mod.rs b/crates/nvisy-object/src/streams/mod.rs index 6295529..f69ff65 100644 --- a/crates/nvisy-object/src/streams/mod.rs +++ b/crates/nvisy-object/src/streams/mod.rs @@ -1,2 +1,4 @@ +//! Streaming read and write adapters for object stores. + pub mod read; pub mod write; diff --git a/crates/nvisy-object/src/streams/read.rs b/crates/nvisy-object/src/streams/read.rs index dc4ec77..d8159f5 100644 --- a/crates/nvisy-object/src/streams/read.rs +++ b/crates/nvisy-object/src/streams/read.rs @@ -1,33 +1,39 @@ +//! Streaming reader that pulls objects from an S3-compatible store. + use std::any::Any; -use async_trait::async_trait; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use nvisy_core::traits::stream::StreamSource; use crate::client::ObjectStoreBox; +/// A [`StreamSource`] that lists and fetches objects from an S3-compatible store, +/// emitting each object as a [`Blob`] onto the output channel. +/// +/// # Parameters (JSON) +/// +/// - `prefix` -- object key prefix to filter by (default: `""`). +/// - `batchSize` -- number of keys to fetch per page (default: `100`). 
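+///
+/// A hypothetical parameter payload (the key names come from the list above;
+/// the values are illustrative only):
+///
+/// ```ignore
+/// let params = serde_json::json!({
+///     "prefix": "invoices/2026/",
+///     "batchSize": 50
+/// });
+/// ```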
pub struct ObjectReadStream; -#[async_trait] +#[async_trait::async_trait] impl StreamSource for ObjectReadStream { fn id(&self) -> &str { "read" } - fn output_type(&self) -> &str { "blob" } fn required_provider_id(&self) -> &str { "s3" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn read( &self, - output: mpsc::Sender<DataValue>, + output: mpsc::Sender<Blob>, params: serde_json::Value, client: Box<dyn Any + Send>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { - NvisyError::runtime("Invalid client type for object read stream", "object/read", false) + Error::runtime("Invalid client type for object read stream", "object/read", false) })?; let store_client = &store_box.0; @@ -41,7 +47,7 @@ impl StreamSource for ObjectReadStream { let result = store_client .list(prefix, cursor.as_deref()) .await - .map_err(|e| NvisyError::runtime(format!("List failed: {}", e), "object/read", true))?; + .map_err(|e| Error::runtime(format!("List failed: {}", e), "object/read", true))?; let keys_count = result.keys.len(); @@ -49,7 +55,7 @@ impl StreamSource for ObjectReadStream { let get_result = store_client .get(key) .await - .map_err(|e| NvisyError::runtime(format!("Get failed for {}: {}", key, e), "object/read", true))?; + .map_err(|e| Error::runtime(format!("Get failed for {}: {}", key, e), "object/read", true))?; let mut blob = Blob::new(key.clone(), get_result.data); if let Some(ct) = get_result.content_type { @@ -57,7 +63,7 @@ impl StreamSource for ObjectReadStream { } total += 1; - if output.send(DataValue::Blob(blob)).await.is_err() { + if output.send(blob).await.is_err() { return Ok(total); } } diff --git a/crates/nvisy-object/src/streams/write.rs b/crates/nvisy-object/src/streams/write.rs index fd23790..eb258e1 100644 --- a/crates/nvisy-object/src/streams/write.rs +++ b/crates/nvisy-object/src/streams/write.rs @@ -1,53 +1,57 @@ +//! Streaming writer that uploads blobs to an S3-compatible store. + use std::any::Any; -use async_trait::async_trait; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; -use nvisy_core::errors::NvisyError; +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::error::Error; use nvisy_core::traits::stream::StreamTarget; use crate::client::ObjectStoreBox; +/// A [`StreamTarget`] that receives [`Blob`]s from the input channel and +/// uploads each one to an S3-compatible object store. +/// +/// # Parameters (JSON) +/// +/// - `prefix` -- key prefix prepended to each blob path (default: `""`). 
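+///
+/// Illustrative example: with `prefix` set to `"redacted/"`, a blob whose
+/// `path` is `"reports/q1.pdf"` is uploaded under the key
+/// `"redacted/reports/q1.pdf"`:
+///
+/// ```ignore
+/// let params = serde_json::json!({ "prefix": "redacted/" });
+/// ```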
pub struct ObjectWriteStream; -#[async_trait] +#[async_trait::async_trait] impl StreamTarget for ObjectWriteStream { fn id(&self) -> &str { "write" } - fn input_type(&self) -> &str { "blob" } fn required_provider_id(&self) -> &str { "s3" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn write( &self, - mut input: mpsc::Receiver<DataValue>, + mut input: mpsc::Receiver<Blob>, params: serde_json::Value, client: Box<dyn Any + Send>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { - NvisyError::runtime("Invalid client type for object write stream", "object/write", false) + Error::runtime("Invalid client type for object write stream", "object/write", false) })?; let store_client = &store_box.0; let prefix = params.get("prefix").and_then(|v| v.as_str()).unwrap_or(""); let mut total = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Blob(blob) = item { - let key = if prefix.is_empty() { - blob.path.clone() - } else { - format!("{}{}", prefix, blob.path) - }; - - store_client - .put(&key, blob.content.clone(), blob.content_type()) - .await - .map_err(|e| NvisyError::runtime(format!("Put failed for {}: {}", key, e), "object/write", true))?; - - total += 1; - } + while let Some(blob) = input.recv().await { + let key = if prefix.is_empty() { + blob.path.clone() + } else { + format!("{}{}", prefix, blob.path) + }; + + store_client + .put(&key, blob.content.clone(), blob.content_type()) + .await + .map_err(|e| Error::runtime(format!("Put failed for {}: {}", key, e), "object/write", true))?; + + total += 1; } Ok(total) diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index 45c419d..3cb558d 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -1,113 +1,160 @@ +//! Pipeline actions that perform AI-powered named-entity recognition. +//! +//! Two actions are provided: +//! - [`DetectNerAction`] -- runs NER over text documents. +//! - [`DetectNerImageAction`] -- runs NER over images (OCR + entity detection). + use std::any::Any; -use async_trait::async_trait; use tokio::sync::mpsc; -use nvisy_core::data::DataValue; -use nvisy_core::errors::NvisyError; +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::image::ImageData; +use nvisy_core::error::Error; use nvisy_core::traits::action::Action; use crate::bridge::PythonBridge; use crate::ner::{self, NerConfig}; -/// AI NER detection action for text documents. +/// Pipeline action that detects named entities in text documents. +/// +/// If the incoming [`Blob`] carries `"documents"` artifacts, each document's +/// text is sent through the NER model. Otherwise the raw blob content is +/// interpreted as UTF-8 text. Detected entities are stored as `"entities"` +/// artifacts on the blob. 
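+///
+/// A minimal sketch of reading the detections back downstream, assuming the
+/// blob has already passed through this action:
+///
+/// ```ignore
+/// let entities: Vec<Entity> = blob.get_artifacts("entities")?;
+/// ```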
pub struct DetectNerAction; -#[async_trait] +#[async_trait::async_trait] impl Action for DetectNerAction { fn id(&self) -> &str { "detect-ner" } - fn input_type(&self) -> &str { "document" } - fn output_type(&self) -> &str { "entity" } fn requires_client(&self) -> bool { true } fn required_provider_id(&self) -> Option<&str> { Some("ai") } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let bridge = extract_bridge(client)?; let config = parse_ner_config(¶ms); let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Document(doc) = &item { + while let Some(mut blob) = input.recv().await { + let documents: Vec<Document> = blob.get_artifacts("documents") + .map_err(|e| Error::runtime(format!("Failed to get document artifacts: {}", e), "python/ner", false))?; + + let docs = if documents.is_empty() { + let text = String::from_utf8(blob.content.to_vec()) + .map_err(|e| Error::runtime(format!("Blob content is not valid UTF-8: {}", e), "python/ner", false))?; + vec![Document::new(text)] + } else { + documents + }; + + for doc in &docs { let entities = ner::detect_ner(&bridge, &doc.content, &config).await?; - for mut entity in entities { - entity.source_id = Some(doc.data.id); - entity.data.parent_id = Some(doc.data.id); + for entity in &entities { + blob.add_artifact("entities", entity) + .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner", false))?; count += 1; - if output.send(DataValue::Entity(entity)).await.is_err() { - return Ok(count); - } } } + + if output.send(blob).await.is_err() { + return Ok(count); + } } Ok(count) } } -/// AI NER detection action for images. +/// Pipeline action that detects named entities in images. +/// +/// If the incoming [`Blob`] carries `"images"` artifacts, each image is +/// processed individually. Otherwise the raw blob content is treated as a +/// single image whose MIME type is inferred from the blob metadata. +/// Detected entities are stored as `"entities"` artifacts on the blob. 
pub struct DetectNerImageAction; -#[async_trait] +#[async_trait::async_trait] impl Action for DetectNerImageAction { fn id(&self) -> &str { "detect-ner-image" } - fn input_type(&self) -> &str { "image" } - fn output_type(&self) -> &str { "entity" } fn requires_client(&self) -> bool { true } fn required_provider_id(&self) -> Option<&str> { Some("ai") } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { Ok(()) } async fn execute( &self, - mut input: mpsc::Receiver<DataValue>, - output: mpsc::Sender<DataValue>, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, params: serde_json::Value, client: Option<Box<dyn Any + Send>>, - ) -> Result<u64, NvisyError> { + ) -> Result<u64, Error> { let bridge = extract_bridge(client)?; let config = parse_ner_config(¶ms); let mut count = 0u64; - while let Some(item) = input.recv().await { - if let DataValue::Image(img) = &item { + while let Some(mut blob) = input.recv().await { + let images: Vec<ImageData> = blob.get_artifacts("images") + .map_err(|e| Error::runtime(format!("Failed to get image artifacts: {}", e), "python/ner-image", false))?; + + if images.is_empty() { + let mime_type = blob.content_type().unwrap_or("application/octet-stream").to_string(); let entities = ner::detect_ner_image( &bridge, - &img.image_data, - &img.mime_type, + &blob.content, + &mime_type, &config, ).await?; - for mut entity in entities { - entity.data.parent_id = Some(img.data.id); + for entity in &entities { + blob.add_artifact("entities", entity) + .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner-image", false))?; count += 1; - if output.send(DataValue::Entity(entity)).await.is_err() { - return Ok(count); + } + } else { + for img in &images { + let entities = ner::detect_ner_image( + &bridge, + &img.image_data, + &img.mime_type, + &config, + ).await?; + for entity in &entities { + blob.add_artifact("entities", entity) + .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner-image", false))?; + count += 1; } } } + + if output.send(blob).await.is_err() { + return Ok(count); + } } Ok(count) } } -fn extract_bridge(client: Option<Box<dyn Any + Send>>) -> Result<PythonBridge, NvisyError> { +/// Downcast the opaque provider client to a [`PythonBridge`]. +fn extract_bridge(client: Option<Box<dyn Any + Send>>) -> Result<PythonBridge, Error> { client - .ok_or_else(|| NvisyError::runtime("AI provider client required", "python", false))? + .ok_or_else(|| Error::runtime("AI provider client required", "python", false))? .downcast::<PythonBridge>() .map(|b| *b) - .map_err(|_| NvisyError::runtime("Invalid client type for AI actions", "python", false)) + .map_err(|_| Error::runtime("Invalid client type for AI actions", "python", false)) } +/// Extract [`NerConfig`] from the action's JSON parameters. fn parse_ner_config(params: &serde_json::Value) -> NerConfig { NerConfig { entity_types: params diff --git a/crates/nvisy-python/src/bridge/mod.rs b/crates/nvisy-python/src/bridge/mod.rs index f28c991..d785427 100644 --- a/crates/nvisy-python/src/bridge/mod.rs +++ b/crates/nvisy-python/src/bridge/mod.rs @@ -1,10 +1,17 @@ +//! Lightweight handle to a Python module loaded via PyO3. + use pyo3::prelude::*; -use nvisy_core::errors::NvisyError; +use nvisy_core::error::Error; use crate::error::from_pyerr; -/// Holds a reference to the loaded Python NER module. 
+/// Lightweight handle to a Python NER module. +/// +/// The bridge does **not** hold the GIL or any Python objects; it simply +/// remembers which module to `import` when a detection function is called. +/// The default module name is `"nvisy_ai"`. #[derive(Clone)] pub struct PythonBridge { + /// Dotted Python module name to import (e.g., `"nvisy_ai"`). module_name: String, } @@ -17,7 +24,7 @@ impl PythonBridge { } /// Initialize Python and verify the module can be imported. - pub fn init(&self) -> Result<(), NvisyError> { + pub fn init(&self) -> Result<(), Error> { Python::with_gil(|py| { py.import(&self.module_name) .map_err(from_pyerr)?; diff --git a/crates/nvisy-python/src/error/mod.rs b/crates/nvisy-python/src/error/mod.rs index f173ec1..ed3176f 100644 --- a/crates/nvisy-python/src/error/mod.rs +++ b/crates/nvisy-python/src/error/mod.rs @@ -1,9 +1,11 @@ -use nvisy_core::errors::NvisyError; +//! Conversion utilities from Python errors to [`Error`]. + +use nvisy_core::error::Error; use pyo3::PyErr; use pyo3::types::PyTracebackMethods; -/// Convert a Python error to a NvisyError. -pub fn from_pyerr(err: PyErr) -> NvisyError { +/// Convert a [`PyErr`] into an [`Error`], preserving the Python traceback when available. +pub fn from_pyerr(err: PyErr) -> Error { pyo3::Python::with_gil(|py| { let traceback = err .traceback(py) @@ -12,6 +14,6 @@ pub fn from_pyerr(err: PyErr) -> NvisyError { Some(tb) => format!("{}\n{}", err, tb), None => err.to_string(), }; - NvisyError::python(msg) + Error::python(msg) }) } diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index 9c4c41c..c130f8f 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -1,3 +1,11 @@ +//! Python/PyO3 bridge for AI-powered NER detection. +//! +//! This crate embeds a CPython interpreter via PyO3 and delegates named-entity +//! recognition (NER) to a Python module (`nvisy_ai`). It exposes pipeline +//! [`Action`](nvisy_core::traits::action::Action) implementations as well as a +//! [`ProviderFactory`](nvisy_core::traits::provider::ProviderFactory) for the +//! `"ai"` provider. + #![deny(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] @@ -8,14 +16,5 @@ pub mod error; pub mod ner; pub mod provider; -use nvisy_core::plugin::PluginDescriptor; -use crate::actions::{DetectNerAction, DetectNerImageAction}; -use crate::provider::AiProviderFactory; - -/// Create the Python AI plugin descriptor. -pub fn python_plugin() -> PluginDescriptor { - PluginDescriptor::new("ai") - .with_action(DetectNerAction) - .with_action(DetectNerImageAction) - .with_provider(AiProviderFactory) -} +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index eeb26b4..eecf199 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -1,20 +1,31 @@ +//! Named-entity recognition (NER) detection via a Python AI backend. +//! +//! Functions in this module acquire the GIL, call into the Python `nvisy_ai` +//! module, and convert the returned list of dicts into [`Entity`] values. 
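+//!
+//! A minimal illustrative call, assuming an initialized [`PythonBridge`](crate::bridge::PythonBridge)
+//! and a populated [`NerConfig`]:
+//!
+//! ```ignore
+//! let entities = detect_ner(&bridge, "Jane Doe, SSN 123-45-6789", &config).await?;
+//! ```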
+ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; use nvisy_core::datatypes::entity::{Entity, EntityLocation}; -use nvisy_core::errors::NvisyError; -use nvisy_core::types::{DetectionMethod, EntityCategory}; +use nvisy_core::error::Error; +use nvisy_core::datatypes::entity::{DetectionMethod, EntityCategory}; use crate::bridge::PythonBridge; use crate::error::from_pyerr; -/// Configuration for NER detection. +/// Configuration for NER detection passed to the Python backend. #[derive(Debug, Clone)] pub struct NerConfig { + /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). pub entity_types: Vec<String>, + /// Minimum confidence score to include a detection (0.0 -- 1.0). pub confidence_threshold: f64, + /// Sampling temperature forwarded to the AI model. pub temperature: f64, + /// API key for the AI provider. pub api_key: String, + /// Model identifier (e.g., `"gpt-4"`). pub model: String, + /// AI provider name (e.g., `"openai"`). pub provider: String, } @@ -23,7 +34,7 @@ pub async fn detect_ner( bridge: &PythonBridge, text: &str, config: &NerConfig, -) -> Result<Vec<Entity>, NvisyError> { +) -> Result<Vec<Entity>, Error> { let module_name = bridge.module_name().to_string(); let text = text.to_string(); let config = config.clone(); @@ -49,7 +60,7 @@ pub async fn detect_ner( }) }) .await - .map_err(|e| NvisyError::python(format!("Task join error: {}", e)))? + .map_err(|e| Error::python(format!("Task join error: {}", e)))? } /// Call Python detect_ner_image function via GIL + spawn_blocking. @@ -58,7 +69,7 @@ pub async fn detect_ner_image( image_data: &[u8], mime_type: &str, config: &NerConfig, -) -> Result<Vec<Entity>, NvisyError> { +) -> Result<Vec<Entity>, Error> { let module_name = bridge.module_name().to_string(); let image_data = image_data.to_vec(); let mime_type = mime_type.to_string(); @@ -85,26 +96,26 @@ pub async fn detect_ner_image( }) }) .await - .map_err(|e| NvisyError::python(format!("Task join error: {}", e)))? + .map_err(|e| Error::python(format!("Task join error: {}", e)))? } /// Parse Python list[dict] response into Vec<Entity>. -fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Vec<Entity>, NvisyError> { +fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Vec<Entity>, Error> { let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { - NvisyError::python(format!("Expected list from Python: {}", e)) + Error::python(format!("Expected list from Python: {}", e)) })?; let mut entities = Vec::new(); for item in list.iter() { let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { - NvisyError::python(format!("Expected dict in list: {}", e)) + Error::python(format!("Expected dict in list: {}", e)) })?; let category_str: String = dict .get_item("category") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'category'"))? + .ok_or_else(|| Error::python("Missing 'category'"))? .extract() .map_err(from_pyerr)?; @@ -119,21 +130,21 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve let entity_type: String = dict .get_item("entity_type") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'entity_type'"))? + .ok_or_else(|| Error::python("Missing 'entity_type'"))? .extract() .map_err(from_pyerr)?; let value: String = dict .get_item("value") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'value'"))? + .ok_or_else(|| Error::python("Missing 'value'"))? 
.extract() .map_err(from_pyerr)?; let confidence: f64 = dict .get_item("confidence") .map_err(from_pyerr)? - .ok_or_else(|| NvisyError::python("Missing 'confidence'"))? + .ok_or_else(|| Error::python("Missing 'confidence'"))? .extract() .map_err(from_pyerr)?; diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs new file mode 100644 index 0000000..1b6aca3 --- /dev/null +++ b/crates/nvisy-python/src/prelude.rs @@ -0,0 +1,4 @@ +//! Convenience re-exports. +pub use crate::actions::{DetectNerAction, DetectNerImageAction}; +pub use crate::bridge::PythonBridge; +pub use crate::provider::AiProviderFactory; diff --git a/crates/nvisy-python/src/provider/mod.rs b/crates/nvisy-python/src/provider/mod.rs index 734c3c9..23be9cd 100644 --- a/crates/nvisy-python/src/provider/mod.rs +++ b/crates/nvisy-python/src/provider/mod.rs @@ -1,27 +1,37 @@ -use async_trait::async_trait; -use nvisy_core::errors::NvisyError; +//! AI provider factory for the Python NER bridge. +//! +//! Registers itself as the `"ai"` provider and yields a [`PythonBridge`] +//! instance upon connection. + +use nvisy_core::error::Error; use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; use crate::bridge::PythonBridge; -/// AI provider factory that creates PythonBridge instances. +/// Factory that creates [`PythonBridge`] instances from JSON credentials. +/// +/// Expected credential keys: +/// - `apiKey` (required) -- the API key forwarded to the AI model provider. +/// +/// The Python interpreter is **not** initialized at connection time; it is +/// lazily loaded on the first NER call. pub struct AiProviderFactory; -#[async_trait] +#[async_trait::async_trait] impl ProviderFactory for AiProviderFactory { fn id(&self) -> &str { "ai" } - fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error> { if creds.get("apiKey").and_then(|v| v.as_str()).is_none() { - return Err(NvisyError::validation("Missing 'apiKey' in AI credentials", "ai")); + return Err(Error::validation("Missing 'apiKey' in AI credentials", "ai")); } Ok(()) } - async fn verify(&self, creds: &serde_json::Value) -> Result<(), NvisyError> { + async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error> { self.validate_credentials(creds) } - async fn connect(&self, _creds: &serde_json::Value) -> Result<ConnectedInstance, NvisyError> { + async fn connect(&self, _creds: &serde_json::Value) -> Result<ConnectedInstance, Error> { let bridge = PythonBridge::default(); // Don't init here — Python might not be available at connect time // Init happens lazily when detect_ner is called diff --git a/crates/nvisy-server/Cargo.toml b/crates/nvisy-server/Cargo.toml index 7bbfd39..539ef45 100644 --- a/crates/nvisy-server/Cargo.toml +++ b/crates/nvisy-server/Cargo.toml @@ -29,9 +29,6 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = ["schema"] } nvisy-engine = { workspace = true, features = ["schema"] } -nvisy-detect = { workspace = true, features = [] } -nvisy-object = { workspace = true, features = [] } -nvisy-python = { workspace = true, features = [] } # JSON Schema generation schemars = { workspace = true } @@ -50,7 +47,7 @@ tower-http = { workspace = true, features = ["cors", "trace", "request-id", "lim # OpenAPI / Documentation utoipa = { workspace = true, features = ["axum_extras"] } -utoipa-swagger-ui = { workspace = true, features = ["axum"] } +utoipa-scalar = { 
workspace = true, features = ["axum"] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } diff --git a/crates/nvisy-server/src/app/mod.rs b/crates/nvisy-server/src/app/mod.rs index 8b93b67..45392f7 100644 --- a/crates/nvisy-server/src/app/mod.rs +++ b/crates/nvisy-server/src/app/mod.rs @@ -1,21 +1,26 @@ +//! HTTP application bootstrap and route composition. +//! +//! The [`build_app`] function wires together all Axum routers, middleware +//! (CORS, tracing), and shared application state into a single [`Router`]. + use axum::Router; use std::sync::Arc; use tower_http::cors::{Any, CorsLayer}; use tower_http::trace::TraceLayer; use utoipa::OpenApi; -use utoipa_swagger_ui::SwaggerUi; +use utoipa_scalar::{Scalar, Servable}; use crate::handler; -use crate::service::engine_factory; use crate::service::{AuditStore, AppState, PolicyStore, ServerConfig}; use nvisy_engine::runs::RunManager; -/// Build a fully configured Axum application. +/// Build a fully configured Axum [`Router`] with all handlers and middleware. +/// +/// This constructs the shared [`AppState`], applies CORS and HTTP tracing +/// layers, and merges the health, graphs, redact, policies, audit, and +/// Scalar API-docs routes. pub async fn build_app(_config: &ServerConfig) -> anyhow::Result<Router> { - let registry = engine_factory::create_registry()?; - let state = AppState { - registry: Arc::new(registry), run_manager: Arc::new(RunManager::new()), policy_store: Arc::new(PolicyStore::new()), audit_store: Arc::new(AuditStore::new()), @@ -32,7 +37,7 @@ pub async fn build_app(_config: &ServerConfig) -> anyhow::Result<Router> { .merge(handler::redact::router()) .merge(handler::policies::router()) .merge(handler::audit::router()) - .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", handler::ApiDoc::openapi())) + .merge(Scalar::with_url("/scalar", handler::ApiDoc::openapi())) .layer(TraceLayer::new_for_http()) .layer(cors) .with_state(state); diff --git a/crates/nvisy-server/src/service/engine_factory.rs b/crates/nvisy-server/src/service/engine_factory.rs deleted file mode 100644 index 3c8bb54..0000000 --- a/crates/nvisy-server/src/service/engine_factory.rs +++ /dev/null @@ -1,16 +0,0 @@ -use nvisy_core::registry::Registry; -use nvisy_core::errors::NvisyError; - -/// Create a registry with all standard plugins loaded. -pub fn create_registry() -> Result<Registry, NvisyError> { - let mut registry = Registry::new(); - registry.load(nvisy_detect::detect_plugin())?; - registry.load(nvisy_object::object_plugin())?; - registry.load(nvisy_python::python_plugin())?; - tracing::info!( - actions = ?registry.action_keys(), - providers = ?registry.provider_keys(), - "Registry initialized" - ); - Ok(registry) -} diff --git a/crates/nvisy-server/src/service/mod.rs b/crates/nvisy-server/src/service/mod.rs index 0145003..171a036 100644 --- a/crates/nvisy-server/src/service/mod.rs +++ b/crates/nvisy-server/src/service/mod.rs @@ -1,6 +1,11 @@ +//! Shared application services, configuration, and state. +//! +//! This module re-exports the primary service types and implements Axum's +//! [`FromRef`](axum::extract::FromRef) for each sub-state field so that +//! handlers can extract individual services directly. 
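+//!
+//! A hypothetical handler sketch (the handler name and body are illustrative;
+//! only the sub-state extraction pattern is the point):
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//! use axum::extract::State;
+//!
+//! async fn audit_ping(State(_audit): State<Arc<AuditStore>>) -> &'static str {
+//!     // Extracted straight from `AppState` via the generated `FromRef` impl.
+//!     "ok"
+//! }
+//! ```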
+ pub mod audit_store; pub mod config; -pub mod engine_factory; pub mod policy_store; pub mod state; @@ -9,7 +14,6 @@ use std::sync::Arc; // Re-exports for convenience pub use audit_store::AuditStore; pub use config::ServerConfig; -pub use engine_factory::create_registry; pub use policy_store::PolicyStore; pub use state::AppState; @@ -24,7 +28,6 @@ macro_rules! impl_di { } impl_di! { - registry: Arc<nvisy_core::registry::Registry>, run_manager: Arc<nvisy_engine::runs::RunManager>, policy_store: Arc<PolicyStore>, audit_store: Arc<AuditStore> diff --git a/crates/nvisy-server/src/service/state.rs b/crates/nvisy-server/src/service/state.rs index fc66c6c..5446ad5 100644 --- a/crates/nvisy-server/src/service/state.rs +++ b/crates/nvisy-server/src/service/state.rs @@ -1,14 +1,19 @@ +//! Central application state shared across all HTTP handlers. + use std::sync::Arc; use nvisy_engine::runs::RunManager; use super::audit_store::AuditStore; use super::policy_store::PolicyStore; -use nvisy_core::registry::Registry; -/// Shared application state. +/// Shared application state passed to every Axum handler via [`axum::extract::State`]. +/// +/// Each field is wrapped in an [`Arc`] so cloning the state is cheap. #[derive(Clone)] pub struct AppState { - pub registry: Arc<Registry>, + /// Manages in-flight and completed pipeline runs. pub run_manager: Arc<RunManager>, + /// In-memory store of policy definitions. pub policy_store: Arc<PolicyStore>, + /// In-memory store of audit log entries. pub audit_store: Arc<AuditStore>, } From 4adfdc5fcd56057b1e458936c7c9e16411d30863 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Tue, 10 Feb 2026 15:48:19 +0100 Subject: [PATCH 09/17] refactor(core): restructure modules, add typed traits with associated types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganize nvisy-core: rename DataItem→Data, move entity/redaction/audit types to ontology/, create redaction/ for context+policy, rename traits/ to registry/. Replace serde_json::Value params and Box<dyn Any> clients with associated types (Params, Credentials, Client) on all traits. Update all consumer crates (detect, object, python, server) with typed param structs and concrete client types. 
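
For illustration, the new Action trait shape looks roughly like the sketch
below (names and bounds are approximate, not the exact definitions under
crates/nvisy-core/src/registry/); ProviderFactory similarly gains Credentials
and Client associated types:

    #[async_trait::async_trait]
    pub trait Action: Send + Sync + 'static {
        /// Typed, deserializable parameters (previously a raw serde_json::Value).
        type Params: serde::de::DeserializeOwned + Send;
        /// Concrete provider client (previously Box<dyn Any + Send>).
        type Client: Send;

        fn id(&self) -> &str;
        async fn execute(
            &self,
            input: tokio::sync::mpsc::Receiver<Blob>,
            output: tokio::sync::mpsc::Sender<Blob>,
            params: Self::Params,
            client: Option<Self::Client>,
        ) -> Result<u64, Error>;
    }
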
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- crates/nvisy-core/src/datatypes/blob.rs | 6 +- crates/nvisy-core/src/datatypes/document.rs | 454 +++++++++++++++++- crates/nvisy-core/src/datatypes/image.rs | 69 --- crates/nvisy-core/src/datatypes/mod.rs | 21 +- crates/nvisy-core/src/documents/elements.rs | 241 ---------- crates/nvisy-core/src/documents/mod.rs | 8 - crates/nvisy-core/src/documents/ontology.rs | 129 ----- crates/nvisy-core/src/lib.rs | 5 +- .../src/{datatypes => ontology}/audit.rs | 6 +- .../src/{datatypes => ontology}/entity.rs | 6 +- crates/nvisy-core/src/ontology/mod.rs | 15 + .../src/{datatypes => ontology}/redaction.rs | 6 +- crates/nvisy-core/src/prelude.rs | 15 +- .../context.rs} | 4 +- crates/nvisy-core/src/redaction/mod.rs | 7 + .../src/{datatypes => redaction}/policy.rs | 10 +- .../src/{traits => registry}/action.rs | 24 +- .../src/{traits => registry}/loader.rs | 9 +- .../src/{traits => registry}/mod.rs | 0 .../src/{traits => registry}/provider.rs | 26 +- .../src/{traits => registry}/stream.rs | 29 +- .../src/actions/apply_redaction.rs | 14 +- crates/nvisy-detect/src/actions/classify.rs | 14 +- .../src/actions/detect_checksum.rs | 46 +- .../nvisy-detect/src/actions/detect_regex.rs | 42 +- crates/nvisy-detect/src/actions/emit_audit.rs | 49 +- .../src/actions/evaluate_policy.rs | 62 +-- crates/nvisy-detect/src/actions/mod.rs | 2 +- crates/nvisy-detect/src/loaders/csv_loader.rs | 6 +- .../nvisy-detect/src/loaders/json_loader.rs | 6 +- crates/nvisy-detect/src/loaders/plaintext.rs | 6 +- crates/nvisy-detect/src/patterns/mod.rs | 2 +- crates/nvisy-object/src/providers/s3.rs | 79 +-- crates/nvisy-object/src/streams/read.rs | 42 +- crates/nvisy-object/src/streams/write.rs | 34 +- crates/nvisy-python/src/actions/mod.rs | 120 ++--- crates/nvisy-python/src/lib.rs | 4 +- crates/nvisy-python/src/ner/mod.rs | 4 +- crates/nvisy-python/src/provider/mod.rs | 32 +- crates/nvisy-server/src/handler/redact.rs | 2 +- 40 files changed, 853 insertions(+), 803 deletions(-) delete mode 100644 crates/nvisy-core/src/datatypes/image.rs delete mode 100644 crates/nvisy-core/src/documents/elements.rs delete mode 100644 crates/nvisy-core/src/documents/mod.rs delete mode 100644 crates/nvisy-core/src/documents/ontology.rs rename crates/nvisy-core/src/{datatypes => ontology}/audit.rs (97%) rename crates/nvisy-core/src/{datatypes => ontology}/entity.rs (98%) create mode 100644 crates/nvisy-core/src/ontology/mod.rs rename crates/nvisy-core/src/{datatypes => ontology}/redaction.rs (97%) rename crates/nvisy-core/src/{datatypes/redaction_context.rs => redaction/context.rs} (98%) create mode 100644 crates/nvisy-core/src/redaction/mod.rs rename crates/nvisy-core/src/{datatypes => redaction}/policy.rs (95%) rename crates/nvisy-core/src/{traits => registry}/action.rs (69%) rename crates/nvisy-core/src/{traits => registry}/loader.rs (85%) rename crates/nvisy-core/src/{traits => registry}/mod.rs (100%) rename crates/nvisy-core/src/{traits => registry}/provider.rs (60%) rename crates/nvisy-core/src/{traits => registry}/stream.rs (68%) diff --git a/crates/nvisy-core/src/datatypes/blob.rs b/crates/nvisy-core/src/datatypes/blob.rs index dfdf54d..a5e4c21 100644 --- a/crates/nvisy-core/src/datatypes/blob.rs +++ b/crates/nvisy-core/src/datatypes/blob.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use bytes::Bytes; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; -use super::DataItem; +use super::Data; /// Content type information for a blob. 
/// @@ -34,7 +34,7 @@ pub struct BlobContentInfo { pub struct Blob { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// Storage path or key identifying this blob's origin. pub path: String, /// Raw byte content of the blob. @@ -60,7 +60,7 @@ impl Blob { let content = content.into(); let detected_mime = infer::get(&content).map(|t| t.mime_type().to_string()); Self { - data: DataItem::new(), + data: Data::new(), path: path.into(), content, provided: BlobContentInfo { diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-core/src/datatypes/document.rs index c91b613..fdec9d6 100644 --- a/crates/nvisy-core/src/datatypes/document.rs +++ b/crates/nvisy-core/src/datatypes/document.rs @@ -1,8 +1,382 @@ -//! Parsed document representation. +//! Parsed document representation, structural elements, and element ontology. +use bytes::Bytes; use serde::{Deserialize, Serialize}; -use super::DataItem; -use crate::documents::elements::Element; +use uuid::Uuid; +use super::Data; +use super::Metadata; + +// --------------------------------------------------------------------------- +// Element ontology +// --------------------------------------------------------------------------- + +/// Broad grouping of element types. +/// +/// Every [`ElementType`] belongs to exactly one category, providing +/// a coarse filter for pipeline actions that only operate on certain +/// kinds of content. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum ElementCategory { + /// Narrative text, headings, list items, captions, and addresses. + Text, + /// Tabular data. + Table, + /// Images and other media content. + Media, + /// Source code fragments. + Code, + /// Mathematical formulae. + Math, + /// Form elements such as checkboxes and key-value fields. + Form, + /// Layout markers like page breaks and page numbers. + Layout, + /// Email message content. + Email, +} + +/// Specific structural element type extracted from a document. +/// +/// Each variant maps to a single [`ElementCategory`] via +/// [`ElementType::category`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[serde(rename_all = "kebab-case")] +pub enum ElementType { + // -- Text -- + + /// A document title or section heading. + Title, + /// A block of narrative prose. + NarrativeText, + /// An item within a bulleted or numbered list. + ListItem, + /// A page or section header. + Header, + /// A page or section footer. + Footer, + /// Caption text associated with a figure. + FigureCaption, + /// A physical or mailing address. + Address, + /// Text that does not fit any other text category. + UncategorizedText, + + // -- Table -- + + /// A data table with rows and columns. + Table, + + // -- Media -- + + /// An embedded image. + Image, + + // -- Code -- + + /// A source code snippet or block. + CodeSnippet, + + // -- Math -- + + /// A mathematical formula or equation. + Formula, + + // -- Form -- + + /// A checkbox form control. + Checkbox, + /// A set of key-value pairs extracted from a form. + FormKeysValues, + + // -- Layout -- + + /// A page break marker. + PageBreak, + /// A page number indicator. + PageNumber, + + // -- Email -- + + /// An email message body and headers. 
+ EmailMessage, +} + +impl ElementType { + /// Return the category this element type belongs to. + pub fn category(&self) -> ElementCategory { + match self { + Self::Title + | Self::NarrativeText + | Self::ListItem + | Self::Header + | Self::Footer + | Self::FigureCaption + | Self::Address + | Self::UncategorizedText => ElementCategory::Text, + Self::Table => ElementCategory::Table, + Self::Image => ElementCategory::Media, + Self::CodeSnippet => ElementCategory::Code, + Self::Formula => ElementCategory::Math, + Self::Checkbox | Self::FormKeysValues => ElementCategory::Form, + Self::PageBreak | Self::PageNumber => ElementCategory::Layout, + Self::EmailMessage => ElementCategory::Email, + } + } +} + +/// Parse an element type string and return its category. +/// +/// Returns `None` if the string does not match any known [`ElementType`]. +pub fn category_of(type_str: &str) -> Option<ElementCategory> { + let et: ElementType = + serde_json::from_value(serde_json::Value::String(type_str.to_string())).ok()?; + Some(et.category()) +} + +// --------------------------------------------------------------------------- +// Structural elements +// --------------------------------------------------------------------------- + +/// An inline hyperlink within element text. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct Link { + /// Display text of the hyperlink. + pub text: String, + /// Target URL of the hyperlink. + pub url: String, + /// Character offset where the link text begins in the parent element. + pub start_index: usize, +} + +/// An inline formatting span within element text. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct EmphasizedText { + /// The emphasized text content. + pub text: String, + /// HTML tag name describing the emphasis (e.g. `"b"`, `"i"`, `"em"`). + pub tag: String, +} + +/// A single cell within a table structure. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct TableCellData { + /// Zero-based row index. + pub row: usize, + /// Zero-based column index. + pub column: usize, + /// Text content of the cell. + pub text: String, + /// Whether this cell is a header cell. + #[serde(skip_serializing_if = "Option::is_none")] + pub is_header: Option<bool>, +} + +/// Extraction or OCR provenance data for an element. +/// +/// Records how an element was detected and any extraction +/// confidence metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct ElementProvenance { + /// Confidence score of the extraction (0.0 to 1.0). + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, + /// Name of the extraction engine or model that produced this element. + #[serde(skip_serializing_if = "Option::is_none")] + pub detection_origin: Option<String>, + /// Whether this element continues from a previous element split across pages. + #[serde(skip_serializing_if = "Option::is_none")] + pub is_continuation: Option<bool>, + /// Type of header or footer (e.g. `"primary"`, `"footnote"`), if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub header_footer_type: Option<String>, +} + +/// Structured key-value pair extracted from a form. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct FormKeyValuePair { + /// Form field label or key. + pub key: String, + /// Form field value, if one was extracted. + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option<String>, + /// Extraction confidence for this key-value pair. + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, +} + +/// A single structural element extracted from a document. +/// +/// Combines base element fields with optional type-specific fields +/// (image, table, form, email) in a flat struct rather than inheritance. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct Element { + /// Unique identifier for this element. + pub id: Uuid, + /// The structural type of this element. + #[serde(rename = "type")] + pub element_type: ElementType, + /// Plain-text content of the element. + pub text: String, + + /// Identifier of the parent element (for nested structures). + #[serde(skip_serializing_if = "Option::is_none")] + pub parent_id: Option<Uuid>, + /// 1-based page number where this element appears. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, + /// Named page or sheet label (e.g. worksheet name in a spreadsheet). + #[serde(skip_serializing_if = "Option::is_none")] + pub page_name: Option<String>, + /// Heading level (1-6) for title or header elements. + #[serde(skip_serializing_if = "Option::is_none")] + pub level: Option<u32>, + /// BCP-47 language tags detected in this element. + #[serde(skip_serializing_if = "Option::is_none")] + pub languages: Option<Vec<String>>, + /// Arbitrary metadata associated with this element. + #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Metadata>, + /// Tag identifying the extraction source or pipeline stage. + #[serde(skip_serializing_if = "Option::is_none")] + pub source_tag: Option<String>, + /// HTML representation of the element's text with inline formatting. + #[serde(skip_serializing_if = "Option::is_none")] + pub text_as_html: Option<String>, + /// Inline hyperlinks found within this element's text. + #[serde(skip_serializing_if = "Option::is_none")] + pub links: Option<Vec<Link>>, + /// Inline formatting spans (bold, italic, etc.) within this element. + #[serde(skip_serializing_if = "Option::is_none")] + pub emphasized_texts: Option<Vec<EmphasizedText>>, + /// Extraction or OCR provenance information. + #[serde(skip_serializing_if = "Option::is_none")] + pub provenance: Option<ElementProvenance>, + + // -- Image-specific fields (when element_type is Image) -- + + /// Base64-encoded image data. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_base64: Option<String>, + /// MIME type of the embedded image. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_mime_type: Option<String>, + /// Remote URL of the image. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_url: Option<String>, + /// Local file path of the image. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_path: Option<String>, + + // -- Table-specific fields (when element_type is Table) -- + + /// Individual table cells with row/column coordinates. + #[serde(skip_serializing_if = "Option::is_none")] + pub cells: Option<Vec<TableCellData>>, + + // -- Form-specific fields (when element_type is Checkbox/FormKeysValues) -- + + /// Whether a checkbox is checked. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub checked: Option<bool>, + /// Value of a form field. + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option<String>, + /// Structured key-value pairs extracted from a form. + #[serde(skip_serializing_if = "Option::is_none")] + pub key_value_pairs: Option<Vec<FormKeyValuePair>>, + + // -- Email-specific fields (when element_type is EmailMessage) -- + + /// Sender addresses. + #[serde(skip_serializing_if = "Option::is_none")] + pub sent_from: Option<Vec<String>>, + /// Primary recipient addresses. + #[serde(skip_serializing_if = "Option::is_none")] + pub sent_to: Option<Vec<String>>, + /// CC recipient addresses. + #[serde(skip_serializing_if = "Option::is_none")] + pub cc_recipient: Option<Vec<String>>, + /// BCC recipient addresses. + #[serde(skip_serializing_if = "Option::is_none")] + pub bcc_recipient: Option<Vec<String>>, + /// Email subject line. + #[serde(skip_serializing_if = "Option::is_none")] + pub subject: Option<String>, + /// Email signature block. + #[serde(skip_serializing_if = "Option::is_none")] + pub signature: Option<String>, + /// RFC 2822 Message-ID of the email. + #[serde(skip_serializing_if = "Option::is_none")] + pub email_message_id: Option<String>, +} + +impl Element { + /// Create a new element with the given type and text content. + pub fn new(element_type: ElementType, text: impl Into<String>) -> Self { + Self { + id: Uuid::new_v4(), + element_type, + text: text.into(), + parent_id: None, + page_number: None, + page_name: None, + level: None, + languages: None, + metadata: None, + source_tag: None, + text_as_html: None, + links: None, + emphasized_texts: None, + provenance: None, + image_base64: None, + image_mime_type: None, + image_url: None, + image_path: None, + cells: None, + checked: None, + value: None, + key_value_pairs: None, + sent_from: None, + sent_to: None, + cc_recipient: None, + bcc_recipient: None, + subject: None, + signature: None, + email_message_id: None, + } + } + + /// Set the 1-based page number for this element. + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } + + /// Set the heading level (1-6) for title or header elements. + pub fn with_level(mut self, level: u32) -> Self { + self.level = Some(level); + self + } + + /// Set BCP-47 language tags detected in this element. + pub fn with_languages(mut self, langs: Vec<String>) -> Self { + self.languages = Some(langs); + self + } +} + +// --------------------------------------------------------------------------- +// Document +// --------------------------------------------------------------------------- /// A parsed human-readable text representation of a document. /// @@ -14,7 +388,7 @@ use crate::documents::elements::Element; pub struct Document { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// Full text content of the document. pub content: String, /// Document title, if one was extracted. @@ -35,7 +409,7 @@ impl Document { /// Create a new document from raw text content. 
pub fn new(content: impl Into<String>) -> Self { Self { - data: DataItem::new(), + data: Data::new(), content: content.into(), title: None, elements: None, @@ -72,7 +446,7 @@ impl Document { pub fn from_elements(elements: Vec<Element>) -> Self { let content = elements.iter().map(|e| e.text.as_str()).collect::<Vec<_>>().join("\n\n"); Self { - data: DataItem::new(), + data: Data::new(), content, title: None, elements: Some(elements), @@ -111,3 +485,71 @@ impl Document { map } } + +// --------------------------------------------------------------------------- +// ImageData +// --------------------------------------------------------------------------- + +/// An image extracted from a document or provided directly. +/// +/// Carries the raw pixel data, MIME type, optional dimensions, and +/// provenance information linking back to its source. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct ImageData { + /// Common data-item fields (id, parent_id, metadata). + #[serde(flatten)] + pub data: Data, + /// Raw image bytes (PNG, JPEG, etc.). + #[serde(with = "crate::datatypes::blob::bytes_serde")] + #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] + pub image_data: Bytes, + /// MIME type of the image (e.g. `"image/png"`). + pub mime_type: String, + /// Width of the image in pixels, if known. + #[serde(skip_serializing_if = "Option::is_none")] + pub width: Option<u32>, + /// Height of the image in pixels, if known. + #[serde(skip_serializing_if = "Option::is_none")] + pub height: Option<u32>, + /// File path or URL the image was loaded from, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub source_path: Option<String>, + /// 1-based page number the image was extracted from, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, +} + +impl ImageData { + /// Create a new image from raw bytes and a MIME type. + pub fn new(image_data: impl Into<Bytes>, mime_type: impl Into<String>) -> Self { + Self { + data: Data::new(), + image_data: image_data.into(), + mime_type: mime_type.into(), + width: None, + height: None, + source_path: None, + page_number: None, + } + } + + /// Set the pixel dimensions of the image. + pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { + self.width = Some(width); + self.height = Some(height); + self + } + + /// Record the file path or URL the image originated from. + pub fn with_source_path(mut self, path: impl Into<String>) -> Self { + self.source_path = Some(path.into()); + self + } + + /// Set the page number this image was extracted from. + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } +} diff --git a/crates/nvisy-core/src/datatypes/image.rs b/crates/nvisy-core/src/datatypes/image.rs deleted file mode 100644 index 02067e3..0000000 --- a/crates/nvisy-core/src/datatypes/image.rs +++ /dev/null @@ -1,69 +0,0 @@ -//! Image data extracted from documents or provided directly. - -use bytes::Bytes; -use serde::{Deserialize, Serialize}; -use super::DataItem; - -/// An image extracted from a document or provided directly. -/// -/// Carries the raw pixel data, MIME type, optional dimensions, and -/// provenance information linking back to its source. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct ImageData { - /// Common data-item fields (id, parent_id, metadata). 
- #[serde(flatten)] - pub data: DataItem, - /// Raw image bytes (PNG, JPEG, etc.). - #[serde(with = "crate::datatypes::blob::bytes_serde")] - #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] - pub image_data: Bytes, - /// MIME type of the image (e.g. `"image/png"`). - pub mime_type: String, - /// Width of the image in pixels, if known. - #[serde(skip_serializing_if = "Option::is_none")] - pub width: Option<u32>, - /// Height of the image in pixels, if known. - #[serde(skip_serializing_if = "Option::is_none")] - pub height: Option<u32>, - /// File path or URL the image was loaded from, if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub source_path: Option<String>, - /// 1-based page number the image was extracted from, if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, -} - -impl ImageData { - /// Create a new image from raw bytes and a MIME type. - pub fn new(image_data: impl Into<Bytes>, mime_type: impl Into<String>) -> Self { - Self { - data: DataItem::new(), - image_data: image_data.into(), - mime_type: mime_type.into(), - width: None, - height: None, - source_path: None, - page_number: None, - } - } - - /// Set the pixel dimensions of the image. - pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { - self.width = Some(width); - self.height = Some(height); - self - } - - /// Record the file path or URL the image originated from. - pub fn with_source_path(mut self, path: impl Into<String>) -> Self { - self.source_path = Some(path.into()); - self - } - - /// Set the page number this image was extracted from. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } -} diff --git a/crates/nvisy-core/src/datatypes/mod.rs b/crates/nvisy-core/src/datatypes/mod.rs index 078c1a8..b937467 100644 --- a/crates/nvisy-core/src/datatypes/mod.rs +++ b/crates/nvisy-core/src/datatypes/mod.rs @@ -1,20 +1,13 @@ //! Domain data types for the nvisy pipeline. //! //! This module defines the core data structures that flow through the nvisy -//! processing pipeline: blobs, documents, entities, redactions, audits, -//! policies, and images. +//! processing pipeline: blobs and documents. use serde::{Deserialize, Serialize}; use uuid::Uuid; -pub mod audit; pub mod blob; pub mod document; -pub mod entity; -pub mod image; -pub mod policy; -pub mod redaction; -pub mod redaction_context; /// General-purpose metadata map. pub type Metadata = serde_json::Map<String, serde_json::Value>; @@ -22,11 +15,11 @@ pub type Metadata = serde_json::Map<String, serde_json::Value>; /// Common fields shared by all domain data items. /// /// Every first-class object in the pipeline (blobs, documents, entities, etc.) -/// embeds a `DataItem` to carry a unique identifier, an optional parent +/// embeds a `Data` to carry a unique identifier, an optional parent /// lineage link, and arbitrary metadata. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct DataItem { +pub struct Data { /// Unique identifier for this item, generated as a v4 UUID on creation. pub id: Uuid, /// Identifier of the item this was derived from, if any. @@ -37,8 +30,8 @@ pub struct DataItem { pub metadata: Option<Metadata>, } -impl DataItem { - /// Create a new `DataItem` with a freshly generated UUID and no parent or metadata. +impl Data { + /// Create a new `Data` with a freshly generated UUID and no parent or metadata. 
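+    ///
+    /// A minimal lineage sketch (illustrative):
+    ///
+    /// ```ignore
+    /// let parent = Data::new();
+    /// let child = Data::new().derive_from(&parent);
+    /// assert_eq!(child.parent_id, Some(parent.id));
+    /// ```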
pub fn new() -> Self { Self { id: Uuid::new_v4(), @@ -54,13 +47,13 @@ impl DataItem { } /// Set `parent_id` to the id of `parent`, establishing lineage. - pub fn derive_from(mut self, parent: &DataItem) -> Self { + pub fn derive_from(mut self, parent: &Data) -> Self { self.parent_id = Some(parent.id); self } } -impl Default for DataItem { +impl Default for Data { fn default() -> Self { Self::new() } diff --git a/crates/nvisy-core/src/documents/elements.rs b/crates/nvisy-core/src/documents/elements.rs deleted file mode 100644 index 58f1fdf..0000000 --- a/crates/nvisy-core/src/documents/elements.rs +++ /dev/null @@ -1,241 +0,0 @@ -//! Structural elements extracted from parsed documents. - -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -use crate::documents::ontology::ElementType; -use crate::datatypes::Metadata; - -/// An inline hyperlink within element text. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct Link { - /// Display text of the hyperlink. - pub text: String, - /// Target URL of the hyperlink. - pub url: String, - /// Character offset where the link text begins in the parent element. - pub start_index: usize, -} - -/// An inline formatting span within element text. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct EmphasizedText { - /// The emphasized text content. - pub text: String, - /// HTML tag name describing the emphasis (e.g. `"b"`, `"i"`, `"em"`). - pub tag: String, -} - -/// A single cell within a table structure. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct TableCellData { - /// Zero-based row index. - pub row: usize, - /// Zero-based column index. - pub column: usize, - /// Text content of the cell. - pub text: String, - /// Whether this cell is a header cell. - #[serde(skip_serializing_if = "Option::is_none")] - pub is_header: Option<bool>, -} - -/// Extraction or OCR provenance data for an element. -/// -/// Records how an element was detected and any extraction -/// confidence metadata. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct ElementProvenance { - /// Confidence score of the extraction (0.0 to 1.0). - #[serde(skip_serializing_if = "Option::is_none")] - pub confidence: Option<f64>, - /// Name of the extraction engine or model that produced this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub detection_origin: Option<String>, - /// Whether this element continues from a previous element split across pages. - #[serde(skip_serializing_if = "Option::is_none")] - pub is_continuation: Option<bool>, - /// Type of header or footer (e.g. `"primary"`, `"footnote"`), if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub header_footer_type: Option<String>, -} - -/// Structured key-value pair extracted from a form. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct FormKeyValuePair { - /// Form field label or key. - pub key: String, - /// Form field value, if one was extracted. - #[serde(skip_serializing_if = "Option::is_none")] - pub value: Option<String>, - /// Extraction confidence for this key-value pair. - #[serde(skip_serializing_if = "Option::is_none")] - pub confidence: Option<f64>, -} - -/// A single structural element extracted from a document. 
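// Illustrative sketch, not part of this patch: the renamed `Data` carrier keeps the
// same lineage helpers shown above, so a derived item still points back at its parent
// through `parent_id`.
use nvisy_core::datatypes::Data;

fn lineage_sketch() {
    let parent = Data::new();
    let child = Data::new().derive_from(&parent);
    assert_eq!(child.parent_id, Some(parent.id));
}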
-/// -/// Combines base element fields with optional type-specific fields -/// (image, table, form, email) in a flat struct rather than inheritance. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct Element { - /// Unique identifier for this element. - pub id: Uuid, - /// The structural type of this element. - #[serde(rename = "type")] - pub element_type: ElementType, - /// Plain-text content of the element. - pub text: String, - - /// Identifier of the parent element (for nested structures). - #[serde(skip_serializing_if = "Option::is_none")] - pub parent_id: Option<Uuid>, - /// 1-based page number where this element appears. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, - /// Named page or sheet label (e.g. worksheet name in a spreadsheet). - #[serde(skip_serializing_if = "Option::is_none")] - pub page_name: Option<String>, - /// Heading level (1-6) for title or header elements. - #[serde(skip_serializing_if = "Option::is_none")] - pub level: Option<u32>, - /// BCP-47 language tags detected in this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub languages: Option<Vec<String>>, - /// Arbitrary metadata associated with this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option<Metadata>, - /// Tag identifying the extraction source or pipeline stage. - #[serde(skip_serializing_if = "Option::is_none")] - pub source_tag: Option<String>, - /// HTML representation of the element's text with inline formatting. - #[serde(skip_serializing_if = "Option::is_none")] - pub text_as_html: Option<String>, - /// Inline hyperlinks found within this element's text. - #[serde(skip_serializing_if = "Option::is_none")] - pub links: Option<Vec<Link>>, - /// Inline formatting spans (bold, italic, etc.) within this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub emphasized_texts: Option<Vec<EmphasizedText>>, - /// Extraction or OCR provenance information. - #[serde(skip_serializing_if = "Option::is_none")] - pub provenance: Option<ElementProvenance>, - - // -- Image-specific fields (when element_type is Image) -- - - /// Base64-encoded image data. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_base64: Option<String>, - /// MIME type of the embedded image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_mime_type: Option<String>, - /// Remote URL of the image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_url: Option<String>, - /// Local file path of the image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_path: Option<String>, - - // -- Table-specific fields (when element_type is Table) -- - - /// Individual table cells with row/column coordinates. - #[serde(skip_serializing_if = "Option::is_none")] - pub cells: Option<Vec<TableCellData>>, - - // -- Form-specific fields (when element_type is Checkbox/FormKeysValues) -- - - /// Whether a checkbox is checked. - #[serde(skip_serializing_if = "Option::is_none")] - pub checked: Option<bool>, - /// Value of a form field. - #[serde(skip_serializing_if = "Option::is_none")] - pub value: Option<String>, - /// Structured key-value pairs extracted from a form. - #[serde(skip_serializing_if = "Option::is_none")] - pub key_value_pairs: Option<Vec<FormKeyValuePair>>, - - // -- Email-specific fields (when element_type is EmailMessage) -- - - /// Sender addresses. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub sent_from: Option<Vec<String>>, - /// Primary recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub sent_to: Option<Vec<String>>, - /// CC recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub cc_recipient: Option<Vec<String>>, - /// BCC recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub bcc_recipient: Option<Vec<String>>, - /// Email subject line. - #[serde(skip_serializing_if = "Option::is_none")] - pub subject: Option<String>, - /// Email signature block. - #[serde(skip_serializing_if = "Option::is_none")] - pub signature: Option<String>, - /// RFC 2822 Message-ID of the email. - #[serde(skip_serializing_if = "Option::is_none")] - pub email_message_id: Option<String>, -} - -impl Element { - /// Create a new element with the given type and text content. - pub fn new(element_type: ElementType, text: impl Into<String>) -> Self { - Self { - id: Uuid::new_v4(), - element_type, - text: text.into(), - parent_id: None, - page_number: None, - page_name: None, - level: None, - languages: None, - metadata: None, - source_tag: None, - text_as_html: None, - links: None, - emphasized_texts: None, - provenance: None, - image_base64: None, - image_mime_type: None, - image_url: None, - image_path: None, - cells: None, - checked: None, - value: None, - key_value_pairs: None, - sent_from: None, - sent_to: None, - cc_recipient: None, - bcc_recipient: None, - subject: None, - signature: None, - email_message_id: None, - } - } - - /// Set the 1-based page number for this element. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } - - /// Set the heading level (1-6) for title or header elements. - pub fn with_level(mut self, level: u32) -> Self { - self.level = Some(level); - self - } - - /// Set BCP-47 language tags detected in this element. - pub fn with_languages(mut self, langs: Vec<String>) -> Self { - self.languages = Some(langs); - self - } -} diff --git a/crates/nvisy-core/src/documents/mod.rs b/crates/nvisy-core/src/documents/mod.rs deleted file mode 100644 index 2db6a42..0000000 --- a/crates/nvisy-core/src/documents/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Document structure and element ontology. -//! -//! This module provides the structural representation of parsed documents, -//! including individual elements (paragraphs, tables, images, etc.) and -//! the ontology that classifies them. - -pub mod elements; -pub mod ontology; diff --git a/crates/nvisy-core/src/documents/ontology.rs b/crates/nvisy-core/src/documents/ontology.rs deleted file mode 100644 index cd2ed78..0000000 --- a/crates/nvisy-core/src/documents/ontology.rs +++ /dev/null @@ -1,129 +0,0 @@ -//! Element type ontology and category classification. - -use serde::{Deserialize, Serialize}; - -/// Broad grouping of element types. -/// -/// Every [`ElementType`] belongs to exactly one category, providing -/// a coarse filter for pipeline actions that only operate on certain -/// kinds of content. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum ElementCategory { - /// Narrative text, headings, list items, captions, and addresses. - Text, - /// Tabular data. - Table, - /// Images and other media content. - Media, - /// Source code fragments. - Code, - /// Mathematical formulae. 
- Math, - /// Form elements such as checkboxes and key-value fields. - Form, - /// Layout markers like page breaks and page numbers. - Layout, - /// Email message content. - Email, -} - -/// Specific structural element type extracted from a document. -/// -/// Each variant maps to a single [`ElementCategory`] via -/// [`ElementType::category`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -#[serde(rename_all = "kebab-case")] -pub enum ElementType { - // -- Text -- - - /// A document title or section heading. - Title, - /// A block of narrative prose. - NarrativeText, - /// An item within a bulleted or numbered list. - ListItem, - /// A page or section header. - Header, - /// A page or section footer. - Footer, - /// Caption text associated with a figure. - FigureCaption, - /// A physical or mailing address. - Address, - /// Text that does not fit any other text category. - UncategorizedText, - - // -- Table -- - - /// A data table with rows and columns. - Table, - - // -- Media -- - - /// An embedded image. - Image, - - // -- Code -- - - /// A source code snippet or block. - CodeSnippet, - - // -- Math -- - - /// A mathematical formula or equation. - Formula, - - // -- Form -- - - /// A checkbox form control. - Checkbox, - /// A set of key-value pairs extracted from a form. - FormKeysValues, - - // -- Layout -- - - /// A page break marker. - PageBreak, - /// A page number indicator. - PageNumber, - - // -- Email -- - - /// An email message body and headers. - EmailMessage, -} - -impl ElementType { - /// Return the category this element type belongs to. - pub fn category(&self) -> ElementCategory { - match self { - Self::Title - | Self::NarrativeText - | Self::ListItem - | Self::Header - | Self::Footer - | Self::FigureCaption - | Self::Address - | Self::UncategorizedText => ElementCategory::Text, - Self::Table => ElementCategory::Table, - Self::Image => ElementCategory::Media, - Self::CodeSnippet => ElementCategory::Code, - Self::Formula => ElementCategory::Math, - Self::Checkbox | Self::FormKeysValues => ElementCategory::Form, - Self::PageBreak | Self::PageNumber => ElementCategory::Layout, - Self::EmailMessage => ElementCategory::Email, - } - } -} - -/// Parse an element type string and return its category. -/// -/// Returns `None` if the string does not match any known [`ElementType`]. 
-pub fn category_of(type_str: &str) -> Option<ElementCategory> { - let et: ElementType = - serde_json::from_value(serde_json::Value::String(type_str.to_string())).ok()?; - Some(et.category()) -} diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 7fdb276..3961ed4 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -3,9 +3,10 @@ #![doc = include_str!("../README.md")] pub mod datatypes; -pub mod documents; pub mod error; -pub mod traits; +pub mod ontology; +pub mod redaction; +pub mod registry; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/datatypes/audit.rs b/crates/nvisy-core/src/ontology/audit.rs similarity index 97% rename from crates/nvisy-core/src/datatypes/audit.rs rename to crates/nvisy-core/src/ontology/audit.rs index 5ef13c1..3e91148 100644 --- a/crates/nvisy-core/src/datatypes/audit.rs +++ b/crates/nvisy-core/src/ontology/audit.rs @@ -3,7 +3,7 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::DataItem; +use crate::datatypes::Data; use crate::datatypes::Metadata; /// Kind of auditable action recorded in an [`Audit`] entry. @@ -32,7 +32,7 @@ pub enum AuditAction { pub struct Audit { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// The kind of event this audit entry records. pub action: AuditAction, /// UTC timestamp when the event occurred. @@ -64,7 +64,7 @@ impl Audit { /// Create a new audit record for the given action, timestamped to now. pub fn new(action: AuditAction) -> Self { Self { - data: DataItem::new(), + data: Data::new(), action, timestamp: Utc::now(), entity_id: None, diff --git a/crates/nvisy-core/src/datatypes/entity.rs b/crates/nvisy-core/src/ontology/entity.rs similarity index 98% rename from crates/nvisy-core/src/datatypes/entity.rs rename to crates/nvisy-core/src/ontology/entity.rs index c10a3db..70fc576 100644 --- a/crates/nvisy-core/src/datatypes/entity.rs +++ b/crates/nvisy-core/src/ontology/entity.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::DataItem; +use crate::datatypes::Data; /// Category of sensitive data an entity belongs to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -80,7 +80,7 @@ pub struct EntityLocation { pub struct Entity { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// Broad classification of the sensitive data. pub category: EntityCategory, /// Specific type label (e.g. `"ssn"`, `"email"`, `"credit_card"`). @@ -109,7 +109,7 @@ impl Entity { location: EntityLocation, ) -> Self { Self { - data: DataItem::new(), + data: Data::new(), category, entity_type: entity_type.into(), value: value.into(), diff --git a/crates/nvisy-core/src/ontology/mod.rs b/crates/nvisy-core/src/ontology/mod.rs new file mode 100644 index 0000000..060a638 --- /dev/null +++ b/crates/nvisy-core/src/ontology/mod.rs @@ -0,0 +1,15 @@ +//! Detection and redaction domain types. +//! +//! Types in this module represent the core ontology of the nvisy pipeline: +//! entities (detected sensitive data), redactions (how entities are masked), +//! and audit records (immutable event log). 
+ +pub mod audit; +pub mod entity; +pub mod redaction; + +pub use audit::{Audit, AuditAction}; +pub use entity::{ + BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, +}; +pub use redaction::{Redaction, RedactionMethod}; diff --git a/crates/nvisy-core/src/datatypes/redaction.rs b/crates/nvisy-core/src/ontology/redaction.rs similarity index 97% rename from crates/nvisy-core/src/datatypes/redaction.rs rename to crates/nvisy-core/src/ontology/redaction.rs index faaa649..679736b 100644 --- a/crates/nvisy-core/src/datatypes/redaction.rs +++ b/crates/nvisy-core/src/ontology/redaction.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::DataItem; +use crate::datatypes::Data; /// Strategy used to redact or obfuscate a detected entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -36,7 +36,7 @@ pub enum RedactionMethod { pub struct Redaction { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// Identifier of the entity being redacted. pub entity_id: Uuid, /// Redaction strategy applied to the entity. @@ -61,7 +61,7 @@ impl Redaction { replacement_value: impl Into<String>, ) -> Self { Self { - data: DataItem::new(), + data: Data::new(), entity_id, method, replacement_value: replacement_value.into(), diff --git a/crates/nvisy-core/src/prelude.rs b/crates/nvisy-core/src/prelude.rs index 5e157a1..78ab3e1 100644 --- a/crates/nvisy-core/src/prelude.rs +++ b/crates/nvisy-core/src/prelude.rs @@ -2,12 +2,13 @@ //! //! Import everything from this module to get the most commonly used //! types without individual `use` statements. + pub use crate::datatypes::blob::Blob; -pub use crate::datatypes::DataItem; +pub use crate::datatypes::Data; pub use crate::error::{Error, ErrorKind, Result}; -pub use crate::traits::action::Action; -pub use crate::traits::loader::Loader; -pub use crate::traits::provider::{ConnectedInstance, ProviderFactory}; -pub use crate::traits::stream::{StreamSource, StreamTarget}; -pub use crate::datatypes::entity::{DetectionMethod, EntityCategory}; -pub use crate::datatypes::redaction::RedactionMethod; +pub use crate::registry::action::Action; +pub use crate::registry::loader::Loader; +pub use crate::registry::provider::{ConnectedInstance, ProviderFactory}; +pub use crate::registry::stream::{StreamSource, StreamTarget}; +pub use crate::ontology::entity::{DetectionMethod, EntityCategory}; +pub use crate::ontology::redaction::RedactionMethod; diff --git a/crates/nvisy-core/src/datatypes/redaction_context.rs b/crates/nvisy-core/src/redaction/context.rs similarity index 98% rename from crates/nvisy-core/src/datatypes/redaction_context.rs rename to crates/nvisy-core/src/redaction/context.rs index 78f70df..d2c6064 100644 --- a/crates/nvisy-core/src/datatypes/redaction_context.rs +++ b/crates/nvisy-core/src/redaction/context.rs @@ -1,8 +1,8 @@ //! Request-scoped redaction context for per-invocation control. use serde::{Deserialize, Serialize}; -use crate::datatypes::entity::EntityCategory; -use crate::datatypes::redaction::RedactionMethod; +use crate::ontology::entity::EntityCategory; +use crate::ontology::redaction::RedactionMethod; /// Per-entity-type override for the redaction method. /// diff --git a/crates/nvisy-core/src/redaction/mod.rs b/crates/nvisy-core/src/redaction/mod.rs new file mode 100644 index 0000000..f523c68 --- /dev/null +++ b/crates/nvisy-core/src/redaction/mod.rs @@ -0,0 +1,7 @@ +//! Redaction context and policy types. 
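// Illustrative sketch, not part of this patch: with the re-exports above, downstream
// crates reach the relocated detection types through the new `ontology` path rather
// than the old `datatypes` one.
use nvisy_core::ontology::{Audit, AuditAction, Entity, EntityCategory, Redaction, RedactionMethod};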
+ +pub mod context; +pub mod policy; + +pub use context::{EntityRedactionRule, RedactionContext}; +pub use policy::{Policy, PolicyRule}; diff --git a/crates/nvisy-core/src/datatypes/policy.rs b/crates/nvisy-core/src/redaction/policy.rs similarity index 95% rename from crates/nvisy-core/src/datatypes/policy.rs rename to crates/nvisy-core/src/redaction/policy.rs index fb1b474..584bc3a 100644 --- a/crates/nvisy-core/src/datatypes/policy.rs +++ b/crates/nvisy-core/src/redaction/policy.rs @@ -1,9 +1,9 @@ //! Redaction policies and rules. use serde::{Deserialize, Serialize}; -use super::DataItem; -use crate::datatypes::entity::EntityCategory; -use crate::datatypes::redaction::RedactionMethod; +use crate::datatypes::Data; +use crate::ontology::entity::EntityCategory; +use crate::ontology::redaction::RedactionMethod; /// A single rule within a redaction [`Policy`]. /// @@ -42,7 +42,7 @@ pub struct PolicyRule { pub struct Policy { /// Common data-item fields (id, parent_id, metadata). #[serde(flatten)] - pub data: DataItem, + pub data: Data, /// Human-readable policy name. pub name: String, /// Ordered list of redaction rules. @@ -58,7 +58,7 @@ impl Policy { /// fallback method ([`Mask`](RedactionMethod::Mask)) and threshold (0.5). pub fn new(name: impl Into<String>, rules: Vec<PolicyRule>) -> Self { Self { - data: DataItem::new(), + data: Data::new(), name: name.into(), rules, default_method: RedactionMethod::Mask, diff --git a/crates/nvisy-core/src/traits/action.rs b/crates/nvisy-core/src/registry/action.rs similarity index 69% rename from crates/nvisy-core/src/traits/action.rs rename to crates/nvisy-core/src/registry/action.rs index 824a276..23d5133 100644 --- a/crates/nvisy-core/src/traits/action.rs +++ b/crates/nvisy-core/src/registry/action.rs @@ -1,7 +1,6 @@ //! The `Action` trait -- the fundamental processing unit in a pipeline. -use std::any::Any; - +use serde::de::DeserializeOwned; use tokio::sync::mpsc; use crate::datatypes::blob::Blob; @@ -13,23 +12,19 @@ use crate::error::Error; /// Actions are the primary unit of work in a pipeline. Each action /// receives blobs via an async MPSC channel, transforms them (possibly /// attaching artifacts), and forwards results to the next stage. +/// +/// Actions that need a provider client should hold it as a struct field +/// rather than receiving it as a parameter. #[async_trait::async_trait] pub trait Action: Send + Sync + 'static { + /// Strongly-typed parameters for this action. + type Params: DeserializeOwned + Send; + /// Unique identifier for this action (e.g. "detect-regex"). fn id(&self) -> &str; - /// Whether this action requires a provider client. - fn requires_client(&self) -> bool { - false - } - - /// The provider ID this action requires, if any. - fn required_provider_id(&self) -> Option<&str> { - None - } - /// Validate action parameters. - fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; + fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; /// Execute the action, consuming blobs from input and sending results to output. /// Returns the number of items processed. 
@@ -37,7 +32,6 @@ pub trait Action: Send + Sync + 'static { &self, input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error>; } diff --git a/crates/nvisy-core/src/traits/loader.rs b/crates/nvisy-core/src/registry/loader.rs similarity index 85% rename from crates/nvisy-core/src/traits/loader.rs rename to crates/nvisy-core/src/registry/loader.rs index 6cf660b..16a4f90 100644 --- a/crates/nvisy-core/src/traits/loader.rs +++ b/crates/nvisy-core/src/registry/loader.rs @@ -1,8 +1,10 @@ //! The `Loader` trait for converting raw blobs into structured documents or images. +use serde::de::DeserializeOwned; + use crate::datatypes::blob::Blob; use crate::datatypes::document::Document; -use crate::datatypes::image::ImageData; +use crate::datatypes::document::ImageData; use crate::error::Error; /// Output of a loader -- either a parsed document or an extracted image. @@ -20,6 +22,9 @@ pub enum LoaderOutput { /// content type and extension. #[async_trait::async_trait] pub trait Loader: Send + Sync + 'static { + /// Strongly-typed parameters for this loader. + type Params: DeserializeOwned + Send; + /// Unique identifier for this loader (e.g. `"csv"`, `"pdf"`). fn id(&self) -> &str; /// File extensions this loader handles (e.g. `["csv", "tsv"]`). @@ -31,6 +36,6 @@ pub trait Loader: Send + Sync + 'static { async fn load( &self, blob: &Blob, - params: &serde_json::Value, + params: &Self::Params, ) -> Result<Vec<LoaderOutput>, Error>; } diff --git a/crates/nvisy-core/src/traits/mod.rs b/crates/nvisy-core/src/registry/mod.rs similarity index 100% rename from crates/nvisy-core/src/traits/mod.rs rename to crates/nvisy-core/src/registry/mod.rs diff --git a/crates/nvisy-core/src/traits/provider.rs b/crates/nvisy-core/src/registry/provider.rs similarity index 60% rename from crates/nvisy-core/src/traits/provider.rs rename to crates/nvisy-core/src/registry/provider.rs index 5fde2ad..132b58c 100644 --- a/crates/nvisy-core/src/traits/provider.rs +++ b/crates/nvisy-core/src/registry/provider.rs @@ -1,20 +1,17 @@ //! Provider factory trait for creating authenticated client connections. -use std::any::Any; use std::future::Future; use std::pin::Pin; +use serde::de::DeserializeOwned; + use crate::error::Error; -/// A connected provider instance holding an opaque client and an +/// A connected provider instance holding a typed client and an /// optional async disconnect callback. -/// -/// The `client` is type-erased so that different providers (S3, OpenAI, -/// databases, etc.) can return their own client types without requiring -/// a common interface. -pub struct ConnectedInstance { - /// Type-erased client handle, downcast by consumers to the concrete type. - pub client: Box<dyn Any + Send>, +pub struct ConnectedInstance<C> { + /// Typed client handle. + pub client: C, /// Optional cleanup function called when the connection is no longer needed. pub disconnect: Option<Box<dyn FnOnce() -> Pin<Box<dyn Future<Output = ()> + Send>> + Send>>, } @@ -25,15 +22,20 @@ pub struct ConnectedInstance { /// and client construction for a specific provider (e.g. S3, OpenAI). #[async_trait::async_trait] pub trait ProviderFactory: Send + Sync + 'static { + /// Strongly-typed credentials for this provider. + type Credentials: DeserializeOwned + Send; + /// The client type produced by [`connect`](Self::connect). + type Client: Send + 'static; + /// Unique identifier (e.g. "s3", "openai"). 
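// Illustrative sketch, not part of this patch: a minimal Action under the new
// typed-Params design. `PassthroughAction` is hypothetical; it forwards blobs unchanged
// and uses `()` as its parameter type, as the detect actions in this patch do.
use nvisy_core::datatypes::blob::Blob;
use nvisy_core::error::Error;
use nvisy_core::registry::action::Action;
use tokio::sync::mpsc;

pub struct PassthroughAction;

#[async_trait::async_trait]
impl Action for PassthroughAction {
    type Params = ();

    fn id(&self) -> &str {
        "passthrough"
    }

    fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> {
        Ok(())
    }

    async fn execute(
        &self,
        mut input: mpsc::Receiver<Blob>,
        output: mpsc::Sender<Blob>,
        _params: Self::Params,
    ) -> Result<u64, Error> {
        let mut count = 0u64;
        while let Some(blob) = input.recv().await {
            // Forward each blob to the next stage untouched.
            output
                .send(blob)
                .await
                .map_err(|_| Error::runtime("output channel closed", "passthrough", false))?;
            count += 1;
        }
        Ok(count)
    }
}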
fn id(&self) -> &str; /// Validate credentials shape without connecting. - fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error>; + fn validate_credentials(&self, creds: &Self::Credentials) -> Result<(), Error>; /// Verify credentials by attempting a lightweight connection. - async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error>; + async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error>; /// Create a connected instance. - async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, Error>; + async fn connect(&self, creds: &Self::Credentials) -> Result<ConnectedInstance<Self::Client>, Error>; } diff --git a/crates/nvisy-core/src/traits/stream.rs b/crates/nvisy-core/src/registry/stream.rs similarity index 68% rename from crates/nvisy-core/src/traits/stream.rs rename to crates/nvisy-core/src/registry/stream.rs index 435d99b..abb6820 100644 --- a/crates/nvisy-core/src/traits/stream.rs +++ b/crates/nvisy-core/src/registry/stream.rs @@ -1,7 +1,6 @@ //! Stream source and target traits for external I/O. -use std::any::Any; - +use serde::de::DeserializeOwned; use tokio::sync::mpsc; use crate::datatypes::blob::Blob; @@ -13,12 +12,15 @@ use crate::error::Error; /// and emit blobs into the pipeline's input channel. #[async_trait::async_trait] pub trait StreamSource: Send + Sync + 'static { + /// Strongly-typed parameters for this stream source. + type Params: DeserializeOwned + Send; + /// The client type this stream requires. + type Client: Send + 'static; + /// Unique identifier for this stream source (e.g. `"s3-read"`). fn id(&self) -> &str; - /// The provider this stream requires (e.g. `"s3"`). - fn required_provider_id(&self) -> &str; /// Validate source parameters before execution. - fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; + fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; /// Read blobs from the external system and send them to `output`. /// @@ -26,8 +28,8 @@ pub trait StreamSource: Send + Sync + 'static { async fn read( &self, output: mpsc::Sender<Blob>, - params: serde_json::Value, - client: Box<dyn Any + Send>, + params: Self::Params, + client: Self::Client, ) -> Result<u64, Error>; } @@ -37,12 +39,15 @@ pub trait StreamSource: Send + Sync + 'static { /// them to a storage backend. #[async_trait::async_trait] pub trait StreamTarget: Send + Sync + 'static { + /// Strongly-typed parameters for this stream target. + type Params: DeserializeOwned + Send; + /// The client type this stream requires. + type Client: Send + 'static; + /// Unique identifier for this stream target (e.g. `"s3-write"`). fn id(&self) -> &str; - /// The provider this stream requires (e.g. `"s3"`). - fn required_provider_id(&self) -> &str; /// Validate target parameters before execution. - fn validate_params(&self, params: &serde_json::Value) -> Result<(), Error>; + fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; /// Receive blobs from `input` and write them to the external system. 
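// Illustrative sketch, not part of this patch: with the typed `Client` associated type,
// callers get a concrete client back from `connect` and no longer downcast a
// `Box<dyn Any>`. `connect_with` is a hypothetical helper generic over any factory.
use nvisy_core::error::Error;
use nvisy_core::registry::provider::ProviderFactory;

async fn connect_with<F: ProviderFactory>(
    factory: &F,
    creds: &F::Credentials,
) -> Result<F::Client, Error> {
    factory.validate_credentials(creds)?;
    let instance = factory.connect(creds).await?;
    Ok(instance.client)
}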
/// @@ -50,7 +55,7 @@ pub trait StreamTarget: Send + Sync + 'static { async fn write( &self, input: mpsc::Receiver<Blob>, - params: serde_json::Value, - client: Box<dyn Any + Send>, + params: Self::Params, + client: Self::Client, ) -> Result<u64, Error>; } diff --git a/crates/nvisy-detect/src/actions/apply_redaction.rs b/crates/nvisy-detect/src/actions/apply_redaction.rs index 3a70b07..48e636a 100644 --- a/crates/nvisy-detect/src/actions/apply_redaction.rs +++ b/crates/nvisy-detect/src/actions/apply_redaction.rs @@ -1,16 +1,15 @@ //! Action that applies pending redactions to document text. -use std::any::Any; use std::collections::HashMap; use tokio::sync::mpsc; use uuid::Uuid; use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::datatypes::entity::Entity; -use nvisy_core::datatypes::redaction::Redaction; +use nvisy_core::ontology::entity::Entity; +use nvisy_core::ontology::redaction::Redaction; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; +use nvisy_core::registry::action::Action; /// Applies pending [`Redaction`] artifacts to document content. /// @@ -32,11 +31,13 @@ struct PendingRedaction { #[async_trait::async_trait] impl Action for ApplyRedactionAction { + type Params = (); + fn id(&self) -> &str { "apply-redaction" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -44,8 +45,7 @@ impl Action for ApplyRedactionAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - _params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + _params: Self::Params, ) -> Result<u64, Error> { let mut count = 0u64; diff --git a/crates/nvisy-detect/src/actions/classify.rs b/crates/nvisy-detect/src/actions/classify.rs index ed5d5e7..d29059f 100644 --- a/crates/nvisy-detect/src/actions/classify.rs +++ b/crates/nvisy-detect/src/actions/classify.rs @@ -1,12 +1,11 @@ //! Sensitivity classification action. -use std::any::Any; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::entity::Entity; +use nvisy_core::ontology::entity::Entity; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; +use nvisy_core::registry::action::Action; /// Assigns a sensitivity level to each blob based on its detected entities. 
/// @@ -18,11 +17,13 @@ pub struct ClassifyAction; #[async_trait::async_trait] impl Action for ClassifyAction { + type Params = (); + fn id(&self) -> &str { "classify" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -30,8 +31,7 @@ impl Action for ClassifyAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - _params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + _params: Self::Params, ) -> Result<u64, Error> { let mut count = 0u64; @@ -78,7 +78,7 @@ fn compute_sensitivity_level(entities: &[Entity]) -> String { let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); let has_critical_types = entities.iter().any(|e| { - matches!(e.category, nvisy_core::datatypes::entity::EntityCategory::Credentials) + matches!(e.category, nvisy_core::ontology::entity::EntityCategory::Credentials) || e.entity_type == "ssn" || e.entity_type == "credit_card" }); diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs index 1c87711..d1b60aa 100644 --- a/crates/nvisy-detect/src/actions/detect_checksum.rs +++ b/crates/nvisy-detect/src/actions/detect_checksum.rs @@ -1,38 +1,47 @@ //! Checksum-based entity validation action. -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::entity::Entity; +use nvisy_core::ontology::entity::{DetectionMethod, Entity}; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; -use nvisy_core::datatypes::entity::DetectionMethod; +use nvisy_core::registry::action::Action; use crate::patterns::validators::luhn_check; +/// Typed parameters for [`DetectChecksumAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectChecksumParams { + /// Whether to discard entities that fail validation. + #[serde(default = "default_true")] + pub drop_invalid: bool, + /// Amount added to confidence on successful validation. + #[serde(default = "default_boost")] + pub confidence_boost: f64, +} + +fn default_true() -> bool { true } +fn default_boost() -> f64 { 0.05 } + /// Validates previously detected entities using checksum algorithms. /// /// Entities whose type has a registered validator (e.g. Luhn for credit cards) /// are verified. Valid matches receive a confidence boost and are re-emitted /// with [`DetectionMethod::Checksum`]. Invalid matches can optionally be /// dropped from the pipeline. -/// -/// # Parameters (JSON) -/// -/// | Key | Type | Default | Description | -/// |-------------------|--------|---------|------------------------------------------------------| -/// | `dropInvalid` | `bool` | `true` | Whether to discard entities that fail validation. | -/// | `confidenceBoost` | `f64` | `0.05` | Amount added to confidence on successful validation. 
| pub struct DetectChecksumAction; #[async_trait::async_trait] impl Action for DetectChecksumAction { + type Params = DetectChecksumParams; + fn id(&self) -> &str { "detect-checksum" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -40,17 +49,10 @@ impl Action for DetectChecksumAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let drop_invalid = params - .get("dropInvalid") - .and_then(|v| v.as_bool()) - .unwrap_or(true); - let confidence_boost = params - .get("confidenceBoost") - .and_then(|v| v.as_f64()) - .unwrap_or(0.05); + let drop_invalid = params.drop_invalid; + let confidence_boost = params.confidence_boost; let mut count = 0u64; diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs index 72a0597..f631839 100644 --- a/crates/nvisy-detect/src/actions/detect_regex.rs +++ b/crates/nvisy-detect/src/actions/detect_regex.rs @@ -1,38 +1,45 @@ //! Regex-based PII/PHI entity detection action. use regex::Regex; -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::datatypes::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_core::ontology::entity::{DetectionMethod, Entity, EntityLocation}; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; +use nvisy_core::registry::action::Action; use crate::patterns::{self, PatternDefinition}; +/// Typed parameters for [`DetectRegexAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectRegexParams { + /// Minimum pattern confidence to emit. + #[serde(default)] + pub confidence_threshold: f64, + /// Subset of built-in pattern names to use. `None` means all. + #[serde(default)] + pub patterns: Option<Vec<String>>, +} + /// Scans document text against compiled regex patterns to detect PII/PHI entities. /// /// For each blob the action reads the `"documents"` artifact (or falls back to /// the raw blob content), runs every active pattern, optionally validates /// matches, and appends resulting [`Entity`] artifacts. -/// -/// # Parameters (JSON) -/// -/// | Key | Type | Default | Description | -/// |----------------------|------------|---------|------------------------------------------| -/// | `confidenceThreshold`| `f64` | `0.0` | Minimum pattern confidence to emit. | -/// | `patterns` | `[String]` | all | Subset of built-in pattern names to use. 
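// Illustrative sketch, not part of this patch: the camelCase payload accepted by
// DetectChecksumParams above; omitted fields fall back to the serde defaults
// (dropInvalid = true, confidenceBoost = 0.05). Assumes this sits in the same module
// and that serde_json is available as a dependency.
fn example_checksum_params() -> Result<DetectChecksumParams, serde_json::Error> {
    serde_json::from_value(serde_json::json!({ "dropInvalid": false }))
}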
| pub struct DetectRegexAction; #[async_trait::async_trait] impl Action for DetectRegexAction { + type Params = DetectRegexParams; + fn id(&self) -> &str { "detect-regex" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -40,17 +47,10 @@ impl Action for DetectRegexAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let confidence_threshold: f64 = params - .get("confidenceThreshold") - .and_then(|v| v.as_f64()) - .unwrap_or(0.0); - - let requested_patterns: Option<Vec<String>> = params - .get("patterns") - .and_then(|v| serde_json::from_value(v.clone()).ok()); + let confidence_threshold = params.confidence_threshold; + let requested_patterns = params.patterns; // Resolve patterns let active_patterns = resolve_patterns(&requested_patterns); diff --git a/crates/nvisy-detect/src/actions/emit_audit.rs b/crates/nvisy-detect/src/actions/emit_audit.rs index 9b64821..d92509c 100644 --- a/crates/nvisy-detect/src/actions/emit_audit.rs +++ b/crates/nvisy-detect/src/actions/emit_audit.rs @@ -1,36 +1,42 @@ //! Audit trail emission action. -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; +use uuid::Uuid; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::audit::Audit; +use nvisy_core::ontology::audit::{Audit, AuditAction}; +use nvisy_core::ontology::redaction::Redaction; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; -use nvisy_core::datatypes::audit::AuditAction; -use nvisy_core::datatypes::redaction::Redaction; +use nvisy_core::registry::action::Action; + +/// Typed parameters for [`EmitAuditAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct EmitAuditParams { + /// Pipeline run identifier to attach. + #[serde(default)] + pub run_id: Option<Uuid>, + /// Human or service identity to record. + #[serde(default)] + pub actor: Option<String>, +} /// Emits an [`Audit`] record for every [`Redaction`] found in the blob. /// /// Each audit entry captures the redaction method, replacement value, and -/// (when available) the originating policy rule ID. Optional `runId` and -/// `actor` parameters are attached to every emitted audit. -/// -/// # Parameters (JSON) -/// -/// | Key | Type | Default | Description | -/// |---------|----------|---------|-------------------------------------| -/// | `runId` | `UUID` | `None` | Pipeline run identifier to attach. | -/// | `actor` | `String` | `None` | Human or service identity to record.| +/// (when available) the originating policy rule ID. 
pub struct EmitAuditAction; #[async_trait::async_trait] impl Action for EmitAuditAction { + type Params = EmitAuditParams; + fn id(&self) -> &str { "emit-audit" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -38,17 +44,10 @@ impl Action for EmitAuditAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let run_id: Option<uuid::Uuid> = params - .get("runId") - .and_then(|v| v.as_str()) - .and_then(|s| s.parse().ok()); - let actor: Option<String> = params - .get("actor") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); + let run_id = params.run_id; + let actor = params.actor; let mut count = 0u64; diff --git a/crates/nvisy-detect/src/actions/evaluate_policy.rs b/crates/nvisy-detect/src/actions/evaluate_policy.rs index 7b2aea7..52dfa60 100644 --- a/crates/nvisy-detect/src/actions/evaluate_policy.rs +++ b/crates/nvisy-detect/src/actions/evaluate_policy.rs @@ -1,15 +1,32 @@ //! Policy evaluation action that maps detected entities to redaction instructions. -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::entity::Entity; -use nvisy_core::datatypes::policy::PolicyRule; -use nvisy_core::datatypes::redaction::Redaction; +use nvisy_core::ontology::entity::Entity; +use nvisy_core::redaction::policy::PolicyRule; +use nvisy_core::ontology::redaction::{Redaction, RedactionMethod}; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::traits::action::Action; -use nvisy_core::datatypes::redaction::RedactionMethod; +use nvisy_core::registry::action::Action; + +/// Typed parameters for [`EvaluatePolicyAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct EvaluatePolicyParams { + /// Ordered policy rules to evaluate. + #[serde(default)] + pub rules: Vec<PolicyRule>, + /// Fallback redaction method when no rule matches. + #[serde(default = "default_method")] + pub default_method: RedactionMethod, + /// Fallback confidence threshold. + #[serde(default = "default_threshold")] + pub default_confidence_threshold: f64, +} + +fn default_method() -> RedactionMethod { RedactionMethod::Mask } +fn default_threshold() -> f64 { 0.5 } /// Evaluates policy rules against detected entities and emits [`Redaction`] artifacts. /// @@ -17,23 +34,17 @@ use nvisy_core::datatypes::redaction::RedactionMethod; /// applies its redaction method and replacement template, and writes a /// `"redactions"` artifact to the blob. Entities that fall below the confidence /// threshold are skipped. -/// -/// # Parameters (JSON) -/// -/// | Key | Type | Default | Description | -/// |------------------------------|-----------------------|----------|----------------------------------------------| -/// | `rules` | `[PolicyRule]` | `[]` | Ordered policy rules to evaluate. | -/// | `defaultMethod` | `RedactionMethod` | `Mask` | Fallback redaction method when no rule matches.| -/// | `defaultConfidenceThreshold` | `f64` | `0.5` | Fallback confidence threshold. 
| pub struct EvaluatePolicyAction; #[async_trait::async_trait] impl Action for EvaluatePolicyAction { + type Params = EvaluatePolicyParams; + fn id(&self) -> &str { "evaluate-policy" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -41,23 +52,12 @@ impl Action for EvaluatePolicyAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - _client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let rules: Vec<PolicyRule> = params - .get("rules") - .and_then(|v| serde_json::from_value(v.clone()).ok()) - .unwrap_or_default(); - let default_method: RedactionMethod = params - .get("defaultMethod") - .and_then(|v| serde_json::from_value(v.clone()).ok()) - .unwrap_or(RedactionMethod::Mask); - let default_threshold: f64 = params - .get("defaultConfidenceThreshold") - .and_then(|v| v.as_f64()) - .unwrap_or(0.5); - - let mut sorted_rules = rules; + let default_method = params.default_method; + let default_threshold = params.default_confidence_threshold; + + let mut sorted_rules = params.rules; sorted_rules.sort_by_key(|r| r.priority); let mut count = 0u64; diff --git a/crates/nvisy-detect/src/actions/mod.rs b/crates/nvisy-detect/src/actions/mod.rs index a9a3168..02cfb1a 100644 --- a/crates/nvisy-detect/src/actions/mod.rs +++ b/crates/nvisy-detect/src/actions/mod.rs @@ -1,6 +1,6 @@ //! Pipeline actions for the detection and redaction workflow. //! -//! Each sub-module exposes a single [`Action`](nvisy_core::traits::action::Action) +//! Each sub-module exposes a single [`Action`](nvisy_core::registry::action::Action) //! implementation that can be wired into an nvisy execution plan. /// Applies pending redactions to document content. diff --git a/crates/nvisy-detect/src/loaders/csv_loader.rs b/crates/nvisy-detect/src/loaders/csv_loader.rs index 6148e6c..79d868d 100644 --- a/crates/nvisy-detect/src/loaders/csv_loader.rs +++ b/crates/nvisy-detect/src/loaders/csv_loader.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::traits::loader::{Loader, LoaderOutput}; +use nvisy_core::registry::loader::{Loader, LoaderOutput}; /// Loads CSV blobs into a single [`Document`] containing the raw CSV text. /// @@ -14,6 +14,8 @@ pub struct CsvLoader; #[async_trait::async_trait] impl Loader for CsvLoader { + type Params = (); + fn id(&self) -> &str { "csv" } @@ -29,7 +31,7 @@ impl Loader for CsvLoader { async fn load( &self, blob: &Blob, - _params: &serde_json::Value, + _params: &Self::Params, ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { Error::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") diff --git a/crates/nvisy-detect/src/loaders/json_loader.rs b/crates/nvisy-detect/src/loaders/json_loader.rs index b93ce96..827443d 100644 --- a/crates/nvisy-detect/src/loaders/json_loader.rs +++ b/crates/nvisy-detect/src/loaders/json_loader.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::traits::loader::{Loader, LoaderOutput}; +use nvisy_core::registry::loader::{Loader, LoaderOutput}; /// Loads JSON blobs into a single [`Document`] containing the raw JSON text. 
/// @@ -14,6 +14,8 @@ pub struct JsonLoader; #[async_trait::async_trait] impl Loader for JsonLoader { + type Params = (); + fn id(&self) -> &str { "json" } @@ -29,7 +31,7 @@ impl Loader for JsonLoader { async fn load( &self, blob: &Blob, - _params: &serde_json::Value, + _params: &Self::Params, ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { Error::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") diff --git a/crates/nvisy-detect/src/loaders/plaintext.rs b/crates/nvisy-detect/src/loaders/plaintext.rs index b4242de..a885a33 100644 --- a/crates/nvisy-detect/src/loaders/plaintext.rs +++ b/crates/nvisy-detect/src/loaders/plaintext.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::traits::loader::{Loader, LoaderOutput}; +use nvisy_core::registry::loader::{Loader, LoaderOutput}; /// Loads plain-text blobs into a single [`Document`]. /// @@ -14,6 +14,8 @@ pub struct PlaintextLoader; #[async_trait::async_trait] impl Loader for PlaintextLoader { + type Params = (); + fn id(&self) -> &str { "plaintext" } @@ -29,7 +31,7 @@ impl Loader for PlaintextLoader { async fn load( &self, blob: &Blob, - _params: &serde_json::Value, + _params: &Self::Params, ) -> Result<Vec<LoaderOutput>, Error> { let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { Error::validation( diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-detect/src/patterns/mod.rs index 226a71c..75a5bd0 100644 --- a/crates/nvisy-detect/src/patterns/mod.rs +++ b/crates/nvisy-detect/src/patterns/mod.rs @@ -9,7 +9,7 @@ pub mod validators; use std::collections::HashMap; use std::sync::LazyLock; -use nvisy_core::datatypes::entity::EntityCategory; +use nvisy_core::ontology::entity::EntityCategory; /// JSON representation of a pattern loaded from disk. #[derive(Debug, Clone, serde::Deserialize)] diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs index 364a8ad..f5f7574 100644 --- a/crates/nvisy-object/src/providers/s3.rs +++ b/crates/nvisy-object/src/providers/s3.rs @@ -6,10 +6,11 @@ use aws_config::BehaviorVersion; use aws_sdk_s3::Client as S3Client; use bytes::Bytes; +use serde::Deserialize; use nvisy_core::error::Error; -use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; -use crate::client::{GetResult, ListResult, ObjectStoreClient}; +use nvisy_core::registry::provider::{ConnectedInstance, ProviderFactory}; +use crate::client::{GetResult, ListResult, ObjectStoreBox, ObjectStoreClient}; /// S3-compatible object store client. /// @@ -94,59 +95,63 @@ impl ObjectStoreClient for S3ObjectStoreClient { } } -/// Factory that creates [`S3ObjectStoreClient`] instances from JSON credentials. -/// -/// Expected credential keys: -/// - `bucket` (required) -- S3 bucket name. -/// - `region` (optional, defaults to `us-east-1`). -/// - `endpoint` (optional) -- custom endpoint URL for S3-compatible services. -/// - `accessKeyId` / `secretAccessKey` / `sessionToken` (optional) -- static credentials. +/// Typed credentials for S3 provider. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct S3Credentials { + /// S3 bucket name. + pub bucket: String, + /// AWS region (defaults to `us-east-1`). + #[serde(default = "default_region")] + pub region: String, + /// Custom endpoint URL for S3-compatible services. 
+ #[serde(default)] + pub endpoint: Option<String>, + /// AWS access key ID for static credentials. + #[serde(default)] + pub access_key_id: Option<String>, + /// AWS secret access key for static credentials. + #[serde(default)] + pub secret_access_key: Option<String>, + /// AWS session token for temporary credentials. + #[serde(default)] + pub session_token: Option<String>, +} + +fn default_region() -> String { "us-east-1".to_string() } + +/// Factory that creates [`S3ObjectStoreClient`] instances from typed credentials. pub struct S3ProviderFactory; #[async_trait::async_trait] impl ProviderFactory for S3ProviderFactory { + type Credentials = S3Credentials; + type Client = ObjectStoreBox; + fn id(&self) -> &str { "s3" } - fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error> { - let bucket = creds.get("bucket").and_then(|v| v.as_str()); - if bucket.is_none() { - return Err(Error::validation("Missing 'bucket' in S3 credentials", "s3")); - } + fn validate_credentials(&self, _creds: &Self::Credentials) -> Result<(), Error> { + // Bucket is required by the struct, so if we got here it's present. Ok(()) } - async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error> { + async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error> { self.validate_credentials(creds)?; // Could do a HeadBucket call here for verification Ok(()) } - async fn connect(&self, creds: &serde_json::Value) -> Result<ConnectedInstance, Error> { - let bucket = creds.get("bucket") - .and_then(|v| v.as_str()) - .ok_or_else(|| Error::validation("Missing 'bucket'", "s3"))? - .to_string(); - - let region = creds.get("region") - .and_then(|v| v.as_str()) - .unwrap_or("us-east-1"); - - let endpoint = creds.get("endpoint") - .and_then(|v| v.as_str()); - + async fn connect(&self, creds: &Self::Credentials) -> Result<ConnectedInstance<Self::Client>, Error> { let mut config_loader = aws_config::defaults(BehaviorVersion::latest()) - .region(aws_sdk_s3::config::Region::new(region.to_string())); + .region(aws_sdk_s3::config::Region::new(creds.region.clone())); // If access_key and secret_key provided, use static credentials - if let (Some(access_key), Some(secret_key)) = ( - creds.get("accessKeyId").and_then(|v| v.as_str()), - creds.get("secretAccessKey").and_then(|v| v.as_str()), - ) { + if let (Some(access_key), Some(secret_key)) = (&creds.access_key_id, &creds.secret_access_key) { config_loader = config_loader.credentials_provider( aws_sdk_s3::config::Credentials::new( access_key, secret_key, - creds.get("sessionToken").and_then(|v| v.as_str()).map(|s| s.to_string()), + creds.session_token.clone(), None, "nvisy-s3", ), @@ -156,15 +161,15 @@ impl ProviderFactory for S3ProviderFactory { let config = config_loader.load().await; let mut s3_config = aws_sdk_s3::config::Builder::from(&config); - if let Some(ep) = endpoint { + if let Some(ref ep) = creds.endpoint { s3_config = s3_config.endpoint_url(ep).force_path_style(true); } let client = S3Client::from_conf(s3_config.build()); - let store_client = S3ObjectStoreClient::new(client, bucket); + let store_client = S3ObjectStoreClient::new(client, creds.bucket.clone()); Ok(ConnectedInstance { - client: Box::new(crate::client::ObjectStoreBox::new(store_client)), + client: ObjectStoreBox::new(store_client), disconnect: None, }) } diff --git a/crates/nvisy-object/src/streams/read.rs b/crates/nvisy-object/src/streams/read.rs index d8159f5..02d7222 100644 --- a/crates/nvisy-object/src/streams/read.rs +++ b/crates/nvisy-object/src/streams/read.rs @@ 
-1,44 +1,52 @@ //! Streaming reader that pulls objects from an S3-compatible store. -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::error::Error; -use nvisy_core::traits::stream::StreamSource; +use nvisy_core::registry::stream::StreamSource; use crate::client::ObjectStoreBox; +/// Typed parameters for [`ObjectReadStream`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ObjectReadParams { + /// Object key prefix to filter by. + #[serde(default)] + pub prefix: String, + /// Number of keys to fetch per page. + #[serde(default = "default_batch_size")] + pub batch_size: usize, +} + +fn default_batch_size() -> usize { 100 } + /// A [`StreamSource`] that lists and fetches objects from an S3-compatible store, /// emitting each object as a [`Blob`] onto the output channel. -/// -/// # Parameters (JSON) -/// -/// - `prefix` -- object key prefix to filter by (default: `""`). -/// - `batchSize` -- number of keys to fetch per page (default: `100`). pub struct ObjectReadStream; #[async_trait::async_trait] impl StreamSource for ObjectReadStream { + type Params = ObjectReadParams; + type Client = ObjectStoreBox; + fn id(&self) -> &str { "read" } - fn required_provider_id(&self) -> &str { "s3" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } async fn read( &self, output: mpsc::Sender<Blob>, - params: serde_json::Value, - client: Box<dyn Any + Send>, + params: Self::Params, + client: Self::Client, ) -> Result<u64, Error> { - let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { - Error::runtime("Invalid client type for object read stream", "object/read", false) - })?; - let store_client = &store_box.0; + let store_client = &client.0; - let prefix = params.get("prefix").and_then(|v| v.as_str()).unwrap_or(""); - let batch_size = params.get("batchSize").and_then(|v| v.as_u64()).unwrap_or(100) as usize; + let prefix = ¶ms.prefix; + let batch_size = params.batch_size; let mut cursor: Option<String> = None; let mut total = 0u64; diff --git a/crates/nvisy-object/src/streams/write.rs b/crates/nvisy-object/src/streams/write.rs index eb258e1..1de42a2 100644 --- a/crates/nvisy-object/src/streams/write.rs +++ b/crates/nvisy-object/src/streams/write.rs @@ -1,42 +1,46 @@ //! Streaming writer that uploads blobs to an S3-compatible store. -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::error::Error; -use nvisy_core::traits::stream::StreamTarget; +use nvisy_core::registry::stream::StreamTarget; use crate::client::ObjectStoreBox; +/// Typed parameters for [`ObjectWriteStream`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ObjectWriteParams { + /// Key prefix prepended to each blob path. + #[serde(default)] + pub prefix: String, +} + /// A [`StreamTarget`] that receives [`Blob`]s from the input channel and /// uploads each one to an S3-compatible object store. -/// -/// # Parameters (JSON) -/// -/// - `prefix` -- key prefix prepended to each blob path (default: `""`). 
pub struct ObjectWriteStream; #[async_trait::async_trait] impl StreamTarget for ObjectWriteStream { + type Params = ObjectWriteParams; + type Client = ObjectStoreBox; + fn id(&self) -> &str { "write" } - fn required_provider_id(&self) -> &str { "s3" } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } async fn write( &self, mut input: mpsc::Receiver<Blob>, - params: serde_json::Value, - client: Box<dyn Any + Send>, + params: Self::Params, + client: Self::Client, ) -> Result<u64, Error> { - let store_box = client.downcast::<ObjectStoreBox>().map_err(|_| { - Error::runtime("Invalid client type for object write stream", "object/write", false) - })?; - let store_client = &store_box.0; + let store_client = &client.0; - let prefix = params.get("prefix").and_then(|v| v.as_str()).unwrap_or(""); + let prefix = ¶ms.prefix; let mut total = 0u64; while let Some(blob) = input.recv().await { diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index 3cb558d..16f4514 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -4,32 +4,63 @@ //! - [`DetectNerAction`] -- runs NER over text documents. //! - [`DetectNerImageAction`] -- runs NER over images (OCR + entity detection). -use std::any::Any; +use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::datatypes::image::ImageData; +use nvisy_core::datatypes::document::ImageData; use nvisy_core::error::Error; -use nvisy_core::traits::action::Action; +use nvisy_core::registry::action::Action; use crate::bridge::PythonBridge; use crate::ner::{self, NerConfig}; +/// Typed parameters for NER actions. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectNerParams { + /// Entity type labels to detect (e.g., `["PERSON", "SSN"]`). + #[serde(default)] + pub entity_types: Vec<String>, + /// Minimum confidence score to include a detection (0.0 -- 1.0). + #[serde(default = "default_confidence_threshold")] + pub confidence_threshold: f64, + /// Sampling temperature forwarded to the AI model. + #[serde(default)] + pub temperature: f64, + /// API key for the AI provider. + #[serde(default)] + pub api_key: String, + /// Model identifier (e.g., `"gpt-4"`). + #[serde(default = "default_model")] + pub model: String, + /// AI provider name (e.g., `"openai"`). + #[serde(default = "default_provider")] + pub provider: String, +} + +fn default_confidence_threshold() -> f64 { 0.5 } +fn default_model() -> String { "gpt-4".to_string() } +fn default_provider() -> String { "openai".to_string() } + /// Pipeline action that detects named entities in text documents. /// /// If the incoming [`Blob`] carries `"documents"` artifacts, each document's /// text is sent through the NER model. Otherwise the raw blob content is /// interpreted as UTF-8 text. Detected entities are stored as `"entities"` /// artifacts on the blob. -pub struct DetectNerAction; +pub struct DetectNerAction { + /// Python bridge used to call the NER model. 
+ pub bridge: PythonBridge, +} #[async_trait::async_trait] impl Action for DetectNerAction { + type Params = DetectNerParams; + fn id(&self) -> &str { "detect-ner" } - fn requires_client(&self) -> bool { true } - fn required_provider_id(&self) -> Option<&str> { Some("ai") } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -37,11 +68,9 @@ impl Action for DetectNerAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let bridge = extract_bridge(client)?; - let config = parse_ner_config(¶ms); + let config = ner_config_from_params(¶ms); let mut count = 0u64; while let Some(mut blob) = input.recv().await { @@ -57,7 +86,7 @@ impl Action for DetectNerAction { }; for doc in &docs { - let entities = ner::detect_ner(&bridge, &doc.content, &config).await?; + let entities = ner::detect_ner(&self.bridge, &doc.content, &config).await?; for entity in &entities { blob.add_artifact("entities", entity) .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner", false))?; @@ -80,15 +109,18 @@ impl Action for DetectNerAction { /// processed individually. Otherwise the raw blob content is treated as a /// single image whose MIME type is inferred from the blob metadata. /// Detected entities are stored as `"entities"` artifacts on the blob. -pub struct DetectNerImageAction; +pub struct DetectNerImageAction { + /// Python bridge used to call the NER model. + pub bridge: PythonBridge, +} #[async_trait::async_trait] impl Action for DetectNerImageAction { + type Params = DetectNerParams; + fn id(&self) -> &str { "detect-ner-image" } - fn requires_client(&self) -> bool { true } - fn required_provider_id(&self) -> Option<&str> { Some("ai") } - fn validate_params(&self, _params: &serde_json::Value) -> Result<(), Error> { + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { Ok(()) } @@ -96,11 +128,9 @@ impl Action for DetectNerImageAction { &self, mut input: mpsc::Receiver<Blob>, output: mpsc::Sender<Blob>, - params: serde_json::Value, - client: Option<Box<dyn Any + Send>>, + params: Self::Params, ) -> Result<u64, Error> { - let bridge = extract_bridge(client)?; - let config = parse_ner_config(¶ms); + let config = ner_config_from_params(¶ms); let mut count = 0u64; while let Some(mut blob) = input.recv().await { @@ -110,7 +140,7 @@ impl Action for DetectNerImageAction { if images.is_empty() { let mime_type = blob.content_type().unwrap_or("application/octet-stream").to_string(); let entities = ner::detect_ner_image( - &bridge, + &self.bridge, &blob.content, &mime_type, &config, @@ -123,7 +153,7 @@ impl Action for DetectNerImageAction { } else { for img in &images { let entities = ner::detect_ner_image( - &bridge, + &self.bridge, &img.image_data, &img.mime_type, &config, @@ -145,44 +175,14 @@ impl Action for DetectNerImageAction { } } -/// Downcast the opaque provider client to a [`PythonBridge`]. -fn extract_bridge(client: Option<Box<dyn Any + Send>>) -> Result<PythonBridge, Error> { - client - .ok_or_else(|| Error::runtime("AI provider client required", "python", false))? - .downcast::<PythonBridge>() - .map(|b| *b) - .map_err(|_| Error::runtime("Invalid client type for AI actions", "python", false)) -} - -/// Extract [`NerConfig`] from the action's JSON parameters. 
-fn parse_ner_config(params: &serde_json::Value) -> NerConfig { +/// Convert [`DetectNerParams`] into the internal [`NerConfig`]. +fn ner_config_from_params(params: &DetectNerParams) -> NerConfig { NerConfig { - entity_types: params - .get("entityTypes") - .and_then(|v| serde_json::from_value(v.clone()).ok()) - .unwrap_or_default(), - confidence_threshold: params - .get("confidenceThreshold") - .and_then(|v| v.as_f64()) - .unwrap_or(0.5), - temperature: params - .get("temperature") - .and_then(|v| v.as_f64()) - .unwrap_or(0.0), - api_key: params - .get("apiKey") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(), - model: params - .get("model") - .and_then(|v| v.as_str()) - .unwrap_or("gpt-4") - .to_string(), - provider: params - .get("provider") - .and_then(|v| v.as_str()) - .unwrap_or("openai") - .to_string(), + entity_types: params.entity_types.clone(), + confidence_threshold: params.confidence_threshold, + temperature: params.temperature, + api_key: params.api_key.clone(), + model: params.model.clone(), + provider: params.provider.clone(), } } diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index c130f8f..c3b8fca 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -2,8 +2,8 @@ //! //! This crate embeds a CPython interpreter via PyO3 and delegates named-entity //! recognition (NER) to a Python module (`nvisy_ai`). It exposes pipeline -//! [`Action`](nvisy_core::traits::action::Action) implementations as well as a -//! [`ProviderFactory`](nvisy_core::traits::provider::ProviderFactory) for the +//! [`Action`](nvisy_core::registry::action::Action) implementations as well as a +//! [`ProviderFactory`](nvisy_core::registry::provider::ProviderFactory) for the //! `"ai"` provider. #![deny(unsafe_code)] diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index eecf199..d8a7bdf 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -6,9 +6,9 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; -use nvisy_core::datatypes::entity::{Entity, EntityLocation}; +use nvisy_core::ontology::entity::{Entity, EntityLocation}; use nvisy_core::error::Error; -use nvisy_core::datatypes::entity::{DetectionMethod, EntityCategory}; +use nvisy_core::ontology::entity::{DetectionMethod, EntityCategory}; use crate::bridge::PythonBridge; use crate::error::from_pyerr; diff --git a/crates/nvisy-python/src/provider/mod.rs b/crates/nvisy-python/src/provider/mod.rs index 23be9cd..fdacca0 100644 --- a/crates/nvisy-python/src/provider/mod.rs +++ b/crates/nvisy-python/src/provider/mod.rs @@ -3,14 +3,21 @@ //! Registers itself as the `"ai"` provider and yields a [`PythonBridge`] //! instance upon connection. +use serde::Deserialize; + use nvisy_core::error::Error; -use nvisy_core::traits::provider::{ConnectedInstance, ProviderFactory}; +use nvisy_core::registry::provider::{ConnectedInstance, ProviderFactory}; use crate::bridge::PythonBridge; -/// Factory that creates [`PythonBridge`] instances from JSON credentials. -/// -/// Expected credential keys: -/// - `apiKey` (required) -- the API key forwarded to the AI model provider. +/// Typed credentials for the AI provider. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct AiCredentials { + /// API key forwarded to the AI model provider. + pub api_key: String, +} + +/// Factory that creates [`PythonBridge`] instances from typed credentials. 
/// /// The Python interpreter is **not** initialized at connection time; it is /// lazily loaded on the first NER call. @@ -18,26 +25,27 @@ pub struct AiProviderFactory; #[async_trait::async_trait] impl ProviderFactory for AiProviderFactory { + type Credentials = AiCredentials; + type Client = PythonBridge; + fn id(&self) -> &str { "ai" } - fn validate_credentials(&self, creds: &serde_json::Value) -> Result<(), Error> { - if creds.get("apiKey").and_then(|v| v.as_str()).is_none() { - return Err(Error::validation("Missing 'apiKey' in AI credentials", "ai")); - } + fn validate_credentials(&self, _creds: &Self::Credentials) -> Result<(), Error> { + // api_key is required by the struct, so if we got here it's present. Ok(()) } - async fn verify(&self, creds: &serde_json::Value) -> Result<(), Error> { + async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error> { self.validate_credentials(creds) } - async fn connect(&self, _creds: &serde_json::Value) -> Result<ConnectedInstance, Error> { + async fn connect(&self, _creds: &Self::Credentials) -> Result<ConnectedInstance<Self::Client>, Error> { let bridge = PythonBridge::default(); // Don't init here — Python might not be available at connect time // Init happens lazily when detect_ner is called Ok(ConnectedInstance { - client: Box::new(bridge), + client: bridge, disconnect: None, }) } diff --git a/crates/nvisy-server/src/handler/redact.rs b/crates/nvisy-server/src/handler/redact.rs index d5d57a7..0e44be6 100644 --- a/crates/nvisy-server/src/handler/redact.rs +++ b/crates/nvisy-server/src/handler/redact.rs @@ -5,7 +5,7 @@ use axum::{ Json, }; use std::sync::Arc; -use nvisy_core::datatypes::redaction_context::RedactionContext; +use nvisy_core::redaction::RedactionContext; use nvisy_engine::runs::RunManager; use crate::service::AppState; From 72dedf5ba18df2530f722d17a27ca9f7614c3a5d Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Wed, 11 Feb 2026 03:58:20 +0100 Subject: [PATCH 10/17] refactor: decompose nvisy-core into nvisy-ontology, nvisy-ingest, nvisy-media crates Extract ontology/redaction modules into nvisy-ontology, move Loader trait into nvisy-ingest, move StreamSource/StreamTarget into nvisy-object, add new nvisy-ingest and nvisy-media crates, expose fs/io/path modules in nvisy-core with new dependencies (jiff, hipstr, sha2, hex, strum), and add From<io::Error> impl plus InternalError/InvalidInput/Serialization error variants. 
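A minimal sketch of the error-handling addition, assuming String payloads for illustration (the actual variant shapes live in crates/nvisy-core/src/error.rs and may differ):

    pub enum Error {
        // ...existing variants elided...
        InternalError(String),
        InvalidInput(String),
        Serialization(String),
    }

    impl From<std::io::Error> for Error {
        fn from(err: std::io::Error) -> Self {
            // Assumed mapping: raw IO failures surface as internal errors.
            Error::InternalError(err.to_string())
        }
    }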
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- .github/dependabot.yml | 16 +- Cargo.lock | 3486 ++++++++++++----- Cargo.toml | 47 +- crates/nvisy-core/Cargo.toml | 22 +- crates/nvisy-core/src/datatypes/document.rs | 52 + crates/nvisy-core/src/error.rs | 13 + crates/nvisy-core/src/fs/content_file.rs | 637 +++ crates/nvisy-core/src/fs/content_handler.rs | 131 + crates/nvisy-core/src/fs/content_kind.rs | 132 + crates/nvisy-core/src/fs/content_metadata.rs | 184 + crates/nvisy-core/src/fs/content_registry.rs | 108 + crates/nvisy-core/src/fs/mod.rs | 42 + crates/nvisy-core/src/io/content.rs | 234 ++ crates/nvisy-core/src/io/content_data.rs | 651 +++ crates/nvisy-core/src/io/content_read.rs | 372 ++ crates/nvisy-core/src/io/content_write.rs | 372 ++ crates/nvisy-core/src/io/data_reference.rs | 141 + crates/nvisy-core/src/io/mod.rs | 26 + crates/nvisy-core/src/lib.rs | 5 +- crates/nvisy-core/src/path/mod.rs | 9 + crates/nvisy-core/src/path/source.rs | 287 ++ crates/nvisy-core/src/prelude.rs | 8 +- crates/nvisy-core/src/registry/loader.rs | 41 - crates/nvisy-core/src/registry/mod.rs | 5 - crates/nvisy-core/src/registry/stream.rs | 61 - crates/nvisy-detect/Cargo.toml | 2 + crates/nvisy-detect/README.md | 2 +- .../assets/dictionaries/first_names.txt | 50 + .../assets/dictionaries/last_names.txt | 50 + .../assets/dictionaries/medical_terms.txt | 50 + .../src/actions/apply_redaction.rs | 4 +- crates/nvisy-detect/src/actions/classify.rs | 4 +- .../src/actions/detect_checksum.rs | 2 +- .../src/actions/detect_dictionary.rs | 205 + .../nvisy-detect/src/actions/detect_manual.rs | 87 + .../nvisy-detect/src/actions/detect_regex.rs | 5 +- .../src/actions/detect_tabular.rs | 134 + crates/nvisy-detect/src/actions/emit_audit.rs | 4 +- .../src/actions/evaluate_policy.rs | 6 +- crates/nvisy-detect/src/actions/mod.rs | 6 + crates/nvisy-detect/src/dictionaries/mod.rs | 41 + crates/nvisy-detect/src/lib.rs | 9 +- crates/nvisy-detect/src/loaders/mod.rs | 12 - crates/nvisy-detect/src/patterns/mod.rs | 2 +- crates/nvisy-detect/src/prelude.rs | 6 +- crates/nvisy-ingest/Cargo.toml | 64 + crates/nvisy-ingest/README.md | 9 + crates/nvisy-ingest/src/lib.rs | 9 + .../nvisy-ingest/src/loaders/audio_loader.rs | 58 + .../src/loaders/csv_loader.rs | 2 +- .../nvisy-ingest/src/loaders/docx_loader.rs | 166 + .../nvisy-ingest/src/loaders/html_loader.rs | 105 + .../nvisy-ingest/src/loaders/image_loader.rs | 67 + .../src/loaders/json_loader.rs | 2 +- crates/nvisy-ingest/src/loaders/mod.rs | 72 + .../src/loaders/parquet_loader.rs | 131 + crates/nvisy-ingest/src/loaders/pdf_loader.rs | 168 + .../src/loaders/plaintext.rs | 2 +- .../nvisy-ingest/src/loaders/xlsx_loader.rs | 116 + crates/nvisy-ingest/src/prelude.rs | 19 + crates/nvisy-media/Cargo.toml | 49 + crates/nvisy-media/README.md | 7 + .../src/actions/apply_audio_redaction.rs | 54 + .../src/actions/apply_image_redaction.rs | 159 + .../src/actions/apply_pdf_redaction.rs | 153 + .../src/actions/apply_tabular_redaction.rs | 150 + crates/nvisy-media/src/actions/mod.rs | 10 + crates/nvisy-media/src/lib.rs | 11 + crates/nvisy-media/src/prelude.rs | 5 + crates/nvisy-media/src/render/block.rs | 36 + crates/nvisy-media/src/render/blur.rs | 43 + crates/nvisy-media/src/render/mod.rs | 6 + crates/nvisy-object/Cargo.toml | 8 +- crates/nvisy-object/src/prelude.rs | 1 + crates/nvisy-object/src/providers/s3.rs | 129 +- crates/nvisy-object/src/streams/mod.rs | 60 + crates/nvisy-object/src/streams/read.rs | 2 +- crates/nvisy-object/src/streams/write.rs | 2 +- 
crates/nvisy-ontology/Cargo.toml | 43 + crates/nvisy-ontology/README.md | 3 + crates/nvisy-ontology/src/lib.rs | 9 + .../src/ontology/audit.rs | 4 +- .../src/ontology/entity.rs | 18 +- .../src/ontology/mod.rs | 0 .../src/ontology/redaction.rs | 2 +- crates/nvisy-ontology/src/prelude.rs | 9 + .../src/redaction/context.rs | 41 +- .../src/redaction/mod.rs | 2 +- .../src/redaction/policy.rs | 2 +- crates/nvisy-python/Cargo.toml | 1 + crates/nvisy-python/src/actions/mod.rs | 8 +- crates/nvisy-python/src/actions/ocr.rs | 136 + crates/nvisy-python/src/lib.rs | 1 + crates/nvisy-python/src/ner/mod.rs | 7 +- crates/nvisy-python/src/ocr/mod.rs | 151 + crates/nvisy-python/src/prelude.rs | 1 + crates/nvisy-server/Cargo.toml | 13 +- crates/nvisy-server/src/handler/mod.rs | 2 +- crates/nvisy-server/src/handler/redact.rs | 218 +- crates/nvisy-server/src/service/mod.rs | 1 + crates/nvisy-server/src/service/pipeline.rs | 332 ++ 101 files changed, 9481 insertions(+), 1161 deletions(-) create mode 100644 crates/nvisy-core/src/fs/content_file.rs create mode 100644 crates/nvisy-core/src/fs/content_handler.rs create mode 100644 crates/nvisy-core/src/fs/content_kind.rs create mode 100644 crates/nvisy-core/src/fs/content_metadata.rs create mode 100644 crates/nvisy-core/src/fs/content_registry.rs create mode 100644 crates/nvisy-core/src/fs/mod.rs create mode 100644 crates/nvisy-core/src/io/content.rs create mode 100644 crates/nvisy-core/src/io/content_data.rs create mode 100644 crates/nvisy-core/src/io/content_read.rs create mode 100644 crates/nvisy-core/src/io/content_write.rs create mode 100644 crates/nvisy-core/src/io/data_reference.rs create mode 100644 crates/nvisy-core/src/io/mod.rs create mode 100644 crates/nvisy-core/src/path/mod.rs create mode 100644 crates/nvisy-core/src/path/source.rs delete mode 100644 crates/nvisy-core/src/registry/loader.rs delete mode 100644 crates/nvisy-core/src/registry/stream.rs create mode 100644 crates/nvisy-detect/assets/dictionaries/first_names.txt create mode 100644 crates/nvisy-detect/assets/dictionaries/last_names.txt create mode 100644 crates/nvisy-detect/assets/dictionaries/medical_terms.txt create mode 100644 crates/nvisy-detect/src/actions/detect_dictionary.rs create mode 100644 crates/nvisy-detect/src/actions/detect_manual.rs create mode 100644 crates/nvisy-detect/src/actions/detect_tabular.rs create mode 100644 crates/nvisy-detect/src/dictionaries/mod.rs delete mode 100644 crates/nvisy-detect/src/loaders/mod.rs create mode 100644 crates/nvisy-ingest/Cargo.toml create mode 100644 crates/nvisy-ingest/README.md create mode 100644 crates/nvisy-ingest/src/lib.rs create mode 100644 crates/nvisy-ingest/src/loaders/audio_loader.rs rename crates/{nvisy-detect => nvisy-ingest}/src/loaders/csv_loader.rs (95%) create mode 100644 crates/nvisy-ingest/src/loaders/docx_loader.rs create mode 100644 crates/nvisy-ingest/src/loaders/html_loader.rs create mode 100644 crates/nvisy-ingest/src/loaders/image_loader.rs rename crates/{nvisy-detect => nvisy-ingest}/src/loaders/json_loader.rs (96%) create mode 100644 crates/nvisy-ingest/src/loaders/mod.rs create mode 100644 crates/nvisy-ingest/src/loaders/parquet_loader.rs create mode 100644 crates/nvisy-ingest/src/loaders/pdf_loader.rs rename crates/{nvisy-detect => nvisy-ingest}/src/loaders/plaintext.rs (95%) create mode 100644 crates/nvisy-ingest/src/loaders/xlsx_loader.rs create mode 100644 crates/nvisy-ingest/src/prelude.rs create mode 100644 crates/nvisy-media/Cargo.toml create mode 100644 crates/nvisy-media/README.md create mode 100644 
crates/nvisy-media/src/actions/apply_audio_redaction.rs create mode 100644 crates/nvisy-media/src/actions/apply_image_redaction.rs create mode 100644 crates/nvisy-media/src/actions/apply_pdf_redaction.rs create mode 100644 crates/nvisy-media/src/actions/apply_tabular_redaction.rs create mode 100644 crates/nvisy-media/src/actions/mod.rs create mode 100644 crates/nvisy-media/src/lib.rs create mode 100644 crates/nvisy-media/src/prelude.rs create mode 100644 crates/nvisy-media/src/render/block.rs create mode 100644 crates/nvisy-media/src/render/blur.rs create mode 100644 crates/nvisy-media/src/render/mod.rs create mode 100644 crates/nvisy-ontology/Cargo.toml create mode 100644 crates/nvisy-ontology/README.md create mode 100644 crates/nvisy-ontology/src/lib.rs rename crates/{nvisy-core => nvisy-ontology}/src/ontology/audit.rs (98%) rename crates/{nvisy-core => nvisy-ontology}/src/ontology/entity.rs (87%) rename crates/{nvisy-core => nvisy-ontology}/src/ontology/mod.rs (100%) rename crates/{nvisy-core => nvisy-ontology}/src/ontology/redaction.rs (98%) create mode 100644 crates/nvisy-ontology/src/prelude.rs rename crates/{nvisy-core => nvisy-ontology}/src/redaction/context.rs (74%) rename crates/{nvisy-core => nvisy-ontology}/src/redaction/mod.rs (59%) rename crates/{nvisy-core => nvisy-ontology}/src/redaction/policy.rs (99%) create mode 100644 crates/nvisy-python/src/actions/ocr.rs create mode 100644 crates/nvisy-python/src/ocr/mod.rs create mode 100644 crates/nvisy-server/src/service/pipeline.rs diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ea615b7..f68793c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -11,7 +11,7 @@ updates: labels: - "chore" commit-message: - prefix: "chore(deps)" + prefix: "chore(deps-rs)" groups: rust-dependencies: patterns: @@ -32,6 +32,13 @@ updates: - "chore" commit-message: prefix: "chore(deps-py)" + groups: + rust-dependencies: + patterns: + - "*" + update-types: + - "minor" + - "patch" - package-ecosystem: "pip" directory: "/packages/nvisy-exif" @@ -45,6 +52,13 @@ updates: - "chore" commit-message: prefix: "chore(deps-py)" + groups: + rust-dependencies: + patterns: + - "*" + update-types: + - "minor" + - "patch" - package-ecosystem: "github-actions" directory: "/" diff --git a/Cargo.lock b/Cargo.lock index 129c0b9..f2f931a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,22 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "ab_glyph" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01c0457472c38ea5bd1c3b5ada5e368271cb550be7a4ca4a0b4634e9913f6cc2" +dependencies = [ + "ab_glyph_rasterizer", + "owned_ttf_parser", +] + +[[package]] +name = "ab_glyph_rasterizer" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618" + [[package]] name = "addr2line" version = "0.25.1" @@ -17,6 +33,40 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "adobe-cmap-parser" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" +dependencies = [ + "pom", +] + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -27,10 +77,22 @@ dependencies = [ ] [[package]] -name = "allocator-api2" -version = "0.2.21" +name = "aligned" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] [[package]] name = "android_system_properties" @@ -42,476 +104,368 @@ dependencies = [ ] [[package]] -name = "anyhow" -version = "1.0.101" +name = "anstream" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ - "backtrace", + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", ] [[package]] -name = "async-trait" -version = "0.1.89" +name = "anstyle" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ - "proc-macro2", - "quote", - "syn", + "utf8parse", ] [[package]] -name = "atomic-waker" -version = "1.1.2" +name = "anstyle-query" +version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] [[package]] -name = "autocfg" -version = "1.5.0" +name = "anstyle-wincon" +version = "3.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] [[package]] -name = "aws-config" -version = "1.8.13" +name = "anyhow" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c456581cb3c77fafcc8c67204a70680d40b61112d6da78c77bd31d945b65f1b5" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sdk-sso", - "aws-sdk-ssooidc", - "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "hex", - "http 1.4.0", - "ring", - "time", - "tokio", - "tracing", - "url", - "zeroize", + "backtrace", ] [[package]] -name = "aws-credential-types" -version = "1.2.11" +name = "approx" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cd362783681b15d136480ad555a099e82ecd8e2d10a841e14dfd0078d67fee3" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "zeroize", + "num-traits", ] [[package]] -name = "aws-lc-rs" -version = "1.15.4" +name = "arbitrary" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b7b6141e96a8c160799cc2d5adecd5cbbe5054cb8c7c4af53da0f83bb7ad256" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" dependencies = [ - "aws-lc-sys", - "zeroize", + "derive_arbitrary", ] [[package]] -name = "aws-lc-sys" -version = "0.37.0" +name = "arg_enum_proc_macro" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c34dda4df7017c8db52132f0f8a2e0f8161649d15723ed63fc00c82d0f2081a" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ - "cc", - "cmake", - "dunce", - "fs_extra", + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] -name = "aws-runtime" -version = "1.6.0" +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c635c2dc792cb4a11ce1a4f392a925340d1bdf499289b5ec1ec6810954eb43f5" +checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" dependencies = [ - "aws-credential-types", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "percent-encoding", - "pin-project-lite", - "tracing", - "uuid", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + 
"arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] -name = "aws-sdk-s3" -version = "1.122.0" +name = "arrow-arith" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94c2ca0cba97e8e279eb6c0b2d0aa10db5959000e602ab2b7c02de6b85d4c19b" +checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-checksums", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "bytes", - "fastrand", - "hex", - "hmac", - "http 0.2.12", - "http 1.4.0", - "http-body 1.0.1", - "lru", - "percent-encoding", - "regex-lite", - "sha2", - "tracing", - "url", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", ] [[package]] -name = "aws-sdk-sso" -version = "1.93.0" +name = "arrow-array" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dcb38bb33fc0a11f1ffc3e3e85669e0a11a37690b86f77e75306d8f369146a0" +checksum = "a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.15.5", + "num", ] [[package]] -name = "aws-sdk-ssooidc" -version = "1.95.0" +name = "arrow-buffer" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ada8ffbea7bd1be1f53df1dadb0f8fdb04badb13185b3321b929d1ee3caad09" +checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", + "half", + "num", ] [[package]] -name = "aws-sdk-sts" -version = "1.97.0" +name = "arrow-cast" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6443ccadc777095d5ed13e21f5c364878c9f5bad4e35187a6cdbd863b0afcad" +checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-observability", - "aws-smithy-query", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "regex-lite", - "tracing", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", ] [[package]] -name = "aws-sigv4" -version = "1.3.8" +name = "arrow-data" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efa49f3c607b92daae0c078d48a4571f599f966dce3caee5f1ea55c4d9073f99" +checksum = 
"61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" dependencies = [ - "aws-credential-types", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "crypto-bigint 0.5.5", - "form_urlencoded", - "hex", - "hmac", - "http 0.2.12", - "http 1.4.0", - "p256", - "percent-encoding", - "ring", - "sha2", - "subtle", - "time", - "tracing", - "zeroize", + "arrow-buffer", + "arrow-schema", + "half", + "num", ] [[package]] -name = "aws-smithy-async" -version = "1.2.11" +name = "arrow-ipc" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52eec3db979d18cb807fc1070961cc51d87d069abe9ab57917769687368a8c6c" +checksum = "62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", ] [[package]] -name = "aws-smithy-checksums" -version = "0.64.3" +name = "arrow-ord" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddcf418858f9f3edd228acb8759d77394fed7531cce78d02bdda499025368439" +checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "bytes", - "crc-fast", - "hex", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "md-5", - "pin-project-lite", - "sha1", - "sha2", - "tracing", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] -name = "aws-smithy-eventstream" -version = "0.60.18" +name = "arrow-row" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35b9c7354a3b13c66f60fe4616d6d1969c9fd36b1b5333a5dfb3ee716b33c588" +checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" dependencies = [ - "aws-smithy-types", - "bytes", - "crc32fast", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", ] [[package]] -name = "aws-smithy-http" -version = "0.63.3" +name = "arrow-schema" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" + +[[package]] +name = "arrow-select" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630e67f2a31094ffa51b210ae030855cb8f3b7ee1329bdd8d085aaf61e8b97fc" +checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" dependencies = [ - "aws-smithy-eventstream", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "bytes-utils", - "futures-core", - "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "http-body-util", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", ] [[package]] -name = "aws-smithy-http-client" -version = "1.1.9" +name = "arrow-string" +version = "54.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12fb0abf49ff0cab20fd31ac1215ed7ce0ea92286ba09e2854b42ba5cabe7525" -dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "h2 0.3.27", - "h2 0.4.13", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "hyper 0.14.32", - "hyper 1.8.1", - "hyper-rustls 0.24.2", - "hyper-rustls 0.27.7", - "hyper-util", - "pin-project-lite", - "rustls 0.21.12", - "rustls 0.23.36", 
- "rustls-native-certs", - "rustls-pki-types", - "tokio", - "tokio-rustls 0.26.4", - "tower", - "tracing", +checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", ] [[package]] -name = "aws-smithy-json" -version = "0.62.3" +name = "as-slice" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb96aa208d62ee94104645f7b2ecaf77bf27edf161590b6224bfbac2832f979" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" dependencies = [ - "aws-smithy-types", + "stable_deref_trait", ] [[package]] -name = "aws-smithy-observability" -version = "0.2.4" +name = "async-recursion" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0a46543fbc94621080b3cf553eb4cbbdc41dd9780a30c4756400f0139440a1d" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ - "aws-smithy-runtime-api", + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] -name = "aws-smithy-query" -version = "0.60.13" +name = "async-trait" +version = "0.1.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cebbddb6f3a5bd81553643e9c7daf3cc3dc5b0b5f398ac668630e8a84e6fff0" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ - "aws-smithy-types", - "urlencoding", + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] -name = "aws-smithy-runtime" -version = "1.10.0" +name = "atoi" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3df87c14f0127a0d77eb261c3bc45d5b4833e2a1f63583ebfb728e4852134ee" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-client", - "aws-smithy-observability", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "fastrand", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "http-body-util", - "pin-project-lite", - "pin-utils", - "tokio", - "tracing", + "num-traits", ] [[package]] -name = "aws-smithy-runtime-api" -version = "1.11.3" +name = "atoi_simd" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49952c52f7eebb72ce2a754d3866cc0f87b97d2a46146b79f80f3a93fb2b3716" +checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e" dependencies = [ - "aws-smithy-async", - "aws-smithy-types", - "bytes", - "http 0.2.12", - "http 1.4.0", - "pin-project-lite", - "tokio", - "tracing", - "zeroize", + "debug_unsafe", ] [[package]] -name = "aws-smithy-types" -version = "1.4.3" +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "av-scenechange" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3a26048eeab0ddeba4b4f9d51654c79af8c3b32357dc5f336cee85ab331c33" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" dependencies = [ - "base64-simd", - "bytes", - 
"bytes-utils", - "futures-core", - "http 0.2.12", - "http 1.4.0", - "http-body 0.4.6", - "http-body 1.0.1", - "http-body-util", - "itoa", - "num-integer", - "pin-project-lite", - "pin-utils", - "ryu", - "serde", - "time", - "tokio", - "tokio-util", + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + "pastey", + "rayon", + "thiserror", + "v_frame", + "y4m", ] [[package]] -name = "aws-smithy-xml" -version = "0.60.13" +name = "av1-grain" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11b2f670422ff42bf7065031e72b45bc52a3508bd089f743ea90731ca2b6ea57" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" dependencies = [ - "xmlparser", + "anyhow", + "arrayvec", + "log", + "nom 8.0.0", + "num-rational", + "v_frame", ] [[package]] -name = "aws-types" -version = "1.3.11" +name = "avif-serialize" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d980627d2dd7bfc32a3c025685a033eeab8d365cc840c631ef59d1b8f428164" +checksum = "47c8fbc0f831f4519fe8b810b6a7a91410ec83031b8233f730a0480029f6a23f" dependencies = [ - "aws-credential-types", - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "rustc_version", - "tracing", + "arrayvec", ] [[package]] @@ -525,15 +479,16 @@ dependencies = [ "bytes", "form_urlencoded", "futures-util", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "itoa", "matchit", "memchr", "mime", + "multer", "percent-encoding", "pin-project-lite", "serde_core", @@ -556,8 +511,8 @@ checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" dependencies = [ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", "mime", "pin-project-lite", @@ -575,7 +530,7 @@ checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -593,12 +548,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "base16ct" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" - [[package]] name = "base64" version = "0.22.1" @@ -606,20 +555,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] -name = "base64-simd" -version = "0.8.0" +name = "bit_field" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" -dependencies = [ - "outref", - "vsimd", -] +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" [[package]] -name = "base64ct" -version = "1.8.3" +name = "bitflags" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" @@ -627,6 +572,15 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "bitstream-io" +version = "4.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "60d4bd9d1db2c6bdf285e223a7fa369d5ce98ec767dec949c6ca62863ce61757" +dependencies = [ + "core2", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -636,18 +590,36 @@ dependencies = [ "generic-array", ] +[[package]] +name = "built" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64" + [[package]] name = "bumpalo" version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.11.1" @@ -658,25 +630,51 @@ dependencies = [ ] [[package]] -name = "bytes-utils" -version = "0.1.4" +name = "bzip2" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ - "bytes", - "either", + "bzip2-sys", ] [[package]] -name = "cc" -version = "1.2.55" +name = "bzip2-sys" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ - "find-msvc-tools", - "jobserver", - "libc", - "shlex", + "cc", + "pkg-config", +] + +[[package]] +name = "calamine" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96ae094b353c7810cd5efd2e69413ebb9354816138a387c09f7b90d4e826a49f" +dependencies = [ + "atoi_simd", + "byteorder", + "codepage", + "encoding_rs", + "fast-float2", + "log", + "quick-xml 0.38.4", + "serde", + "zip 7.4.0", +] + +[[package]] +name = "cc" +version = "1.2.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", ] [[package]] @@ -711,25 +709,67 @@ dependencies = [ ] [[package]] -name = "cmake" -version = "0.1.57" +name = "cipher" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" dependencies = [ - "cc", + "crypto-common", + "inout", +] + +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" 
+ +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", ] [[package]] -name = "const-oid" -version = "0.9.6" +name = "constant_time_eq" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" -version = "0.10.1" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ "core-foundation-sys", "libc", @@ -741,6 +781,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "core2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" +dependencies = [ + "memchr", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -765,18 +814,6 @@ version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" -[[package]] -name = "crc-fast" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" -dependencies = [ - "crc", - "digest", - "rustversion", - "spin", -] - [[package]] name = "crc32fast" version = "1.5.0" @@ -787,27 +824,36 @@ dependencies = [ ] [[package]] -name = "crypto-bigint" -version = "0.4.9" +name = "crossbeam-deque" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ - "generic-array", - "rand_core 0.6.4", - "subtle", - "zeroize", + "crossbeam-epoch", + "crossbeam-utils", ] [[package]] -name = "crypto-bigint" -version = "0.5.5" +name = "crossbeam-epoch" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "rand_core 0.6.4", - "subtle", + "crossbeam-utils", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.7" @@ -819,15 +865,54 @@ dependencies = [ ] [[package]] -name = "der" +name = "cssparser" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf", + "smallvec", +] + +[[package]] +name = "cssparser-macros" version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ - "const-oid", - "zeroize", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", ] +[[package]] +name = "debug_unsafe" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eed2c4702fa172d1ce21078faa7c5203e69f5394d48cc436d25928394a867a2" + +[[package]] +name = "deflate64" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26bf8fc351c5ed29b5c2f0cbbac1b209b74f60ecd62e675a998df72c49af5204" + [[package]] name = "deranged" version = "0.5.5" @@ -837,6 +922,39 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "derive_more" version = "1.0.0" @@ -854,7 +972,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", "unicode-xid", ] @@ -877,14 +995,23 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] -name = "dunce" -version = "1.0.5" +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] [[package]] name = "dyn-clone" @@ -893,16 +1020,10 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] -name = "ecdsa" -version = "0.14.8" +name = "ego-tree" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" -dependencies = [ - "der", - "elliptic-curve", - "rfc6979", - "signature", -] +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" [[package]] name = "either" @@ -911,23 +1032,55 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] -name = "elliptic-curve" -version = "0.12.3" +name = "encoding_rs" +version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ - "base16ct", - "crypto-bigint 0.4.9", - "der", - "digest", - "ff", - "generic-array", - "group", - "pkcs8", - "rand_core 0.6.4", - "sec1", - "subtle", - "zeroize", + "cfg-if", +] + +[[package]] +name = "env_filter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] @@ -946,6 +1099,36 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + [[package]] name = "fastrand" version = "2.3.0" @@ -953,13 +1136,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] -name = "ff" -version = "0.12.1" +name = "fax" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +checksum = 
"f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab" dependencies = [ - "rand_core 0.6.4", - "subtle", + "fax_derive", +] + +[[package]] +name = "fax_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", ] [[package]] @@ -974,6 +1176,27 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + [[package]] name = "fnv" version = "1.0.7" @@ -987,10 +1210,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] -name = "foldhash" -version = "0.2.0" +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" [[package]] name = "form_urlencoded" @@ -1002,10 +1234,29 @@ dependencies = [ ] [[package]] -name = "fs_extra" -version = "1.3.0" +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "futures" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] [[package]] name = "futures-channel" @@ -1014,6 +1265,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -1023,32 +1275,75 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] -name = "futures-sink" +name = "futures-executor" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] [[package]] -name = "futures-task" +name = "futures-io" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] -name = "futures-util" +name = "futures-macro" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ - "futures-core", - "futures-task", - "pin-project-lite", - "pin-utils", + "proc-macro2", + "quote", + "syn 2.0.114", ] [[package]] -name = "generic-array" -version = "0.14.7" +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "generic-array" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ @@ -1056,6 +1351,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -1063,8 +1367,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -1074,46 +1380,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasip2", + "wasm-bindgen", ] [[package]] -name = "gimli" -version = "0.32.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" - -[[package]] -name = "group" -version = "0.12.1" +name = "gif" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +checksum = 
"f5df2ba84018d80c213569363bdcd0c64e6933c67fe4c1d60ecf822971a3c35e" dependencies = [ - "ff", - "rand_core 0.6.4", - "subtle", + "color_quant", + "weezl", ] [[package]] -name = "h2" -version = "0.3.27" +name = "gimli" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" [[package]] name = "h2" @@ -1126,7 +1414,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.4.0", + "http", "indexmap", "slab", "tokio", @@ -1134,13 +1422,31 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash 0.1.5", + "foldhash", ] [[package]] @@ -1148,11 +1454,6 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash 0.2.0", -] [[package]] name = "hdrhistogram" @@ -1176,6 +1477,17 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hipstr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97971ffc85d4c98de12e2608e992a43f5294ebb625fdb045b27c731b64c4c6d6" +dependencies = [ + "serde", + "serde_bytes", + "sptr", +] + [[package]] name = "hmac" version = "0.12.1" @@ -1186,14 +1498,15 @@ dependencies = [ ] [[package]] -name = "http" -version = "0.2.12" +name = "html5ever" +version = "0.29.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +checksum = "3b7410cae13cbc75623c98ac4cbfd1f0bedddf3227afc24f370cf0f50a44a11c" dependencies = [ - "bytes", - "fnv", - "itoa", + "log", + "mac", + "markup5ever", + "match_token", ] [[package]] @@ -1206,17 +1519,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - [[package]] name = "http-body" version = "1.0.1" @@ -1224,7 +1526,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.4.0", + "http", ] [[package]] @@ -1235,8 +1537,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = 
[ "bytes", "futures-core", - "http 1.4.0", - "http-body 1.0.1", + "http", + "http-body", "pin-project-lite", ] @@ -1252,30 +1554,6 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.27", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2 0.5.10", - "tokio", - "tower-service", - "tracing", - "want", -] - [[package]] name = "hyper" version = "1.8.1" @@ -1286,9 +1564,9 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2 0.4.13", - "http 1.4.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "httparse", "httpdate", "itoa", @@ -1300,34 +1578,18 @@ dependencies = [ ] [[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "log", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] - -[[package]] -name = "hyper-rustls" -version = "0.27.7" +name = "hyper-tls" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ - "http 1.4.0", - "hyper 1.8.1", + "bytes", + "http-body-util", + "hyper", "hyper-util", - "rustls 0.23.36", - "rustls-native-certs", - "rustls-pki-types", + "native-tls", "tokio", - "tokio-rustls 0.26.4", + "tokio-native-tls", "tower-service", ] @@ -1341,14 +1603,14 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.4.0", - "http-body 1.0.1", - "hyper 1.8.1", + "http", + "http-body", + "hyper", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2", "tokio", "tower-service", "tracing", @@ -1480,6 +1742,64 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "image" +version = "0.25.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core 0.5.1", + "zune-jpeg 0.5.12", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imageproc" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2393fb7808960751a52e8a154f67e7dd3f8a2ef9bd80d1553078a7b4e8ed3f0d" +dependencies = [ + "ab_glyph", + "approx", + "getrandom 0.2.17", + "image", + "itertools 0.12.1", + "nalgebra", + "num", + "rand 0.8.5", + "rand_distr", + "rayon", +] + +[[package]] +name = "imgref" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8" + [[package]] name = "indexmap" version = "2.13.0" @@ -1510,18 +1830,119 @@ dependencies = [ "cfb", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "ipnet" version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +[[package]] +name = "jiff" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89a5b5e10d5a9ad6e5d1f4bd58225f655d6fe9767575a5e8ac5a6fe64e04495" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff7a39c8862fc1369215ccf0a8f12dd4598c7f6484704359f0351bd617034dbf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -1549,46 +1970,232 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] -name = "libc" -version = "0.2.181" +name = "lebe" +version = "0.5.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" [[package]] -name = "litemap" -version = "0.8.1" +name = "lexical-core" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] [[package]] -name = "log" -version = "0.4.29" +name = "lexical-parse-float" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] [[package]] -name = "lru" -version = "0.16.3" +name = "lexical-parse-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ - "hashbrown 0.16.1", + "lexical-util", ] [[package]] -name = "matchers" -version = "0.2.0" +name = "lexical-util" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ - "regex-automata", + "lexical-util", + "lexical-write-integer", ] [[package]] -name = "matchit" -version = "0.8.4" +name = "lexical-write-integer" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.181" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "459427e2af2b9c839b132acb702a1c654d95e10f8c326bfc2ad11310e458b1c5" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5037190e1f70cbeef565bd267599242926f724d3b8a9f510fd7e0b540cfa4404" +dependencies = [ + "arbitrary", + "cc", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "indexmap", + "itoa", + "log", + "md-5", + "nom 7.1.3", + "rangemap", + "rayon", + "time", + "weezl", +] + +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7a7213d12e1864c0f002f52c2923d4556935a43dec5e71355c2760e0f6e7a18" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "match_token" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a9689d8d44bf9964484516275f5cd4c9b59457a6940c1d5d0ecbb94510a36b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] [[package]] name = "md-5" @@ -1600,6 +2207,12 @@ dependencies = [ "digest", ] +[[package]] +name = "md5" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" + [[package]] name = "memchr" version = "2.8.0" @@ -1621,6 +2234,52 @@ version = "0.3.17" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "minio" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3824101357fa899d01c729e4a245776e20a03f2f6645979e86b9d3d5d9c42741" +dependencies = [ + "async-recursion", + "async-trait", + "base64", + "byteorder", + "bytes", + "chrono", + "crc", + "dashmap", + "derivative", + "env_logger", + "futures", + "futures-util", + "hex", + "hmac", + "http", + "hyper", + "lazy_static", + "log", + "md5", + "multimap", + "percent-encoding", + "rand 0.8.5", + "regex", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "tokio-stream", + "tokio-util", + "urlencoding", + "xmltree", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1628,6 +2287,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", + "simd-adler32", ] [[package]] @@ -1641,6 +2301,105 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "moxcms" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http", + "httparse", + "memchr", + "mime", + "spin", + "version_check", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +dependencies = [ + "serde", +] + +[[package]] +name = "nalgebra" +version = "0.32.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" +dependencies = [ + "approx", + "matrixmultiply", + "num-complex", + "num-rational", + "num-traits", + "simba", + "typenum", +] + +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = 
"noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1650,12 +2409,56 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -1665,6 +2468,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1672,6 +2497,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1682,11 +2508,17 @@ dependencies = [ "async-trait", "bytes", "chrono", - "derive_more", + "derive_more 1.0.0", + "hex", + "hipstr", "infer", + "jiff", "schemars", "serde", "serde_json", + "sha2", + "strum", + "tempfile", "thiserror", "tokio", "tracing", @@ -1697,8 +2529,10 @@ dependencies = [ name = "nvisy-detect" version = "0.1.0" dependencies = [ + "aho-corasick", "async-trait", "nvisy-core", + "nvisy-ontology", "regex", "serde", "serde_json", @@ -1715,13 +2549,55 @@ dependencies = [ "chrono", "nvisy-core", "petgraph", - "rand", + "rand 0.9.2", "schemars", "serde", "serde_json", - "thiserror", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "uuid", +] + +[[package]] +name = "nvisy-ingest" +version = "0.1.0" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "calamine", + "image", + "infer", + "lopdf", + "nvisy-core", + "parquet", + "pdf-extract", + "quick-xml 0.37.5", + "scraper", + "serde", + "serde_json", + "tokio", + "tracing", + "uuid", + "zip 2.4.2", +] + +[[package]] +name = "nvisy-media" +version = "0.1.0" +dependencies = [ + 
"async-trait", + "bytes", + "image", + "imageproc", + "lopdf", + "nvisy-core", + "nvisy-ontology", + "serde", + "serde_json", "tokio", - "tokio-util", "tracing", "uuid", ] @@ -1731,9 +2607,9 @@ name = "nvisy-object" version = "0.1.0" dependencies = [ "async-trait", - "aws-config", - "aws-sdk-s3", "bytes", + "futures", + "minio", "nvisy-core", "serde", "serde_json", @@ -1743,12 +2619,26 @@ dependencies = [ "uuid", ] +[[package]] +name = "nvisy-ontology" +version = "0.1.0" +dependencies = [ + "chrono", + "derive_more 1.0.0", + "nvisy-core", + "schemars", + "serde", + "serde_json", + "uuid", +] + [[package]] name = "nvisy-python" version = "0.1.0" dependencies = [ "async-trait", "nvisy-core", + "nvisy-ontology", "pyo3", "serde", "serde_json", @@ -1764,9 +2654,16 @@ version = "0.1.0" dependencies = [ "anyhow", "axum", + "base64", + "bytes", "chrono", "nvisy-core", + "nvisy-detect", "nvisy-engine", + "nvisy-ingest", + "nvisy-media", + "nvisy-ontology", + "nvisy-python", "schemars", "serde", "serde_json", @@ -1796,27 +2693,159 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "openssl-probe" -version = "0.2.1" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] -name = "outref" -version = "0.5.2" +name = "openssl-sys" +version = "0.9.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] [[package]] -name = "p256" -version = "0.11.1" +name = "ordered-float" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ - "ecdsa", - "elliptic-curve", - "sha2", + "num-traits", +] + +[[package]] +name = "owned_ttf_parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36820e9051aca1014ddc75770aab4d68bc1e9e632f0f5627c4086bc216fb583b" +dependencies = [ + "ttf-parser", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + 
"lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "parquet" +version = "54.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "bytes", + "chrono", + "half", + "hashbrown 0.15.5", + "num", + "num-bigint", + "paste", + "seq-macro", + "thrift", + "twox-hash", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + +[[package]] +name = "pdf-extract" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" +dependencies = [ + "adobe-cmap-parser", + "encoding_rs", + "euclid", + "lopdf", + "postscript", + "type1-encoding-parser", + "unicode-normalization", ] [[package]] @@ -1837,6 +2866,58 @@ dependencies = [ "serde", ] +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" +dependencies = [ + "phf_shared", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = "0.2.16" @@ -1850,21 +2931,51 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] -name = "pkcs8" -version = "0.9.0" +name = "pkg-config" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "png" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +checksum = "97baced388464909d42d89643fe4361939af9b7ce7a31ee32a168f832a70f2a0" dependencies = [ - "der", - "spki", + "bitflags 2.10.0", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", ] +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + [[package]] name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-atomic-util" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "potential_utf" version = "0.1.4" @@ -1889,6 +3000,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.106" @@ -1898,6 +3015,34 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "profiling" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" +dependencies = [ + "quote", + "syn 2.0.114", +] + +[[package]] +name = "pxfm" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7186d3822593aa4393561d186d1393b3923e9d6163d3fbfd6e825e3e6cf3e6a8" +dependencies = [ + "num-traits", +] + [[package]] name = "pyo3" version = "0.23.5" @@ -1946,7 +3091,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -1955,64 +3100,220 @@ version = "0.23.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" dependencies = [ - "heck", - "proc-macro2", - "pyo3-build-config", - "quote", - "syn", + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "encoding_rs", + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", ] [[package]] -name = "quote" -version = "1.0.44" +name = "rand_distr" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" dependencies = [ - "proc-macro2", + "num-traits", + "rand 0.8.5", ] [[package]] -name = "r-efi" -version = "5.3.0" +name = "rangemap" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" [[package]] -name = "rand" -version = "0.9.2" +name = "rav1e" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" dependencies = [ - "rand_chacha", - "rand_core 0.9.5", + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + 
"built", + "cfg-if", + "interpolate_name", + "itertools 0.14.0", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand 0.9.2", + "rand_chacha 0.9.0", + "simd_helpers", + "thiserror", + "v_frame", + "wasm-bindgen", ] [[package]] -name = "rand_chacha" -version = "0.9.0" +name = "ravif" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +checksum = "ef69c1990ceef18a116855938e74793a5f7496ee907562bd0857b6ac734ab285" dependencies = [ - "ppv-lite86", - "rand_core 0.9.5", + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", ] [[package]] -name = "rand_core" -version = "0.6.4" +name = "rawpointer" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ - "getrandom 0.2.17", + "either", + "rayon-core", ] [[package]] -name = "rand_core" -version = "0.9.5" +name = "rayon-core" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ - "getrandom 0.3.4", + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.10.0", ] [[package]] @@ -2032,7 +3333,7 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2058,12 +3359,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "regex-lite" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" - [[package]] name = "regex-syntax" version = "0.8.9" @@ -2071,29 +3366,49 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] -name = "rfc6979" -version = "0.3.1" +name = "reqwest" +version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "crypto-bigint 0.4.9", - "hmac", - "zeroize", + "base64", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", ] 
[[package]] -name = "ring" -version = "0.17.14" +name = "rgb" +version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" -dependencies = [ - "cc", - "cfg-if", - "getrandom 0.2.17", - "libc", - "untrusted", - "windows-sys 0.52.0", -] +checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" [[package]] name = "rustc-demangle" @@ -2111,41 +3426,16 @@ dependencies = [ ] [[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - -[[package]] -name = "rustls" -version = "0.23.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" -dependencies = [ - "aws-lc-rs", - "once_cell", - "rustls-pki-types", - "rustls-webpki 0.103.9", - "subtle", - "zeroize", -] - -[[package]] -name = "rustls-native-certs" -version = "0.8.3" +name = "rustix" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ - "openssl-probe", - "rustls-pki-types", - "schannel", - "security-framework", + "bitflags 2.10.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", ] [[package]] @@ -2157,28 +3447,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-webpki" -version = "0.101.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "rustls-webpki" -version = "0.103.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" -dependencies = [ - "aws-lc-rs", - "ring", - "rustls-pki-types", - "untrusted", -] - [[package]] name = "rustversion" version = "1.0.22" @@ -2191,6 +3459,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safe_arch" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" +dependencies = [ + "bytemuck", +] + [[package]] name = "schannel" version = "0.1.28" @@ -2225,40 +3502,37 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn", + "syn 2.0.114", ] [[package]] -name = "sct" -version = "0.7.1" +name = "scopeguard" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] -name = "sec1" -version = "0.3.0" +name = "scraper" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +checksum = "cc3d051b884f40e309de6c149734eab57aa8cc1347992710dc80bcc1c2194c15" dependencies = [ - "base16ct", - "der", - 
"generic-array", - "pkcs8", - "subtle", - "zeroize", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "precomputed-hash", + "selectors", + "tendril", ] [[package]] name = "security-framework" -version = "3.5.1" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags", + "bitflags 2.10.0", "core-foundation", "core-foundation-sys", "libc", @@ -2275,12 +3549,37 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" +dependencies = [ + "bitflags 2.10.0", + "cssparser", + "derive_more 0.99.20", + "fxhash", + "log", + "new_debug_unreachable", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "semver" version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + [[package]] name = "serde" version = "1.0.228" @@ -2291,6 +3590,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5d440709e79d88e51ac01c4b72fc6cb7314017bb7da9eeff678aa94c10e3ea8" +dependencies = [ + "serde", + "serde_core", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -2308,7 +3617,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2319,7 +3628,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2358,6 +3667,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha1" version = "0.10.6" @@ -2406,15 +3724,39 @@ dependencies = [ ] [[package]] -name = "signature" -version = "1.6.4" +name = "simba" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" dependencies = [ - "digest", - "rand_core 0.6.4", + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", ] +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + [[package]] name = "slab" version = "0.4.12" @@ -2427,16 +3769,6 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" -[[package]] -name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - [[package]] name = "socket2" version = "0.6.2" @@ -2449,19 +3781,15 @@ dependencies = [ [[package]] name = "spin" -version = "0.10.0" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] -name = "spki" -version = "0.6.0" +name = "sptr" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" -dependencies = [ - "base64ct", - "der", -] +checksum = "3b9b39299b249ad65f3b7e96443bad61c02ca5cd3589f46cb6d610a0fd6c0d6a" [[package]] name = "stable_deref_trait" @@ -2469,12 +3797,76 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.114", +] + [[package]] name = "subtle" version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.114" @@ -2491,6 +3883,9 @@ name = "sync_wrapper" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -2500,7 +3895,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2509,6 +3904,30 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tempfile" +version = "3.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +dependencies = [ + "fastrand", + "getrandom 0.3.4", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "2.0.18" @@ -2526,16 +3945,41 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", ] [[package]] -name = "thread_local" -version = "1.1.9" +name = "tiff" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +checksum = "af9605de7fee8d9551863fd692cce7637f548dbd9db9180fcc07ccc6d26c336f" dependencies = [ - "cfg-if", + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg 0.4.21", ] [[package]] @@ -2545,6 +3989,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", + "itoa", "num-conv", "powerfmt", "serde_core", @@ -2568,6 +4013,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -2578,6 +4032,21 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.49.0" @@ -2587,9 +4056,10 @@ dependencies = [ "bytes", "libc", "mio", + "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.2", + "socket2", "tokio-macros", "windows-sys 0.61.2", ] @@ 
-2602,26 +4072,27 @@ checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] -name = "tokio-rustls" -version = "0.24.1" +name = "tokio-native-tls" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" dependencies = [ - "rustls 0.21.12", + "native-tls", "tokio", ] [[package]] -name = "tokio-rustls" -version = "0.26.4" +name = "tokio-stream" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" dependencies = [ - "rustls 0.23.36", + "futures-core", + "pin-project-lite", "tokio", ] @@ -2664,12 +4135,15 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags", + "bitflags 2.10.0", "bytes", - "http 1.4.0", - "http-body 1.0.1", + "futures-util", + "http", + "http-body", "http-body-util", + "iri-string", "pin-project-lite", + "tower", "tower-layer", "tower-service", "tracing", @@ -2708,7 +4182,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2769,6 +4243,37 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "ttf-parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "type1-encoding-parser" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3d6cc09e1a99c7e01f2afe4953789311a1c50baebbdac5b477ecf78e2e92a5b" +dependencies = [ + "pom", +] + +[[package]] +name = "typed-path" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3015e6ce46d5ad8751e4a772543a30c7511468070e98e64e20165f8f81155b64" + [[package]] name = "typenum" version = "1.19.0" @@ -2781,6 +4286,21 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -2793,12 +4313,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" -[[package]] -name = "untrusted" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" - [[package]] name = "url" version = "2.5.8" @@ -2817,12 +4331,24 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "utoipa" version = "5.4.0" @@ -2844,7 +4370,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn", + "syn 2.0.114", "uuid", ] @@ -2872,6 +4398,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + [[package]] name = "valuable" version = "0.1.1" @@ -2879,16 +4416,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] -name = "version_check" -version = "0.9.5" +name = "vcpkg" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] -name = "vsimd" -version = "0.8.0" +name = "version_check" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "want" @@ -2927,6 +4464,20 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.108" @@ -2946,7 +4497,7 @@ dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.114", "wasm-bindgen-shared", ] @@ -2959,6 +4510,45 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "wide" +version = "0.7.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce5da8ecb62bcd8ec8b7ea19f69a51275e91299be594ea5cc6ef7819e16cd03" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "windows-core" version = "0.62.2" @@ -2980,7 +4570,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -2991,7 +4581,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3018,22 +4608,13 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.5", + "windows-targets", ] [[package]] @@ -3045,22 +4626,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", -] - [[package]] name = "windows-targets" version = "0.53.5" @@ -3068,106 +4633,58 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" dependencies = [ "windows-link", - "windows_aarch64_gnullvm 0.53.1", - "windows_aarch64_msvc 0.53.1", - "windows_i686_gnu 0.53.1", - "windows_i686_gnullvm 0.53.1", - "windows_i686_msvc 0.53.1", - "windows_x86_64_gnu 0.53.1", - "windows_x86_64_gnullvm 0.53.1", - "windows_x86_64_msvc 0.53.1", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - [[package]] name = "windows_aarch64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - [[package]] name = "windows_aarch64_msvc" version = "0.53.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - [[package]] name = "windows_i686_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - [[package]] name = "windows_i686_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - [[package]] name = "windows_i686_msvc" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - [[package]] name = "windows_x86_64_gnu" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - [[package]] name = "windows_x86_64_gnullvm" version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" - [[package]] name = "windows_x86_64_msvc" version = "0.53.1" @@ -3187,10 +4704,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] -name = "xmlparser" -version = "0.13.6" +name = "xml-rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" + +[[package]] +name = "xmltree" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b619f8c85654798007fb10afa5125590b43b088c225a25fc2fec100a9fad0fc6" +dependencies = [ + "xml-rs", +] + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + +[[package]] +name = "y4m" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +checksum = 
"7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" [[package]] name = "yoke" @@ -3211,7 +4752,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", "synstructure", ] @@ -3232,7 +4773,7 @@ checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", ] [[package]] @@ -3252,7 +4793,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", "synstructure", ] @@ -3261,6 +4802,20 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] [[package]] name = "zerotrie" @@ -3292,11 +4847,140 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.114", +] + +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "deflate64", + "displaydoc", + "flate2", + "getrandom 0.3.4", + "hmac", + "indexmap", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "thiserror", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zip" +version = "7.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc12baa6db2b15a140161ce53d72209dacea594230798c24774139b54ecaa980" +dependencies = [ + "crc32fast", + "flate2", + "indexmap", + "memchr", + "typed-path", + "zopfli", ] +[[package]] +name = "zlib-rs" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" + [[package]] name = "zmij" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4de98dfa5d5b7fef4ee834d0073d560c9ca7b6c46a71d058c48db7960f8cfaf7" + +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] + 
+[[package]] +name = "zune-core" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ce2c8a9384ad323cf564b67da86e21d3cfdff87908bc1223ed5c99bc792713" +dependencies = [ + "zune-core 0.4.12", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core 0.5.1", +] diff --git a/Cargo.toml b/Cargo.toml index 9ae4d49..274f78d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,10 @@ members = [ "./crates/nvisy-core", "./crates/nvisy-detect", "./crates/nvisy-engine", + "./crates/nvisy-ingest", + "./crates/nvisy-media", "./crates/nvisy-object", + "./crates/nvisy-ontology", "./crates/nvisy-python", "./crates/nvisy-server", ] @@ -33,7 +36,10 @@ documentation = "https://docs.rs/nvisy-runtime" nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } nvisy-detect = { path = "./crates/nvisy-detect", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } +nvisy-ingest = { path = "./crates/nvisy-ingest", version = "0.1.0" } +nvisy-media = { path = "./crates/nvisy-media", version = "0.1.0" } nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } +nvisy-ontology = { path = "./crates/nvisy-ontology", version = "0.1.0" } nvisy-python = { path = "./crates/nvisy-python", version = "0.1.0" } nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } @@ -66,12 +72,13 @@ anyhow = { version = "1.0", features = [] } derive_more = { version = "1", features = ["display"] } # Primitive datatypes -uuid = { version = "1", features = ["serde", "v4"] } +uuid = { version = "1", features = ["serde", "v4", "v7"] } chrono = { version = "0.4", features = ["serde"] } bytes = { version = "1", features = ["serde"] } # Text processing regex = { version = "1.0", features = [] } +aho-corasick = { version = "1", features = [] } # Graph data structures petgraph = { version = "0.8", features = [] } @@ -85,9 +92,41 @@ schemars = { version = "1", features = ["uuid1", "chrono04", "bytes1"] } # Python interop pyo3 = { version = "0.23", features = [] } -# AWS SDK -aws-sdk-s3 = { version = "1", features = [] } -aws-config = { version = "1", features = [] } +# S3-compatible object storage +minio = { version = "0.3", features = [] } + +# Image processing +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } +imageproc = "0.25" + +# Document parsing +pdf-extract = "0.7" +lopdf = "0.34" +scraper = "0.22" +calamine = "0.33" +zip = "2" +quick-xml = "0.37" +arrow = { version = "54", default-features = false } +parquet = { version = "54", default-features = false, features = ["arrow"] } + +# Encoding +base64 = "0.22" + +# Time +jiff = "0.2" + +# Interned strings +hipstr = "0.6" + +# Hashing +sha2 = 
"0.10" +hex = "0.4" + +# Enum derives +strum = { version = "0.26", features = ["derive"] } + +# Testing +tempfile = "3" # Randomness rand = { version = "0.9", features = [] } diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 617f75b..cf597db 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -33,11 +33,11 @@ serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } # Async runtime -tokio = { workspace = true, features = ["sync"] } +tokio = { workspace = true, features = ["sync", "fs", "io-util", "rt"] } async-trait = { workspace = true, features = [] } # Primitive datatypes -uuid = { workspace = true, features = ["serde", "v4"] } +uuid = { workspace = true, features = ["serde", "v4", "v7"] } chrono = { workspace = true, features = ["serde"] } bytes = { workspace = true, features = ["serde"] } @@ -47,7 +47,23 @@ infer = { workspace = true, features = [] } # Error handling thiserror = { workspace = true, features = [] } anyhow = { workspace = true, features = [] } -derive_more = { workspace = true, features = ["display"] } +derive_more = { workspace = true, features = ["display", "deref", "as_ref"] } + +# Time +jiff = { workspace = true } + +# Interned strings +hipstr = { workspace = true } + +# Hashing +sha2 = { workspace = true } +hex = { workspace = true } + +# Enum derives +strum = { workspace = true } # Observability tracing = { workspace = true, features = [] } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-core/src/datatypes/document.rs index fdec9d6..5d7101e 100644 --- a/crates/nvisy-core/src/datatypes/document.rs +++ b/crates/nvisy-core/src/datatypes/document.rs @@ -486,6 +486,58 @@ impl Document { } } +// --------------------------------------------------------------------------- +// TabularData +// --------------------------------------------------------------------------- + +/// Tabular data extracted from spreadsheets, CSV files, or database exports. +/// +/// Represents a two-dimensional table with named columns and string cell +/// values. Carries optional metadata about the original file format and +/// sheet name for multi-sheet workbooks. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct TabularData { + /// Common data-item fields (id, parent_id, metadata). + #[serde(flatten)] + pub data: Data, + /// Column header names. + pub columns: Vec<String>, + /// Row data — each inner Vec has the same length as `columns`. + pub rows: Vec<Vec<String>>, + /// Original file format (e.g. `"csv"`, `"parquet"`, `"xlsx"`). + #[serde(skip_serializing_if = "Option::is_none")] + pub source_format: Option<String>, + /// Sheet or tab name within a multi-sheet workbook. + #[serde(skip_serializing_if = "Option::is_none")] + pub sheet_name: Option<String>, +} + +impl TabularData { + /// Create new tabular data with the given columns and rows. + pub fn new(columns: Vec<String>, rows: Vec<Vec<String>>) -> Self { + Self { + data: Data::new(), + columns, + rows, + source_format: None, + sheet_name: None, + } + } + + /// Record the original file format (e.g. `"csv"`, `"xlsx"`). + pub fn with_source_format(mut self, format: impl Into<String>) -> Self { + self.source_format = Some(format.into()); + self + } + + /// Set the sheet name for multi-sheet workbooks. 
+ pub fn with_sheet_name(mut self, name: impl Into<String>) -> Self { + self.sheet_name = Some(name.into()); + self + } +} + // --------------------------------------------------------------------------- // ImageData // --------------------------------------------------------------------------- diff --git a/crates/nvisy-core/src/error.rs b/crates/nvisy-core/src/error.rs index 14195e5..a7a8452 100644 --- a/crates/nvisy-core/src/error.rs +++ b/crates/nvisy-core/src/error.rs @@ -27,6 +27,12 @@ pub enum ErrorKind { Runtime, /// An error originating from the embedded Python bridge. Python, + /// An internal infrastructure error (filesystem, I/O). + InternalError, + /// The input was invalid or out of bounds. + InvalidInput, + /// A serialization or encoding error. + Serialization, /// An error that does not fit any other category. Other, } @@ -134,6 +140,13 @@ impl Error { } } +impl From<std::io::Error> for Error { + fn from(err: std::io::Error) -> Self { + Self::new(ErrorKind::InternalError, err.to_string()) + .with_source(err) + } +} + impl From<anyhow::Error> for Error { fn from(err: anyhow::Error) -> Self { // anyhow::Error doesn't implement std::error::Error, so we capture the diff --git a/crates/nvisy-core/src/fs/content_file.rs b/crates/nvisy-core/src/fs/content_file.rs new file mode 100644 index 0000000..0d46e06 --- /dev/null +++ b/crates/nvisy-core/src/fs/content_file.rs @@ -0,0 +1,637 @@ +//! Content file handling for filesystem operations +//! +//! This module provides the [`ContentFile`] struct for working with files +//! on the filesystem while maintaining content source tracking and metadata. + +use std::io; +use std::path::{Path, PathBuf}; + +use bytes::Bytes; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWrite, AsyncWriteExt, SeekFrom}; + +use crate::error::{Error, ErrorKind, Result}; +use crate::fs::ContentMetadata; +use crate::io::{AsyncContentRead, AsyncContentWrite, ContentData}; +use crate::path::ContentSource; + +/// A file wrapper that combines filesystem operations with content tracking +/// +/// This struct provides a high-level interface for working with files while +/// maintaining content source identification and metadata throughout the +/// processing pipeline. +#[derive(Debug)] +pub struct ContentFile { + /// Unique identifier for this content source + content_source: ContentSource, + /// The underlying tokio file handle + file: File, + /// Path to the file + path: PathBuf, +} + +impl ContentFile { + /// Create a new `ContentFile` by opening an existing file + /// + /// # Errors + /// + /// Returns an error if the file cannot be opened or doesn't exist. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::fs::ContentFile; + /// use std::path::Path; + /// + /// async fn open_file() -> Result<(), Box<dyn std::error::Error>> { + /// let content_file = ContentFile::open("example.txt").await?; + /// println!("Opened file with source: {}", content_file.content_source()); + /// Ok(()) + /// } + /// ``` + pub async fn open(path: impl AsRef<Path>) -> io::Result<Self> { + let path_buf = path.as_ref().to_path_buf(); + let file = File::open(&path_buf).await?; + let content_source = ContentSource::new(); + + Ok(Self { + content_source, + file, + path: path_buf, + }) + } + + /// Create a new `ContentFile` with a specific content source + /// + /// # Errors + /// + /// Returns an error if the file cannot be opened or read. 
+ pub async fn open_with_source( + path: impl AsRef<Path>, + content_source: ContentSource, + ) -> io::Result<Self> { + let path_buf = path.as_ref().to_path_buf(); + let file = File::open(&path_buf).await?; + + Ok(Self { + content_source, + file, + path: path_buf, + }) + } + + /// Create a new file and return a `ContentFile` + /// + /// # Errors + /// + /// Returns an error if the file cannot be created. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::fs::ContentFile; + /// + /// async fn create_file() -> Result<(), Box<dyn std::error::Error>> { + /// let content_file = ContentFile::create("new_file.txt").await?; + /// println!("Created file with source: {}", content_file.content_source()); + /// Ok(()) + /// } + /// ``` + pub async fn create(path: impl AsRef<Path>) -> io::Result<Self> { + let path_buf = path.as_ref().to_path_buf(); + let file = File::create(&path_buf).await?; + let content_source = ContentSource::new(); + + Ok(Self { + content_source, + file, + path: path_buf, + }) + } + + /// Create a new file with a specific content source + /// + /// # Errors + /// + /// Returns an error if the file cannot be created or written to. + pub async fn create_with_source( + path: impl AsRef<Path>, + content_source: ContentSource, + ) -> io::Result<Self> { + let path_buf = path.as_ref().to_path_buf(); + let file = File::create(&path_buf).await?; + + Ok(Self { + content_source, + file, + path: path_buf, + }) + } + + /// Open a file with custom options + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::fs::ContentFile; + /// use tokio::fs::OpenOptions; + /// + /// async fn open_with_options() -> Result<(), Box<dyn std::error::Error>> { + /// let mut options = OpenOptions::new(); + /// options.read(true) + /// .write(true) + /// .create(true); + /// + /// let content_file = ContentFile::open_with_options("data.txt", &options).await?; + /// Ok(()) + /// } + /// ``` + /// + /// # Errors + /// + /// Returns an error if the file cannot be opened with the specified options. + pub async fn open_with_options( + path: impl AsRef<Path>, + options: &OpenOptions, + ) -> io::Result<Self> { + let path_buf = path.as_ref().to_path_buf(); + let file = options.open(&path_buf).await?; + let content_source = ContentSource::new(); + + Ok(Self { + content_source, + file, + path: path_buf, + }) + } + + /// Read all content from the file into a `ContentData` structure + /// + /// # Errors + /// + /// Returns an error if the file cannot be read or if an I/O error occurs. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::fs::ContentFile; + /// + /// async fn read_content() -> Result<(), Box<dyn std::error::Error>> { + /// let mut content_file = ContentFile::open("example.txt").await?; + /// let content_data = content_file.read_to_content_data().await?; + /// + /// println!("Read {} bytes", content_data.size()); + /// Ok(()) + /// } + /// ``` + pub async fn read_to_content_data(&mut self) -> Result<ContentData> { + let mut buffer = Vec::new(); + self.file.read_to_end(&mut buffer).await?; + + let content_data = ContentData::new(self.content_source, Bytes::from(buffer)); + + Ok(content_data) + } + + /// Read content with size limit to prevent memory issues + /// + /// # Errors + /// + /// Returns an error if the file cannot be read, if an I/O error occurs, + /// or if the file size exceeds the specified maximum size. 
+ pub async fn read_to_content_data_limited(&mut self, max_size: usize) -> Result<ContentData> { + let mut buffer = Vec::new(); + let mut temp_buffer = vec![0u8; 8192]; + let mut total_read = 0; + + loop { + let bytes_read = self.file.read(&mut temp_buffer).await?; + if bytes_read == 0 { + break; // EOF + } + + if total_read + bytes_read > max_size { + return Err(Error::new(ErrorKind::InvalidInput, format!( + "File size exceeds maximum limit of {max_size} bytes" + ))); + } + + buffer.extend_from_slice(&temp_buffer[..bytes_read]); + total_read += bytes_read; + } + + let content_data = ContentData::new(self.content_source, Bytes::from(buffer)); + + Ok(content_data) + } + + /// Write `ContentData` to the file + /// + /// # Errors + /// + /// Returns an error if the data cannot be written or if an I/O error occurs. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::fs::ContentFile; + /// use nvisy_core::io::ContentData; + /// + /// async fn write_content() -> Result<(), Box<dyn std::error::Error>> { + /// let mut content_file = ContentFile::create("output.txt").await?; + /// let content_data = ContentData::from("Hello, world!"); + /// + /// let metadata = content_file.write_from_content_data(content_data).await?; + /// println!("Written to: {:?}", metadata.source_path); + /// Ok(()) + /// } + /// ``` + pub async fn write_from_content_data( + &mut self, + content_data: ContentData, + ) -> Result<ContentMetadata> { + self.file.write_all(content_data.as_bytes()).await?; + self.file.flush().await?; + + let metadata = ContentMetadata::with_path(content_data.content_source, self.path.clone()); + Ok(metadata) + } + + /// Append `ContentData` to the file + /// + /// # Errors + /// + /// Returns an error if the data cannot be appended or if an I/O error occurs. + pub async fn append_from_content_data( + &mut self, + content_data: ContentData, + ) -> Result<ContentMetadata> { + self.file.seek(SeekFrom::End(0)).await?; + self.file.write_all(content_data.as_bytes()).await?; + self.file.flush().await?; + + let metadata = ContentMetadata::with_path(content_data.content_source, self.path.clone()); + Ok(metadata) + } + + /// Write `ContentData` in chunks for better memory efficiency + /// + /// # Errors + /// + /// Returns an error if the data cannot be written or if an I/O error occurs. 
+ pub async fn write_from_content_data_chunked( + &mut self, + content_data: ContentData, + chunk_size: usize, + ) -> Result<ContentMetadata> { + let data = content_data.as_bytes(); + + for chunk in data.chunks(chunk_size) { + self.file.write_all(chunk).await?; + } + + self.file.flush().await?; + + let metadata = ContentMetadata::with_path(content_data.content_source, self.path.clone()); + Ok(metadata) + } + + /// Get content metadata for this file + pub fn content_metadata(&self) -> ContentMetadata { + ContentMetadata::with_path(self.content_source, self.path.clone()) + } + + /// Get the file path + pub fn path(&self) -> &Path { + &self.path + } + + /// Get the content source + pub fn content_source(&self) -> ContentSource { + self.content_source + } + + /// Get the source identifier for this content + pub fn source(&self) -> ContentSource { + self.content_source + } + + /// Get a reference to the underlying file + pub fn as_file(&self) -> &File { + &self.file + } + + /// Get a mutable reference to the underlying file + pub fn as_file_mut(&mut self) -> &mut File { + &mut self.file + } + + /// Convert into the underlying file, consuming the `ContentFile` + pub fn into_file(self) -> File { + self.file + } + + /// Get file size in bytes + /// + /// # Errors + /// + /// Returns an error if the file metadata cannot be retrieved. + pub async fn size(&mut self) -> Result<u64> { + let metadata = self.file.metadata().await?; + Ok(metadata.len()) + } + + /// Check if the file exists + pub fn exists(&self) -> bool { + self.path.exists() + } + + /// Get the filename + pub fn filename(&self) -> Option<&str> { + self.path.file_name().and_then(|name| name.to_str()) + } + + /// Get the file extension + pub fn extension(&self) -> Option<&str> { + self.path.extension().and_then(|ext| ext.to_str()) + } + + /// Sync all data to disk + /// + /// # Errors + /// + /// Returns an error if the sync operation fails. + pub async fn sync_all(&mut self) -> Result<()> { + self.file.sync_all().await?; + Ok(()) + } + + /// Sync data (but not metadata) to disk + /// + /// # Errors + /// + /// Returns an error if the sync operation fails. + pub async fn sync_data(&mut self) -> Result<()> { + self.file.sync_data().await?; + Ok(()) + } + + /// Seek to a specific position in the file + /// + /// # Errors + /// + /// Returns an error if the seek operation fails. + pub async fn seek(&mut self, pos: SeekFrom) -> Result<u64> { + let position = self.file.seek(pos).await?; + Ok(position) + } + + /// Get current position in the file + /// + /// # Errors + /// + /// Returns an error if the current position cannot be determined. 
+ pub async fn stream_position(&mut self) -> Result<u64> { + let position = self.file.stream_position().await?; + Ok(position) + } +} + +// Implement AsyncRead for ContentFile by delegating to the underlying file +impl AsyncRead for ContentFile { + fn poll_read( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll<std::io::Result<()>> { + std::pin::Pin::new(&mut self.file).poll_read(cx, buf) + } +} + +// Implement AsyncWrite for ContentFile by delegating to the underlying file +impl AsyncWrite for ContentFile { + fn poll_write( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll<std::result::Result<usize, std::io::Error>> { + std::pin::Pin::new(&mut self.file).poll_write(cx, buf) + } + + fn poll_flush( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<std::result::Result<(), std::io::Error>> { + std::pin::Pin::new(&mut self.file).poll_flush(cx) + } + + fn poll_shutdown( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<std::result::Result<(), std::io::Error>> { + std::pin::Pin::new(&mut self.file).poll_shutdown(cx) + } +} + +// Implement AsyncContentRead for ContentFile by delegating to the underlying file +impl AsyncContentRead for ContentFile { + // Default implementations from the trait will work since File implements AsyncRead +} + +// Implement AsyncContentWrite for ContentFile by delegating to the underlying file +impl AsyncContentWrite for ContentFile { + // Default implementations from the trait will work since File implements AsyncWrite +} + +#[cfg(test)] +mod tests { + use tempfile::NamedTempFile; + + use super::*; + + #[tokio::test] + async fn test_create_and_open() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Create file + let content_file = ContentFile::create(path).await.unwrap(); + assert_eq!(content_file.path(), path); + assert!(!content_file.content_source.as_uuid().is_nil()); + + // Clean up + drop(content_file); + + // Open existing file + let content_file = ContentFile::open(path).await.unwrap(); + assert_eq!(content_file.path(), path); + } + + #[tokio::test] + async fn test_write_and_read_content_data() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Write content + let mut content_file = ContentFile::create(path).await.unwrap(); + let content_data = ContentData::from("Hello, world!"); + let metadata = content_file + .write_from_content_data(content_data) + .await + .unwrap(); + + assert_eq!(metadata.source_path, Some(path.to_path_buf())); + + // Read content back + drop(content_file); + let mut content_file = ContentFile::open(path).await.unwrap(); + let read_content = content_file.read_to_content_data().await.unwrap(); + + assert_eq!(read_content.as_string().unwrap(), "Hello, world!"); + } + + #[tokio::test] + async fn test_file_extension() { + let temp_file = NamedTempFile::new().unwrap(); + let mut path = temp_file.path().to_path_buf(); + path.set_extension("txt"); + + let content_file = ContentFile::create(&path).await.unwrap(); + assert_eq!(content_file.extension(), Some("txt")); + assert_eq!( + content_file.filename(), + path.file_name().and_then(|n| n.to_str()) + ); + } + + #[tokio::test] + async fn test_write_chunked() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + let mut content_file = 
ContentFile::create(path).await.unwrap(); + let large_data = vec![b'A'; 1000]; + let content_data = ContentData::from(large_data.clone()); + + let metadata = content_file + .write_from_content_data_chunked(content_data, 100) + .await + .unwrap(); + assert_eq!(metadata.source_path, Some(path.to_path_buf())); + + // Verify content + drop(content_file); + let mut content_file = ContentFile::open(path).await.unwrap(); + let read_content = content_file.read_to_content_data().await.unwrap(); + + assert_eq!(read_content.as_bytes(), large_data.as_slice()); + } + + #[tokio::test] + async fn test_append_content() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Write initial content + let mut content_file = ContentFile::create(path).await.unwrap(); + let initial_content = ContentData::from("Hello, "); + content_file + .write_from_content_data(initial_content) + .await + .unwrap(); + + // Append more content + let append_content = ContentData::from("world!"); + content_file + .append_from_content_data(append_content) + .await + .unwrap(); + + // Verify combined content + drop(content_file); + let mut content_file = ContentFile::open(path).await.unwrap(); + let read_content = content_file.read_to_content_data().await.unwrap(); + + assert_eq!(read_content.as_string().unwrap(), "Hello, world!"); + } + + #[tokio::test] + async fn test_read_with_limit() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + // Write content larger than limit + let mut content_file = ContentFile::create(path).await.unwrap(); + let large_content = ContentData::from(vec![b'X'; 1000]); + content_file + .write_from_content_data(large_content) + .await + .unwrap(); + + drop(content_file); + + // Try to read with small limit + let mut content_file = ContentFile::open(path).await.unwrap(); + let result = content_file.read_to_content_data_limited(100).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_file_operations() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + let mut content_file = ContentFile::create(path).await.unwrap(); + + // Test size (should be 0 for new file) + let size = content_file.size().await.unwrap(); + assert_eq!(size, 0); + + // Test existence + assert!(content_file.exists()); + + // Write some content + let content = ContentData::from("Test content"); + content_file.write_from_content_data(content).await.unwrap(); + + // Test size after writing + let size = content_file.size().await.unwrap(); + assert!(size > 0); + + // Test sync operations + content_file.sync_all().await.unwrap(); + content_file.sync_data().await.unwrap(); + } + + #[tokio::test] + async fn test_seeking() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + let mut content_file = ContentFile::create(path).await.unwrap(); + let content = ContentData::from("0123456789"); + content_file.write_from_content_data(content).await.unwrap(); + + // Test seeking + let pos = content_file.seek(SeekFrom::Start(5)).await.unwrap(); + assert_eq!(pos, 5); + + let current_pos = content_file.stream_position().await.unwrap(); + assert_eq!(current_pos, 5); + } + + #[tokio::test] + async fn test_with_specific_source() { + let temp_file = NamedTempFile::new().unwrap(); + let path = temp_file.path(); + + let source = ContentSource::new(); + let content_file = ContentFile::create_with_source(path, source).await.unwrap(); + + assert_eq!(content_file.content_source, source); + + let metadata = 
content_file.content_metadata(); + assert_eq!(metadata.content_source, source); + assert_eq!(metadata.source_path, Some(path.to_path_buf())); + } +} diff --git a/crates/nvisy-core/src/fs/content_handler.rs b/crates/nvisy-core/src/fs/content_handler.rs new file mode 100644 index 0000000..69d3f8b --- /dev/null +++ b/crates/nvisy-core/src/fs/content_handler.rs @@ -0,0 +1,131 @@ +use std::fmt; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use crate::path::ContentSource; + +/// Inner state cleaned up when the last `ContentHandler` reference is dropped. +struct ContentHandlerInner { + content_source: ContentSource, + dir: PathBuf, + runtime_handle: tokio::runtime::Handle, +} + +impl fmt::Debug for ContentHandlerInner { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ContentHandlerInner") + .field("content_source", &self.content_source) + .field("dir", &self.dir) + .finish() + } +} + +impl Drop for ContentHandlerInner { + fn drop(&mut self) { + let dir = self.dir.clone(); + let source = self.content_source; + + self.runtime_handle.spawn(async move { + if let Err(err) = tokio::fs::remove_dir_all(&dir).await { + tracing::warn!( + target: "nvisy_core::fs", + content_source = %source, + path = %dir.display(), + error = %err, + "Failed to clean up temporary content directory" + ); + } else { + tracing::trace!( + target: "nvisy_core::fs", + content_source = %source, + path = %dir.display(), + "Cleaned up temporary content directory" + ); + } + }); + } +} + +/// Handle to content stored in a managed temporary directory. +/// +/// Cloning is cheap — clones share the same underlying directory via `Arc`. +/// When the last clone is dropped, the temporary directory is deleted. +#[derive(Debug, Clone)] +pub struct ContentHandler { + inner: Arc<ContentHandlerInner>, +} + +impl ContentHandler { + /// Creates a new content handler. + pub(crate) fn new( + content_source: ContentSource, + dir: PathBuf, + runtime_handle: tokio::runtime::Handle, + ) -> Self { + Self { + inner: Arc::new(ContentHandlerInner { + content_source, + dir, + runtime_handle, + }), + } + } + + /// Returns the content source identifier. + pub fn content_source(&self) -> ContentSource { + self.inner.content_source + } + + /// Returns the path to the temporary directory. 
+ pub fn dir(&self) -> &Path { + &self.inner.dir + } +} + +#[cfg(test)] +mod tests { + use crate::fs::ContentRegistry; + use crate::io::{Content, ContentData}; + + #[tokio::test] + async fn test_handler_has_valid_source() { + let temp = tempfile::TempDir::new().unwrap(); + let registry = ContentRegistry::new(temp.path().join("content")); + let content = Content::new(ContentData::from("test data")); + let handler = registry.register(content).await.unwrap(); + + assert!(!handler.content_source().as_uuid().is_nil()); + assert!(handler.dir().exists()); + } + + #[tokio::test] + async fn test_clone_shares_same_directory() { + let temp = tempfile::TempDir::new().unwrap(); + let registry = ContentRegistry::new(temp.path().join("content")); + let content = Content::new(ContentData::from("shared")); + let handler1 = registry.register(content).await.unwrap(); + let handler2 = handler1.clone(); + + assert_eq!(handler1.dir(), handler2.dir()); + } + + #[tokio::test] + async fn test_directory_cleaned_on_last_drop() { + let temp = tempfile::TempDir::new().unwrap(); + let registry = ContentRegistry::new(temp.path().join("content")); + let content = Content::new(ContentData::from("cleanup test")); + let handler = registry.register(content).await.unwrap(); + let dir = handler.dir().to_path_buf(); + let handler2 = handler.clone(); + + assert!(dir.exists()); + + drop(handler); + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + assert!(dir.exists()); + + drop(handler2); + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + assert!(!dir.exists()); + } +} diff --git a/crates/nvisy-core/src/fs/content_kind.rs b/crates/nvisy-core/src/fs/content_kind.rs new file mode 100644 index 0000000..288f488 --- /dev/null +++ b/crates/nvisy-core/src/fs/content_kind.rs @@ -0,0 +1,132 @@ +//! Content type classification for different categories of data +//! +//! This module provides the [`ContentKind`] enum for classifying content +//! into broad categories. Extension-to-kind mapping is handled by the +//! engine's format registry. + +use serde::{Deserialize, Serialize}; +use strum::{AsRefStr, Display, EnumIter, EnumString}; + +/// Content type classification for different categories of data +/// +/// This enum represents high-level content categories without knowledge +/// of specific file extensions or MIME types. The engine's format registry +/// handles the mapping from extensions/MIME types to content kinds. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(AsRefStr, Display, EnumString, EnumIter)] +#[derive(Serialize, Deserialize)] +#[strum(serialize_all = "lowercase")] +#[serde(rename_all = "lowercase")] +pub enum ContentKind { + /// Plain text content + Text, + /// Document files (PDF, Word, etc.) + Document, + /// Spreadsheet files (Excel, CSV, etc.) + Spreadsheet, + /// Image files + Image, + /// Archive files (ZIP, TAR, etc.) 
+ Archive, + /// Unknown or unsupported content type + #[default] + Unknown, +} + +impl ContentKind { + /// Check if this content kind represents text-based content + #[must_use] + pub fn is_text_based(&self) -> bool { + matches!(self, Self::Text) + } + + /// Check if this content kind represents a document + #[must_use] + pub fn is_document(&self) -> bool { + matches!(self, Self::Document) + } + + /// Check if this content kind represents a spreadsheet + #[must_use] + pub fn is_spreadsheet(&self) -> bool { + matches!(self, Self::Spreadsheet) + } + + /// Check if this content kind represents an image + #[must_use] + pub fn is_image(&self) -> bool { + matches!(self, Self::Image) + } + + /// Check if this content kind represents an archive + #[must_use] + pub fn is_archive(&self) -> bool { + matches!(self, Self::Archive) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_content_kind_predicates() { + assert!(ContentKind::Text.is_text_based()); + assert!(!ContentKind::Document.is_text_based()); + + assert!(ContentKind::Document.is_document()); + assert!(!ContentKind::Text.is_document()); + + assert!(ContentKind::Spreadsheet.is_spreadsheet()); + assert!(!ContentKind::Document.is_spreadsheet()); + + assert!(ContentKind::Image.is_image()); + assert!(!ContentKind::Text.is_image()); + + assert!(ContentKind::Archive.is_archive()); + assert!(!ContentKind::Document.is_archive()); + } + + #[test] + fn test_content_kind_display() { + assert_eq!(ContentKind::Text.to_string(), "text"); + assert_eq!(ContentKind::Document.to_string(), "document"); + assert_eq!(ContentKind::Spreadsheet.to_string(), "spreadsheet"); + assert_eq!(ContentKind::Image.to_string(), "image"); + assert_eq!(ContentKind::Archive.to_string(), "archive"); + assert_eq!(ContentKind::Unknown.to_string(), "unknown"); + } + + #[test] + fn test_content_kind_as_ref() { + assert_eq!(ContentKind::Text.as_ref(), "text"); + assert_eq!(ContentKind::Document.as_ref(), "document"); + } + + #[test] + fn test_content_kind_from_str() { + use std::str::FromStr; + + assert_eq!(ContentKind::from_str("text").unwrap(), ContentKind::Text); + assert_eq!( + ContentKind::from_str("document").unwrap(), + ContentKind::Document + ); + assert!(ContentKind::from_str("invalid").is_err()); + } + + #[test] + fn test_default() { + assert_eq!(ContentKind::default(), ContentKind::Unknown); + } + + #[test] + fn test_serialization() { + let kind = ContentKind::Spreadsheet; + let serialized = serde_json::to_string(&kind).unwrap(); + assert_eq!(serialized, "\"spreadsheet\""); + + let deserialized: ContentKind = serde_json::from_str(&serialized).unwrap(); + assert_eq!(deserialized, kind); + } +} diff --git a/crates/nvisy-core/src/fs/content_metadata.rs b/crates/nvisy-core/src/fs/content_metadata.rs new file mode 100644 index 0000000..23d01da --- /dev/null +++ b/crates/nvisy-core/src/fs/content_metadata.rs @@ -0,0 +1,184 @@ +//! Content metadata for filesystem operations +//! +//! This module provides the [`ContentMetadata`] struct for handling metadata +//! about content files, including paths and source tracking. + +use std::path::{Path, PathBuf}; + +use serde::{Deserialize, Serialize}; + +use crate::path::ContentSource; + +/// Metadata associated with content files +/// +/// This struct stores metadata about content including its source identifier +/// and file path. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct ContentMetadata { + /// Unique identifier for the content source + pub content_source: ContentSource, + /// Optional path to the source file + pub source_path: Option<PathBuf>, +} + +impl ContentMetadata { + /// Create new content metadata with just a source + /// + /// # Example + /// + /// ``` + /// use nvisy_core::{fs::ContentMetadata, path::ContentSource}; + /// + /// let source = ContentSource::new(); + /// let metadata = ContentMetadata::new(source); + /// ``` + #[must_use] + pub fn new(content_source: ContentSource) -> Self { + Self { + content_source, + source_path: None, + } + } + + /// Create content metadata with a file path + /// + /// # Example + /// + /// ``` + /// use nvisy_core::{fs::ContentMetadata, path::ContentSource}; + /// use std::path::PathBuf; + /// + /// let source = ContentSource::new(); + /// let metadata = ContentMetadata::with_path(source, PathBuf::from("document.pdf")); + /// assert_eq!(metadata.file_extension(), Some("pdf")); + /// ``` + pub fn with_path(content_source: ContentSource, path: impl Into<PathBuf>) -> Self { + Self { + content_source, + source_path: Some(path.into()), + } + } + + /// Get the file extension if available + #[must_use] + pub fn file_extension(&self) -> Option<&str> { + self.source_path + .as_ref() + .and_then(|path| path.extension()) + .and_then(|ext| ext.to_str()) + } + + /// Get the filename if available + #[must_use] + pub fn filename(&self) -> Option<&str> { + self.source_path + .as_ref() + .and_then(|path| path.file_name()) + .and_then(|name| name.to_str()) + } + + /// Get the parent directory if available + #[must_use] + pub fn parent_directory(&self) -> Option<&Path> { + self.source_path.as_ref().and_then(|path| path.parent()) + } + + /// Get the full path if available + #[must_use] + pub fn path(&self) -> Option<&Path> { + self.source_path.as_deref() + } + + /// Set the source path + pub fn set_path(&mut self, path: impl Into<PathBuf>) { + self.source_path = Some(path.into()); + } + + /// Remove the source path + pub fn clear_path(&mut self) { + self.source_path = None; + } + + /// Check if this metadata has a path + #[must_use] + pub fn has_path(&self) -> bool { + self.source_path.is_some() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_content_metadata_creation() { + let source = ContentSource::new(); + let metadata = ContentMetadata::new(source); + + assert_eq!(metadata.content_source, source); + assert!(metadata.source_path.is_none()); + assert!(!metadata.has_path()); + } + + #[test] + fn test_content_metadata_with_path() { + let source = ContentSource::new(); + let path = PathBuf::from("/path/to/document.pdf"); + let metadata = ContentMetadata::with_path(source, path.clone()); + + assert_eq!(metadata.content_source, source); + assert_eq!(metadata.source_path, Some(path)); + assert!(metadata.has_path()); + } + + #[test] + fn test_file_extension_detection() { + let source = ContentSource::new(); + let metadata = ContentMetadata::with_path(source, PathBuf::from("document.pdf")); + + assert_eq!(metadata.file_extension(), Some("pdf")); + } + + #[test] + fn test_metadata_filename() { + let source = ContentSource::new(); + let metadata = ContentMetadata::with_path(source, PathBuf::from("/path/to/file.txt")); + + assert_eq!(metadata.filename(), Some("file.txt")); + } + + #[test] + fn test_metadata_parent_directory() { + let source = ContentSource::new(); + let metadata = ContentMetadata::with_path(source, 
PathBuf::from("/path/to/file.txt")); + + assert_eq!(metadata.parent_directory(), Some(Path::new("/path/to"))); + } + + #[test] + fn test_path_operations() { + let source = ContentSource::new(); + let mut metadata = ContentMetadata::new(source); + + assert!(!metadata.has_path()); + + metadata.set_path("test.txt"); + assert!(metadata.has_path()); + assert_eq!(metadata.filename(), Some("test.txt")); + + metadata.clear_path(); + assert!(!metadata.has_path()); + assert_eq!(metadata.filename(), None); + } + + #[test] + fn test_serde_serialization() { + let source = ContentSource::new(); + let metadata = ContentMetadata::with_path(source, PathBuf::from("test.json")); + + let serialized = serde_json::to_string(&metadata).unwrap(); + let deserialized: ContentMetadata = serde_json::from_str(&serialized).unwrap(); + + assert_eq!(metadata, deserialized); + } +} diff --git a/crates/nvisy-core/src/fs/content_registry.rs b/crates/nvisy-core/src/fs/content_registry.rs new file mode 100644 index 0000000..6a190a5 --- /dev/null +++ b/crates/nvisy-core/src/fs/content_registry.rs @@ -0,0 +1,108 @@ +use std::path::{Path, PathBuf}; + +use crate::error::{Error, ErrorKind, Result}; +use crate::fs::ContentHandler; +use crate::io::Content; + +/// Registry that accepts content, creates temporary directories, and returns +/// handlers that manage the directory lifecycle. +/// +/// Each call to [`register`](ContentRegistry::register) creates a subdirectory +/// under the base path, named by the content's [`ContentSource`](crate::path::ContentSource) +/// UUID. The directory is automatically cleaned up when the last +/// [`ContentHandler`] referencing it is dropped. +#[derive(Debug, Clone)] +pub struct ContentRegistry { + base_dir: PathBuf, +} + +impl ContentRegistry { + /// Creates a new content registry with the specified base directory. + /// + /// The directory does not need to exist yet — it is created lazily + /// when content is first registered. + pub fn new(base_dir: impl Into<PathBuf>) -> Self { + Self { + base_dir: base_dir.into(), + } + } + + /// Registers content and creates a managed temporary directory for it. + /// + /// Creates a subdirectory named by the content's `ContentSource` UUID, + /// writes the content data as `content.bin`, and returns a handler that + /// deletes the directory when the last reference is dropped. + pub async fn register(&self, content: Content) -> Result<ContentHandler> { + let content_source = content.content_source(); + let dir = self.base_dir.join(content_source.to_string()); + + tokio::fs::create_dir_all(&dir).await.map_err(|err| { + Error::new(ErrorKind::InternalError, format!( + "Failed to create temporary content directory (path: {})", dir.display() + )).with_source(err) + })?; + + let data_path = dir.join("content.bin"); + tokio::fs::write(&data_path, content.as_bytes()) + .await + .map_err(|err| { + Error::new(ErrorKind::InternalError, format!( + "Failed to write content data (path: {})", data_path.display() + )).with_source(err) + })?; + + let runtime_handle = tokio::runtime::Handle::current(); + + Ok(ContentHandler::new(content_source, dir, runtime_handle)) + } + + /// Returns the base directory path. 
+ pub fn base_dir(&self) -> &Path { + &self.base_dir + } +} + +#[cfg(test)] +mod tests { + use crate::io::{Content, ContentData}; + + use super::*; + + #[tokio::test] + async fn test_register_creates_directory() { + let temp = tempfile::TempDir::new().unwrap(); + let registry = ContentRegistry::new(temp.path().join("content")); + let content = Content::new(ContentData::from("Hello, world!")); + let handler = registry.register(content).await.unwrap(); + + assert!(handler.dir().exists()); + assert!(handler.dir().join("content.bin").exists()); + } + + #[tokio::test] + async fn test_base_dir() { + let temp = tempfile::TempDir::new().unwrap(); + let base = temp.path().join("content"); + let registry = ContentRegistry::new(&base); + assert_eq!(registry.base_dir(), base); + } + + #[tokio::test] + async fn test_register_multiple() { + let temp = tempfile::TempDir::new().unwrap(); + let registry = ContentRegistry::new(temp.path().join("content")); + + let h1 = registry + .register(Content::new(ContentData::from("first"))) + .await + .unwrap(); + let h2 = registry + .register(Content::new(ContentData::from("second"))) + .await + .unwrap(); + + assert_ne!(h1.dir(), h2.dir()); + assert!(h1.dir().exists()); + assert!(h2.dir().exists()); + } +} diff --git a/crates/nvisy-core/src/fs/mod.rs b/crates/nvisy-core/src/fs/mod.rs new file mode 100644 index 0000000..920670e --- /dev/null +++ b/crates/nvisy-core/src/fs/mod.rs @@ -0,0 +1,42 @@ +//! Filesystem module for content file operations +//! +//! This module provides filesystem-specific functionality for working with +//! content files, including file metadata handling and archive operations. +//! +//! # Core Types +//! +//! - [`ContentFile`]: A file wrapper that combines filesystem operations with content tracking +//! - [`ContentMetadata`]: Metadata information for content files +//! - [`ContentKind`]: Classification of content types by file extension +//! +//! # Example +//! +//! ```no_run +//! use nvisy_core::fs::ContentFile; +//! use nvisy_core::io::ContentData; +//! +//! async fn example() -> Result<(), Box<dyn std::error::Error>> { +//! // Create a new file +//! let mut content_file = ContentFile::create("example.txt").await?; +//! +//! // Write some content +//! let content_data = ContentData::from("Hello, world!"); +//! let metadata = content_file.write_from_content_data(content_data).await?; +//! +//! println!("Written to: {:?}", metadata.source_path); +//! Ok(()) +//! } +//! ``` + +mod content_file; +mod content_handler; +mod content_kind; +mod content_metadata; +mod content_registry; + +// Re-export main types +pub use content_file::ContentFile; +pub use content_handler::ContentHandler; +pub use content_kind::ContentKind; +pub use content_metadata::ContentMetadata; +pub use content_registry::ContentRegistry; diff --git a/crates/nvisy-core/src/io/content.rs b/crates/nvisy-core/src/io/content.rs new file mode 100644 index 0000000..ed6f3cb --- /dev/null +++ b/crates/nvisy-core/src/io/content.rs @@ -0,0 +1,234 @@ +//! Content representation combining data with metadata +//! +//! This module provides the [`Content`] struct that combines [`ContentData`] +//! with optional [`ContentMetadata`] for complete content representation. 
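+//!
+//! # Example
+//!
+//! A minimal sketch showing that [`Content`] dereferences to [`ContentData`],
+//! using only APIs defined in this module:
+//!
+//! ```
+//! use nvisy_core::io::{Content, ContentData};
+//!
+//! let content = Content::new(ContentData::from("Hello, world!"));
+//! // `Deref` lets `ContentData` methods be called directly on `Content`.
+//! assert_eq!(content.size(), 13);
+//!
+//! let (data, metadata) = content.into_parts();
+//! assert!(metadata.is_none());
+//! assert_eq!(data.as_str().unwrap(), "Hello, world!");
+//! ```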
+ +use derive_more::{AsRef, Deref}; +use serde::{Deserialize, Serialize}; + +use super::ContentData; +use crate::error::Result; +use crate::fs::ContentMetadata; +use crate::path::ContentSource; + +/// Complete content representation with data and metadata +/// +/// This struct combines [`ContentData`] (the actual content bytes) with +/// optional [`ContentMetadata`] (path, extension info, etc.) to provide +/// a complete content representation. +/// +/// # Examples +/// +/// ```rust +/// use nvisy_core::io::{Content, ContentData}; +/// use nvisy_core::fs::ContentMetadata; +/// use nvisy_core::path::ContentSource; +/// +/// // Create content from data +/// let data = ContentData::from("Hello, world!"); +/// let content = Content::new(data); +/// +/// assert_eq!(content.size(), 13); +/// assert!(content.is_likely_text()); +/// +/// // Create content with metadata +/// let source = ContentSource::new(); +/// let data = ContentData::from_text(source, "Sample text"); +/// let metadata = ContentMetadata::with_path(source, "document.txt"); +/// let content = Content::with_metadata(data, metadata); +/// +/// assert_eq!(content.metadata().and_then(|m| m.filename()), Some("document.txt")); +/// ``` +#[derive(Debug, Clone, PartialEq)] +#[derive(AsRef, Deref, Serialize, Deserialize)] +pub struct Content { + /// The actual content data + #[deref] + #[as_ref] + data: ContentData, + /// Optional metadata about the content + metadata: Option<ContentMetadata>, +} + +impl From<ContentData> for Content { + fn from(data: ContentData) -> Self { + Self::new(data) + } +} + +impl Content { + /// Create new content from data without metadata + pub fn new(data: ContentData) -> Self { + Self { + data, + metadata: None, + } + } + + /// Create new content with metadata + pub fn with_metadata(data: ContentData, metadata: ContentMetadata) -> Self { + Self { + data, + metadata: Some(metadata), + } + } + + /// Get the content data + pub fn data(&self) -> &ContentData { + &self.data + } + + /// Get the content metadata if available + pub fn metadata(&self) -> Option<&ContentMetadata> { + self.metadata.as_ref() + } + + /// Get the content source + pub fn content_source(&self) -> ContentSource { + self.data.content_source + } + + /// Get the content as bytes + pub fn as_bytes(&self) -> &[u8] { + self.data.as_bytes() + } + + /// Returns `true` if the content appears to be text. + pub fn is_likely_text(&self) -> bool { + self.data.is_likely_text() + } + + /// Try to get the content as a string slice. + /// + /// # Errors + /// + /// Returns an error if the content is not valid UTF-8. + pub fn as_str(&self) -> Result<&str> { + self.data.as_str() + } + + /// Get the file extension from metadata if available + pub fn file_extension(&self) -> Option<&str> { + self.metadata.as_ref().and_then(|m| m.file_extension()) + } + + /// Get the filename from metadata if available + pub fn filename(&self) -> Option<&str> { + self.metadata.as_ref().and_then(|m| m.filename()) + } + + /// Set the metadata + pub fn set_metadata(&mut self, metadata: ContentMetadata) { + self.metadata = Some(metadata); + } + + /// Remove the metadata + pub fn clear_metadata(&mut self) { + self.metadata = None; + } + + /// Consume and return the inner [`ContentData`]. 
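+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of the conversion:
+    ///
+    /// ```
+    /// use nvisy_core::io::{Content, ContentData};
+    ///
+    /// let content = Content::new(ContentData::from("Hello"));
+    /// let data = content.into_data();
+    /// assert_eq!(data.as_str().unwrap(), "Hello");
+    /// ```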
+ pub fn into_data(self) -> ContentData { + self.data + } + + /// Consume and return both data and metadata + pub fn into_parts(self) -> (ContentData, Option<ContentMetadata>) { + (self.data, self.metadata) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_content_creation() { + let data = ContentData::from("Hello, world!"); + let content = Content::new(data.clone()); + + assert_eq!(content.size(), 13); + assert!(content.is_likely_text()); + assert!(content.metadata().is_none()); + } + + #[test] + fn test_content_with_metadata() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test content"); + let metadata = ContentMetadata::with_path(source, "test.txt"); + let content = Content::with_metadata(data, metadata); + + assert!(content.metadata().is_some()); + assert_eq!(content.file_extension(), Some("txt")); + assert_eq!(content.filename(), Some("test.txt")); + } + + #[test] + fn test_content_deref() { + let data = ContentData::from("Hello"); + let content = Content::new(data); + + // Test that Deref works - we can call ContentData methods directly + assert_eq!(content.size(), 5); + assert_eq!(content.as_str().unwrap(), "Hello"); + } + + #[test] + fn test_content_from() { + let data = ContentData::from("Test"); + let content: Content = data.into(); + + assert_eq!(content.size(), 4); + } + + #[test] + fn test_metadata_operations() { + let data = ContentData::from("Test"); + let mut content = Content::new(data); + + assert!(content.metadata().is_none()); + + let source = content.content_source(); + let metadata = ContentMetadata::with_path(source, "file.pdf"); + content.set_metadata(metadata); + + assert!(content.metadata().is_some()); + assert_eq!(content.file_extension(), Some("pdf")); + + content.clear_metadata(); + assert!(content.metadata().is_none()); + } + + #[test] + fn test_into_parts() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test"); + let metadata = ContentMetadata::with_path(source, "test.txt"); + let content = Content::with_metadata(data.clone(), metadata.clone()); + + let (recovered_data, recovered_metadata) = content.into_parts(); + assert_eq!(recovered_data, data); + assert_eq!(recovered_metadata, Some(metadata)); + } + + #[test] + fn test_serialization() { + let data = ContentData::from("Test content"); + let content = Content::new(data); + + let json = serde_json::to_string(&content).unwrap(); + let deserialized: Content = serde_json::from_str(&json).unwrap(); + + assert_eq!(content, deserialized); + } + + #[test] + fn test_content_source() { + let source = ContentSource::new(); + let data = ContentData::from_text(source, "Test"); + let content = Content::new(data); + + assert_eq!(content.content_source(), source); + } +} diff --git a/crates/nvisy-core/src/io/content_data.rs b/crates/nvisy-core/src/io/content_data.rs new file mode 100644 index 0000000..6b6073a --- /dev/null +++ b/crates/nvisy-core/src/io/content_data.rs @@ -0,0 +1,651 @@ +//! Content data structure for storing and managing content with metadata +//! +//! This module provides the [`ContentData`] struct for storing content data +//! along with its metadata and source information. + +use std::fmt; +use std::ops::Deref; +use std::sync::OnceLock; + +use bytes::Bytes; +use hipstr::HipStr; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +use crate::error::{Error, ErrorKind, Result}; +use crate::path::ContentSource; + +/// A wrapper around `Bytes` for content storage. 
+/// +/// This struct wraps `bytes::Bytes` and provides additional methods +/// for text conversion. It's cheap to clone as `Bytes` uses reference +/// counting internally. +#[derive(Debug, Clone, PartialEq, Eq, Default)] +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct ContentBytes(Bytes); + +impl ContentBytes { + /// Creates a new `ContentBytes` from raw bytes. + #[must_use] + pub fn new(bytes: Bytes) -> Self { + Self(bytes) + } + + /// Returns the size of the content in bytes. + #[must_use] + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns `true` if the content is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + /// Returns the content as a byte slice. + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + &self.0 + } + + /// Tries to return the content as a string slice. + /// + /// Returns `None` if the content is not valid UTF-8. + #[must_use] + pub fn as_str(&self) -> Option<&str> { + std::str::from_utf8(&self.0).ok() + } + + /// Converts to a `HipStr` if the content is valid UTF-8. + /// + /// # Errors + /// + /// Returns an error if the content is not valid UTF-8. + pub fn as_hipstr(&self) -> Result<HipStr<'static>> { + let s = std::str::from_utf8(&self.0).map_err(|e| { + Error::new(ErrorKind::Serialization, format!("Invalid UTF-8: {e}")) + })?; + Ok(HipStr::from(s)) + } + + /// Returns the underlying `Bytes`. + #[must_use] + pub fn to_bytes(&self) -> Bytes { + self.0.clone() + } + + /// Consumes and returns the underlying `Bytes`. + #[must_use] + pub fn into_bytes(self) -> Bytes { + self.0 + } + + /// Returns `true` if the content appears to be text. + /// + /// Uses a simple heuristic: checks if all bytes are ASCII printable + /// or whitespace characters. + #[must_use] + pub fn is_likely_text(&self) -> bool { + self.0 + .iter() + .all(|&b| b.is_ascii_graphic() || b.is_ascii_whitespace()) + } +} + +impl Deref for ContentBytes { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl AsRef<[u8]> for ContentBytes { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl From<&str> for ContentBytes { + fn from(s: &str) -> Self { + Self(Bytes::copy_from_slice(s.as_bytes())) + } +} + +impl From<String> for ContentBytes { + fn from(s: String) -> Self { + Self(Bytes::from(s)) + } +} + +impl From<HipStr<'static>> for ContentBytes { + fn from(s: HipStr<'static>) -> Self { + Self(Bytes::copy_from_slice(s.as_bytes())) + } +} + +impl From<&[u8]> for ContentBytes { + fn from(bytes: &[u8]) -> Self { + Self(Bytes::copy_from_slice(bytes)) + } +} + +impl From<Vec<u8>> for ContentBytes { + fn from(vec: Vec<u8>) -> Self { + Self(Bytes::from(vec)) + } +} + +impl From<Bytes> for ContentBytes { + fn from(bytes: Bytes) -> Self { + Self(bytes) + } +} + +/// Content data with metadata and computed hashes. +/// +/// This struct wraps [`ContentBytes`] and stores content data along with +/// metadata about its source and optional computed SHA256 hash. +/// It's designed to be cheap to clone using reference-counted types. +/// The SHA256 hash is lazily computed using `OnceLock` for lock-free +/// access after initialization. +#[derive(Debug, Serialize, Deserialize)] +pub struct ContentData { + /// Unique identifier for the content source. + pub content_source: ContentSource, + /// The actual content data. + data: ContentBytes, + /// Lazily computed SHA256 hash of the content. + #[serde(skip)] + sha256_cache: OnceLock<Bytes>, +} + +impl ContentData { + /// Creates new content data from bytes. 
+ /// + /// # Example + /// + /// ``` + /// use nvisy_core::{io::ContentData, path::ContentSource}; + /// use bytes::Bytes; + /// + /// let source = ContentSource::new(); + /// let data = Bytes::from("Hello, world!"); + /// let content = ContentData::new(source, data); + /// + /// assert_eq!(content.size(), 13); + /// ``` + pub fn new(content_source: ContentSource, data: Bytes) -> Self { + Self { + content_source, + data: ContentBytes::new(data), + sha256_cache: OnceLock::new(), + } + } + + /// Creates new content data from text. + /// + /// # Example + /// + /// ``` + /// use nvisy_core::{io::ContentData, path::ContentSource}; + /// + /// let source = ContentSource::new(); + /// let content = ContentData::from_text(source, "Hello, world!"); + /// + /// assert_eq!(content.as_str().unwrap(), "Hello, world!"); + /// ``` + pub fn from_text(content_source: ContentSource, text: impl Into<String>) -> Self { + Self { + content_source, + data: ContentBytes::from(text.into()), + sha256_cache: OnceLock::new(), + } + } + + /// Creates content data with explicit `ContentBytes`. + pub fn with_content_bytes(content_source: ContentSource, data: ContentBytes) -> Self { + Self { + content_source, + data, + sha256_cache: OnceLock::new(), + } + } + + /// Returns the size of the content in bytes. + #[must_use] + pub fn size(&self) -> usize { + self.data.len() + } + + /// Returns a pretty formatted size string. + #[allow(clippy::cast_precision_loss)] + #[must_use] + pub fn get_pretty_size(&self) -> String { + let bytes = self.size(); + match bytes { + 0..=1023 => format!("{bytes} B"), + 1024..=1_048_575 => format!("{:.1} KB", bytes as f64 / 1024.0), + 1_048_576..=1_073_741_823 => format!("{:.1} MB", bytes as f64 / 1_048_576.0), + _ => format!("{:.1} GB", bytes as f64 / 1_073_741_824.0), + } + } + + /// Returns the content data as a byte slice. + #[must_use] + pub fn as_bytes(&self) -> &[u8] { + self.data.as_bytes() + } + + /// Returns a reference to the underlying `ContentBytes`. + #[must_use] + pub fn content_bytes(&self) -> &ContentBytes { + &self.data + } + + /// Converts the content data to `Bytes`. + #[must_use] + pub fn to_bytes(&self) -> Bytes { + self.data.to_bytes() + } + + /// Consumes and converts into `Bytes`. + #[must_use] + pub fn into_bytes(self) -> Bytes { + self.data.into_bytes() + } + + /// Returns `true` if the content appears to be text. + /// + /// Uses a simple heuristic: checks if all bytes are ASCII printable + /// or whitespace characters. + #[must_use] + pub fn is_likely_text(&self) -> bool { + self.data.is_likely_text() + } + + /// Tries to convert the content data to a UTF-8 string. + /// + /// # Errors + /// + /// Returns an error if the content data contains invalid UTF-8 sequences. + pub fn as_string(&self) -> Result<String> { + self.data.as_hipstr().map(|s| s.to_string()) + } + + /// Tries to convert the content data to a UTF-8 string slice. + /// + /// # Errors + /// + /// Returns an error if the content data contains invalid UTF-8 sequences. + pub fn as_str(&self) -> Result<&str> { + std::str::from_utf8(self.data.as_bytes()).map_err(|e| { + Error::new(ErrorKind::Serialization, format!("Invalid UTF-8: {e}")) + }) + } + + /// Converts to a `HipStr` if the content is valid UTF-8. + /// + /// # Errors + /// + /// Returns an error if the content is not valid UTF-8. + pub fn as_hipstr(&self) -> Result<HipStr<'static>> { + self.data.as_hipstr() + } + + /// Computes SHA256 hash of the content. 
+ fn compute_sha256_internal(&self) -> Bytes { + let mut hasher = Sha256::new(); + hasher.update(self.data.as_bytes()); + Bytes::from(hasher.finalize().to_vec()) + } + + /// Returns the SHA256 hash, computing it if not already done. + #[must_use] + pub fn sha256(&self) -> &Bytes { + self.sha256_cache + .get_or_init(|| self.compute_sha256_internal()) + } + + /// Returns the SHA256 hash as a hex string. + #[must_use] + pub fn sha256_hex(&self) -> String { + hex::encode(self.sha256()) + } + + /// Verifies the content against a provided SHA256 hash. + /// + /// # Errors + /// + /// Returns an error if the computed hash does not match the expected hash. + pub fn verify_sha256(&self, expected_hash: impl AsRef<[u8]>) -> Result<()> { + let actual_hash = self.sha256(); + let expected = expected_hash.as_ref(); + + if actual_hash.as_ref() == expected { + Ok(()) + } else { + Err(Error::new(ErrorKind::InvalidInput, format!( + "Hash mismatch: expected {}, got {}", + hex::encode(expected), + hex::encode(actual_hash) + ))) + } + } + + /// Returns a slice of the content data. + /// + /// # Errors + /// + /// Returns an error if the end index is beyond the content length + /// or if start is greater than end. + pub fn slice(&self, start: usize, end: usize) -> Result<Bytes> { + let bytes = self.data.as_bytes(); + if end > bytes.len() { + return Err(Error::new(ErrorKind::InvalidInput, format!( + "Slice end {} exceeds content length {}", + end, + bytes.len() + ))); + } + if start > end { + return Err(Error::new(ErrorKind::InvalidInput, + format!("Slice start {start} is greater than end {end}"))); + } + Ok(Bytes::copy_from_slice(&bytes[start..end])) + } + + /// Returns `true` if the content is empty. + #[must_use] + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } +} + +impl Clone for ContentData { + fn clone(&self) -> Self { + let new_lock = OnceLock::new(); + // Copy the computed hash if available + if let Some(hash) = self.sha256_cache.get() { + let _ = new_lock.set(hash.clone()); + } + + Self { + content_source: self.content_source, + data: self.data.clone(), + sha256_cache: new_lock, + } + } +} + +impl PartialEq for ContentData { + fn eq(&self, other: &Self) -> bool { + self.content_source == other.content_source && self.data == other.data + } +} + +impl Eq for ContentData {} + +impl From<&str> for ContentData { + fn from(s: &str) -> Self { + let source = ContentSource::new(); + Self::from_text(source, s) + } +} + +impl From<String> for ContentData { + fn from(s: String) -> Self { + let source = ContentSource::new(); + Self::from_text(source, s) + } +} + +impl From<&[u8]> for ContentData { + fn from(bytes: &[u8]) -> Self { + let source = ContentSource::new(); + Self::new(source, Bytes::copy_from_slice(bytes)) + } +} + +impl From<Vec<u8>> for ContentData { + fn from(vec: Vec<u8>) -> Self { + let source = ContentSource::new(); + Self::new(source, Bytes::from(vec)) + } +} + +impl From<Bytes> for ContentData { + fn from(bytes: Bytes) -> Self { + let source = ContentSource::new(); + Self::new(source, bytes) + } +} + +impl From<HipStr<'static>> for ContentData { + fn from(text: HipStr<'static>) -> Self { + let source = ContentSource::new(); + Self::from_text(source, text.to_string()) + } +} + +impl fmt::Display for ContentData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Ok(text) = self.as_str() { + write!(f, "{text}") + } else { + write!(f, "[Binary data: {} bytes]", self.size()) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn 
test_content_data_creation() { + let source = ContentSource::new(); + let data = Bytes::from("Hello, world!"); + let content = ContentData::new(source, data); + + assert_eq!(content.content_source, source); + assert_eq!(content.size(), 13); + assert!(content.sha256_cache.get().is_none()); + } + + #[test] + fn test_content_data_from_text() { + let source = ContentSource::new(); + let content = ContentData::from_text(source, "Hello, world!"); + + assert_eq!(content.as_str().unwrap(), "Hello, world!"); + } + + #[test] + fn test_content_bytes_wrapper() { + let bytes = ContentBytes::from("Hello"); + assert_eq!(bytes.as_str(), Some("Hello")); + assert_eq!(bytes.len(), 5); + assert!(!bytes.is_empty()); + } + + #[test] + fn test_content_bytes_as_hipstr() { + let bytes = ContentBytes::from("Hello, HipStr!"); + let hipstr = bytes.as_hipstr().unwrap(); + assert_eq!(hipstr.as_str(), "Hello, HipStr!"); + + // Test with invalid UTF-8 + let invalid = ContentBytes::from(vec![0xFF, 0xFE]); + assert!(invalid.as_hipstr().is_err()); + } + + #[test] + fn test_content_bytes_binary() { + let binary = ContentBytes::from(vec![0xFF, 0xFE]); + assert_eq!(binary.len(), 2); + assert!(binary.as_str().is_none()); + assert!(!binary.is_likely_text()); + } + + #[test] + fn test_size_methods() { + let content = ContentData::from("Hello"); + assert_eq!(content.size(), 5); + + let pretty_size = content.get_pretty_size(); + assert!(!pretty_size.is_empty()); + } + + #[test] + fn test_sha256_computation() { + let content = ContentData::from("Hello, world!"); + let hash = content.sha256(); + + assert!(content.sha256_cache.get().is_some()); + assert_eq!(hash.len(), 32); + + let hash2 = content.sha256(); + assert_eq!(hash, hash2); + } + + #[test] + fn test_sha256_verification() { + let content = ContentData::from("Hello, world!"); + let hash = content.sha256().clone(); + + assert!(content.verify_sha256(&hash).is_ok()); + + let wrong_hash = vec![0u8; 32]; + assert!(content.verify_sha256(&wrong_hash).is_err()); + } + + #[test] + fn test_string_conversion() { + let content = ContentData::from("Hello, world!"); + assert_eq!(content.as_string().unwrap(), "Hello, world!"); + assert_eq!(content.as_str().unwrap(), "Hello, world!"); + + let binary_content = ContentData::from(vec![0xFF, 0xFE, 0xFD]); + assert!(binary_content.as_string().is_err()); + assert!(binary_content.as_str().is_err()); + } + + #[test] + fn test_as_hipstr() { + let content = ContentData::from("Hello, HipStr!"); + let hipstr = content.as_hipstr().unwrap(); + assert_eq!(hipstr.as_str(), "Hello, HipStr!"); + + let binary_content = ContentData::from(vec![0xFF, 0xFE]); + assert!(binary_content.as_hipstr().is_err()); + } + + #[test] + fn test_is_likely_text() { + let text_content = ContentData::from("Hello, world!"); + assert!(text_content.is_likely_text()); + + let binary_content = ContentData::from(vec![0xFF, 0xFE, 0xFD]); + assert!(!binary_content.is_likely_text()); + } + + #[test] + fn test_slice() { + let content = ContentData::from("Hello, world!"); + + let slice = content.slice(0, 5).unwrap(); + assert_eq!(slice, Bytes::from("Hello")); + + let slice = content.slice(7, 12).unwrap(); + assert_eq!(slice, Bytes::from("world")); + + assert!(content.slice(0, 100).is_err()); + assert!(content.slice(10, 5).is_err()); + } + + #[test] + fn test_from_conversions() { + let from_str = ContentData::from("test"); + let from_string = ContentData::from("test".to_string()); + let from_bytes = ContentData::from(b"test".as_slice()); + let from_vec = ContentData::from(b"test".to_vec()); + 
let from_bytes_type = ContentData::from(Bytes::from("test")); + + assert_eq!(from_str.as_str().unwrap(), "test"); + assert_eq!(from_string.as_str().unwrap(), "test"); + assert_eq!(from_bytes.as_str().unwrap(), "test"); + assert_eq!(from_vec.as_str().unwrap(), "test"); + assert_eq!(from_bytes_type.as_str().unwrap(), "test"); + } + + #[test] + fn test_display() { + let text_content = ContentData::from("Hello"); + assert_eq!(format!("{text_content}"), "Hello"); + + let binary_content = ContentData::from(vec![0xFF, 0xFE]); + assert!(format!("{binary_content}").contains("Binary data")); + } + + #[test] + fn test_cloning_preserves_hash() { + let original = ContentData::from("Hello, world!"); + let _ = original.sha256(); + + let cloned = original.clone(); + + assert!(original.sha256_cache.get().is_some()); + assert!(cloned.sha256_cache.get().is_some()); + assert_eq!(original.sha256(), cloned.sha256()); + } + + #[test] + fn test_cloning_is_cheap() { + let original = ContentData::from("Hello, world!"); + let cloned = original.clone(); + + assert_eq!(original, cloned); + } + + #[test] + fn test_into_bytes() { + let content = ContentData::from("Hello, world!"); + let bytes = content.into_bytes(); + assert_eq!(bytes, Bytes::from("Hello, world!")); + } + + #[test] + fn test_empty_content() { + let content = ContentData::from(""); + assert!(content.is_empty()); + assert_eq!(content.size(), 0); + } + + #[test] + fn test_to_bytes() { + let text_content = ContentData::from_text(ContentSource::new(), "Hello"); + let bytes = text_content.to_bytes(); + assert_eq!(bytes.as_ref(), b"Hello"); + + let binary_content = ContentData::new(ContentSource::new(), Bytes::from("World")); + let bytes = binary_content.to_bytes(); + assert_eq!(bytes.as_ref(), b"World"); + } + + #[test] + fn test_from_hipstr() { + let hipstr = HipStr::from("Hello from HipStr"); + let content = ContentData::from(hipstr); + assert_eq!(content.as_str().unwrap(), "Hello from HipStr"); + } + + #[test] + fn test_content_bytes_deref() { + let bytes = ContentBytes::from("Hello"); + assert_eq!(&*bytes, b"Hello"); + assert_eq!(bytes.as_ref(), b"Hello"); + } +} diff --git a/crates/nvisy-core/src/io/content_read.rs b/crates/nvisy-core/src/io/content_read.rs new file mode 100644 index 0000000..f889aea --- /dev/null +++ b/crates/nvisy-core/src/io/content_read.rs @@ -0,0 +1,372 @@ +//! Content reading trait for async I/O operations +//! +//! This module provides the [`AsyncContentRead`] trait for reading content data +//! from various async sources into [`ContentData`] structures. + +use std::future::Future; +use std::io; + +use bytes::Bytes; +use tokio::io::{AsyncRead, AsyncReadExt}; + +use super::ContentData; +use crate::path::ContentSource; + +/// Trait for reading content from async sources +/// +/// This trait provides methods for reading content data from async sources +/// and converting them into [`ContentData`] structures with various options +/// for size limits, and verification. +pub trait AsyncContentRead: AsyncRead + Unpin + Send { + /// Read all content from the source into a `ContentData` structure + /// + /// # Errors + /// + /// Returns an error if the read operation fails or if there are I/O issues. 
+ /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentRead, ContentData}; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn read_file() -> io::Result<ContentData> { + /// let mut file = File::open("example.txt").await?; + /// file.read_content().await + /// } + /// ``` + fn read_content(&mut self) -> impl Future<Output = io::Result<ContentData>> + Send + where + Self: Sized, + { + async move { + let mut buffer = Vec::new(); + self.read_to_end(&mut buffer).await?; + + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); + Ok(content_data) + } + } + + /// Read content with a specified content source + /// + /// # Errors + /// + /// Returns an error if the read operation fails or if there are I/O issues. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::{io::{AsyncContentRead, ContentData}, path::ContentSource}; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn read_with_source() -> io::Result<ContentData> { + /// let mut file = File::open("example.txt").await?; + /// let source = ContentSource::new(); + /// file.read_content_with_source(source).await + /// } + /// ``` + fn read_content_with_source( + &mut self, + source: ContentSource, + ) -> impl Future<Output = io::Result<ContentData>> + Send + where + Self: Sized, + { + async move { + let mut buffer = Vec::new(); + self.read_to_end(&mut buffer).await?; + + let content_data = ContentData::new(source, Bytes::from(buffer)); + Ok(content_data) + } + } + + /// Read content up to a maximum size limit + /// + /// This method prevents reading extremely large files that could cause + /// memory issues. + /// + /// # Errors + /// + /// Returns an error if the read operation fails, if there are I/O issues, + /// or if the content exceeds the maximum size limit. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentRead, ContentData}; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn read_limited_content() -> io::Result<ContentData> { + /// let mut file = File::open("example.txt").await?; + /// // Limit to 1MB + /// file.read_content_limited(1024 * 1024).await + /// } + /// ``` + fn read_content_limited( + &mut self, + max_size: usize, + ) -> impl Future<Output = io::Result<ContentData>> + Send + where + Self: Sized, + { + async move { + let mut buffer = Vec::with_capacity(std::cmp::min(max_size, 8192)); + let mut total_read = 0; + + loop { + let mut temp_buf = vec![0u8; 8192]; + let bytes_read = self.read(&mut temp_buf).await?; + + if bytes_read == 0 { + break; // EOF reached + } + + if total_read + bytes_read > max_size { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Content size exceeds maximum limit of {max_size} bytes"), + )); + } + + buffer.extend_from_slice(&temp_buf[..bytes_read]); + total_read += bytes_read; + } + + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); + Ok(content_data) + } + } + + /// Read content in chunks, calling a callback for each chunk + /// + /// This is useful for processing large files without loading them + /// entirely into memory. + /// + /// # Errors + /// + /// Returns an error if the read operation fails or if the callback + /// returns an error. 
+ /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::AsyncContentRead; + /// use tokio::fs::File; + /// use bytes::Bytes; + /// use std::io; + /// + /// async fn process_chunks() -> io::Result<()> { + /// let mut file = File::open("large_file.txt").await?; + /// + /// file.read_content_chunked(8192, |chunk| { + /// println!("Processing chunk of {} bytes", chunk.len()); + /// Ok(()) + /// }).await + /// } + /// ``` + fn read_content_chunked<E>( + &mut self, + chunk_size: usize, + mut callback: impl FnMut(Bytes) -> std::result::Result<(), E> + Send, + ) -> impl Future<Output = std::result::Result<(), E>> + Send + where + Self: Sized, + E: From<io::Error> + Send, + { + async move { + let mut buffer = vec![0u8; chunk_size]; + + loop { + let bytes_read = self.read(&mut buffer).await?; + if bytes_read == 0 { + break; // EOF reached + } + + let chunk = Bytes::copy_from_slice(&buffer[..bytes_read]); + callback(chunk)?; + } + + Ok(()) + } + } + + /// Read content with verification + /// + /// This method reads the content and optionally verifies it meets + /// certain criteria. + /// + /// # Errors + /// + /// Returns an error if the read operation fails, if there are I/O issues, + /// or if verification fails. + fn read_content_verified<F>( + &mut self, + verify_fn: F, + ) -> impl Future<Output = io::Result<ContentData>> + Send + where + Self: Sized, + F: FnOnce(&[u8]) -> bool + Send, + { + async move { + let mut buffer = Vec::new(); + self.read_to_end(&mut buffer).await?; + + // Verify with a reference to the buffer data + if !verify_fn(&buffer) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Content verification failed", + )); + } + + // Convert to ContentData after verification + let content_data = ContentData::new(ContentSource::new(), Bytes::from(buffer)); + Ok(content_data) + } + } +} + +// Implementations for common types +impl AsyncContentRead for tokio::fs::File {} +impl<T: AsyncRead + Unpin + Send> AsyncContentRead for Box<T> {} + +// Test-specific implementations +#[cfg(test)] +impl<T: AsRef<[u8]> + Unpin + Send> AsyncContentRead for std::io::Cursor<T> {} + +#[cfg(test)] +mod tests { + use std::io::{Cursor, Result}; + + use super::*; + + #[tokio::test] + async fn test_read_content() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + + let content = cursor.read_content().await.unwrap(); + assert_eq!(content.as_bytes(), data); + assert_eq!(content.size(), data.len()); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_with_source() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + let source = ContentSource::new(); + + let content = cursor.read_content_with_source(source).await.unwrap(); + assert_eq!(content.content_source, source); + assert_eq!(content.as_bytes(), data); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_limited() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + + // Should succeed within limit + let content = cursor.read_content_limited(20).await?; + assert_eq!(content.as_bytes(), data); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_limited_exceeds() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + + // Should fail when exceeding limit + let result = cursor.read_content_limited(5).await; + assert!(result.is_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_chunked() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = 
Cursor::new(data); + + let mut chunks = Vec::new(); + let result = cursor + .read_content_chunked(5, |chunk| { + chunks.push(chunk); + Ok::<(), io::Error>(()) + }) + .await; + + assert!(result.is_ok()); + assert!(!chunks.is_empty()); + + // Concatenate chunks and verify they match original data + let concatenated: Vec<u8> = chunks + .into_iter() + .flat_map(|chunk| chunk.to_vec()) + .collect(); + assert_eq!(concatenated, data); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_verified() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + + // Should succeed with passing verification + let content = cursor + .read_content_verified(|data| !data.is_empty()) + .await?; + assert_eq!(content.as_bytes(), data); + + Ok(()) + } + + #[tokio::test] + async fn test_read_content_verified_fails() -> Result<()> { + let data = b"Hello, world!"; + let mut cursor = Cursor::new(data); + + // Should fail with failing verification + let result = cursor.read_content_verified(<[u8]>::is_empty).await; + assert!(result.is_err()); + + Ok(()) + } + + #[tokio::test] + async fn test_read_empty_content() -> Result<()> { + let data = b""; + let mut cursor = Cursor::new(data); + + let content = cursor.read_content().await?; + assert_eq!(content.size(), 0); + assert!(content.is_empty()); + + Ok(()) + } + + #[tokio::test] + async fn test_read_large_content() -> Result<()> { + let data = vec![42u8; 10000]; + let mut cursor = Cursor::new(data.clone()); + + let content = cursor.read_content().await?; + assert_eq!(content.as_bytes(), data.as_slice()); + assert_eq!(content.size(), 10000); + + Ok(()) + } +} diff --git a/crates/nvisy-core/src/io/content_write.rs b/crates/nvisy-core/src/io/content_write.rs new file mode 100644 index 0000000..99e749e --- /dev/null +++ b/crates/nvisy-core/src/io/content_write.rs @@ -0,0 +1,372 @@ +//! Content writing trait for async I/O operations +//! +//! This module provides the [`AsyncContentWrite`] trait for writing content data +//! to various async destinations from [`ContentData`] structures. + +use std::future::Future; +use std::io; + +use tokio::io::{AsyncWrite, AsyncWriteExt}; + +use super::ContentData; +use crate::fs::ContentMetadata; + +/// Trait for writing content to async destinations +/// +/// This trait provides methods for writing content data to async destinations +/// with various options for chunking, and verification. +pub trait AsyncContentWrite: AsyncWrite + Unpin + Send { + /// Write content data to the destination + /// + /// # Errors + /// + /// Returns an error if the write operation fails or if there are I/O issues. 
+ /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_core::fs::ContentMetadata; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn write_file() -> io::Result<ContentMetadata> { + /// let mut file = File::create("output.txt").await?; + /// let content = ContentData::from("Hello, world!"); + /// file.write_content(content).await + /// } + /// ``` + fn write_content( + &mut self, + content_data: ContentData, + ) -> impl Future<Output = io::Result<ContentMetadata>> + Send + where + Self: Sized, + { + async move { + self.write_all(content_data.as_bytes()).await?; + self.flush().await?; + + let metadata = ContentMetadata::new(content_data.content_source); + Ok(metadata) + } + } + + /// Write content data and return metadata with specified source path + /// + /// # Errors + /// + /// Returns an error if the write operation fails or if there are I/O issues. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_core::fs::ContentMetadata; + /// use tokio::fs::File; + /// use std::path::PathBuf; + /// use std::io; + /// + /// async fn write_with_path() -> io::Result<ContentMetadata> { + /// let mut file = File::create("output.txt").await?; + /// let content = ContentData::from("Hello, world!"); + /// let path = PathBuf::from("output.txt"); + /// file.write_content_with_path(content, path).await + /// } + /// ``` + fn write_content_with_path( + &mut self, + content_data: ContentData, + path: impl Into<std::path::PathBuf> + Send, + ) -> impl Future<Output = io::Result<ContentMetadata>> + Send + where + Self: Sized, + { + async move { + self.write_all(content_data.as_bytes()).await?; + self.flush().await?; + + let metadata = ContentMetadata::with_path(content_data.content_source, path); + Ok(metadata) + } + } + + /// Write content data in chunks for better memory efficiency + /// + /// This method is useful for writing large content without keeping it + /// all in memory at once. + /// + /// # Errors + /// + /// Returns an error if the write operation fails or if there are I/O issues. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_core::fs::ContentMetadata; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn write_chunked() -> io::Result<ContentMetadata> { + /// let mut file = File::create("output.txt").await?; + /// let content = ContentData::from(vec![0u8; 1_000_000]); // 1MB + /// file.write_content_chunked(content, 8192).await + /// } + /// ``` + fn write_content_chunked( + &mut self, + content_data: ContentData, + chunk_size: usize, + ) -> impl Future<Output = io::Result<ContentMetadata>> + Send + where + Self: Sized, + { + async move { + let data = content_data.as_bytes(); + + for chunk in data.chunks(chunk_size) { + self.write_all(chunk).await?; + } + + self.flush().await?; + + let metadata = ContentMetadata::new(content_data.content_source); + Ok(metadata) + } + } + + /// Write multiple content data items sequentially + /// + /// # Errors + /// + /// Returns an error if any write operation fails or if there are I/O issues. 
+ /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_core::fs::ContentMetadata; + /// use tokio::fs::File; + /// use std::io; + /// + /// async fn write_multiple() -> io::Result<Vec<ContentMetadata>> { + /// let mut file = File::create("output.txt").await?; + /// let contents = vec![ + /// ContentData::from("Hello, "), + /// ContentData::from("world!"), + /// ]; + /// file.write_multiple_content(contents).await + /// } + /// ``` + fn write_multiple_content( + &mut self, + content_data_list: Vec<ContentData>, + ) -> impl Future<Output = io::Result<Vec<ContentMetadata>>> + Send + where + Self: Sized, + { + async move { + let mut metadata_list = Vec::with_capacity(content_data_list.len()); + + for content_data in content_data_list { + self.write_all(content_data.as_bytes()).await?; + let metadata = ContentMetadata::new(content_data.content_source); + metadata_list.push(metadata); + } + + self.flush().await?; + Ok(metadata_list) + } + } + + /// Append content data to the destination without truncating + /// + /// This method assumes the destination supports append operations. + /// + /// # Errors + /// + /// Returns an error if the write operation fails or if there are I/O issues. + /// + /// # Example + /// + /// ```no_run + /// use nvisy_core::io::{AsyncContentWrite, ContentData}; + /// use nvisy_core::fs::ContentMetadata; + /// use tokio::fs::OpenOptions; + /// use std::io; + /// + /// async fn append_content() -> io::Result<ContentMetadata> { + /// let mut file = OpenOptions::new() + /// .create(true) + /// .append(true) + /// .open("log.txt") + /// .await?; + /// + /// let content = ContentData::from("New log entry\n"); + /// file.append_content(content).await + /// } + /// ``` + fn append_content( + &mut self, + content_data: ContentData, + ) -> impl Future<Output = io::Result<ContentMetadata>> + Send + where + Self: Sized, + { + async move { + self.write_all(content_data.as_bytes()).await?; + self.flush().await?; + + let metadata = ContentMetadata::new(content_data.content_source); + Ok(metadata) + } + } + + /// Write content data with verification + /// + /// This method writes the content and then optionally verifies it was + /// written correctly by checking the expected size. + /// + /// # Errors + /// + /// Returns an error if the write operation fails, if there are I/O issues, + /// or if verification fails. 
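+    ///
+    /// # Example
+    ///
+    /// A minimal sketch following the same pattern as the other write helpers
+    /// in this trait:
+    ///
+    /// ```no_run
+    /// use nvisy_core::io::{AsyncContentWrite, ContentData};
+    /// use nvisy_core::fs::ContentMetadata;
+    /// use tokio::fs::File;
+    /// use std::io;
+    ///
+    /// async fn write_verified() -> io::Result<ContentMetadata> {
+    ///     let mut file = File::create("output.txt").await?;
+    ///     let content = ContentData::from("Hello, world!");
+    ///     file.write_content_verified(content, true).await
+    /// }
+    /// ```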
+ fn write_content_verified( + &mut self, + content_data: ContentData, + verify_size: bool, + ) -> impl Future<Output = io::Result<ContentMetadata>> + Send + where + Self: Sized, + { + async move { + let expected_size = content_data.size(); + let data = content_data.as_bytes(); + + let bytes_written = self.write(data).await?; + self.flush().await?; + + if verify_size && bytes_written != expected_size { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!( + "Expected to write {expected_size} bytes, but only wrote {bytes_written} bytes" + ), + )); + } + + let metadata = ContentMetadata::new(content_data.content_source); + Ok(metadata) + } + } +} + +// Implementations for common types +impl AsyncContentWrite for tokio::fs::File {} +impl AsyncContentWrite for Vec<u8> {} +impl<T: AsyncWrite + Unpin + Send> AsyncContentWrite for Box<T> {} + +#[cfg(test)] +mod tests { + use std::io::Result; + + use super::*; + + #[tokio::test] + async fn test_write_content() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let content = ContentData::from("Hello, world!"); + + let metadata = writer.write_content(content).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + + Ok(()) + } + + #[tokio::test] + async fn test_write_content_with_path() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let content = ContentData::from("Hello, world!"); + + let metadata = writer.write_content_with_path(content, "test.txt").await?; + assert!(metadata.has_path()); + assert_eq!(metadata.filename(), Some("test.txt")); + + Ok(()) + } + + #[tokio::test] + async fn test_write_content_chunked() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let data = vec![42u8; 1000]; + let content = ContentData::from(data.clone()); + + let metadata = writer.write_content_chunked(content, 100).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + assert_eq!(writer.as_slice(), data.as_slice()); + + Ok(()) + } + + #[tokio::test] + async fn test_write_multiple_content() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let contents = vec![ContentData::from("Hello, "), ContentData::from("world!")]; + + let metadata_list = writer.write_multiple_content(contents).await?; + assert_eq!(metadata_list.len(), 2); + assert_eq!(writer.as_slice(), b"Hello, world!"); + + Ok(()) + } + + #[tokio::test] + async fn test_append_content() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let content = ContentData::from("Hello, world!"); + + let metadata = writer.append_content(content).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + assert_eq!(writer.as_slice(), b"Hello, world!"); + + Ok(()) + } + + #[tokio::test] + async fn test_write_content_verified() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let content = ContentData::from("Hello, world!"); + + let metadata = writer.write_content_verified(content, true).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + assert_eq!(writer.as_slice(), b"Hello, world!"); + + Ok(()) + } + + #[tokio::test] + async fn test_write_empty_content() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let content = ContentData::from(""); + + let metadata = writer.write_content(content).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + assert_eq!(writer.as_slice(), b""); + + Ok(()) + } + + #[tokio::test] + async fn test_write_large_content() -> Result<()> { + let mut writer = Vec::<u8>::new(); + let data = vec![123u8; 10000]; + let content = ContentData::from(data.clone()); + + let metadata = 
writer.write_content(content).await?; + assert!(!metadata.content_source.as_uuid().is_nil()); + assert_eq!(writer.as_slice(), data.as_slice()); + + Ok(()) + } +} diff --git a/crates/nvisy-core/src/io/data_reference.rs b/crates/nvisy-core/src/io/data_reference.rs new file mode 100644 index 0000000..7397498 --- /dev/null +++ b/crates/nvisy-core/src/io/data_reference.rs @@ -0,0 +1,141 @@ +//! Data reference definitions +//! +//! This module provides the `DataReference` struct for referencing and +//! tracking content within the Nvisy system. + +use serde::{Deserialize, Serialize}; + +use crate::io::Content; +use crate::path::ContentSource; + +/// Reference to data with source tracking and content information +/// +/// A `DataReference` provides a lightweight way to reference data content +/// while maintaining information about its source location and optional +/// mapping within that source. +/// +/// # Examples +/// +/// ```rust +/// use nvisy_core::io::{DataReference, Content, ContentData}; +/// +/// let content = Content::new(ContentData::from("Hello, world!")); +/// let data_ref = DataReference::new(content) +/// .with_mapping_id("line-42"); +/// +/// assert!(data_ref.mapping_id().is_some()); +/// assert_eq!(data_ref.mapping_id().unwrap(), "line-42"); +/// ``` +#[derive(Debug, Clone)] +#[derive(Serialize, Deserialize)] +pub struct DataReference { + /// Unique identifier for the source containing this data + /// Using `UUIDv7` for time-ordered, globally unique identification + source: ContentSource, + + /// Optional identifier that defines the position/location of the data within the source + /// Examples: line numbers, byte offsets, element IDs, `XPath` expressions + mapping_id: Option<String>, + + /// The actual content data + content: Content, +} + +impl DataReference { + /// Create a new data reference with auto-generated source ID (`UUIDv7`) + pub fn new(content: Content) -> Self { + Self { + source: ContentSource::new(), + mapping_id: None, + content, + } + } + + /// Create a new data reference with specific source + pub fn with_source(source: ContentSource, content: Content) -> Self { + Self { + source, + mapping_id: None, + content, + } + } + + /// Set the mapping ID for this data reference + #[must_use] + pub fn with_mapping_id<S: Into<String>>(mut self, mapping_id: S) -> Self { + self.mapping_id = Some(mapping_id.into()); + self + } + + /// Get the content source + pub fn source(&self) -> ContentSource { + self.source + } + + /// Get the mapping ID, if any + pub fn mapping_id(&self) -> Option<&str> { + self.mapping_id.as_deref() + } + + /// Get a reference to the content + pub fn content(&self) -> &Content { + &self.content + } + + /// Check if the content is text-based + pub fn is_likely_text(&self) -> bool { + self.content.is_likely_text() + } + + /// Get the size of the content in bytes + pub fn size(&self) -> usize { + self.content.size() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::io::ContentData; + + #[test] + fn test_data_reference_creation() { + let content = Content::new(ContentData::from("Hello, world!")); + let data_ref = DataReference::new(content); + + assert!(data_ref.is_likely_text()); + assert!(data_ref.mapping_id().is_none()); + assert_eq!(data_ref.size(), 13); + // Verify UUIDv7 is used + assert_eq!(data_ref.source().as_uuid().get_version_num(), 7); + } + + #[test] + fn test_data_reference_with_mapping() { + let content = Content::new(ContentData::from("Test content")); + let data_ref = 
DataReference::new(content).with_mapping_id("line-42"); + + assert_eq!(data_ref.mapping_id(), Some("line-42")); + } + + #[test] + fn test_data_reference_with_source() { + let source = ContentSource::new(); + let content = Content::new(ContentData::from("Test content")); + let data_ref = DataReference::with_source(source, content); + + assert_eq!(data_ref.source(), source); + } + + #[test] + fn test_serialization() { + let content = Content::new(ContentData::from("Test content")); + let data_ref = DataReference::new(content).with_mapping_id("test-mapping"); + + let json = serde_json::to_string(&data_ref).unwrap(); + let deserialized: DataReference = serde_json::from_str(&json).unwrap(); + + assert_eq!(data_ref.source(), deserialized.source()); + assert_eq!(data_ref.mapping_id(), deserialized.mapping_id()); + } +} diff --git a/crates/nvisy-core/src/io/mod.rs b/crates/nvisy-core/src/io/mod.rs new file mode 100644 index 0000000..aa33482 --- /dev/null +++ b/crates/nvisy-core/src/io/mod.rs @@ -0,0 +1,26 @@ +//! I/O module for content handling and processing +//! +//! This module provides the core I/O abstractions for handling content data, +//! including content data structures and async read/write traits. +//! +//! # Core Types +//! +//! - [`ContentData`]: Container for content data with metadata, hashing, and size utilities +//! +//! # Traits +//! +//! - [`AsyncContentRead`]: Async trait for reading content from various sources +//! - [`AsyncContentWrite`]: Async trait for writing content to various destinations + +mod content; +mod content_data; +mod content_read; +mod content_write; +mod data_reference; + +// Re-export core types and traits +pub use content::Content; +pub use content_data::{ContentBytes, ContentData}; +pub use content_read::AsyncContentRead; +pub use content_write::AsyncContentWrite; +pub use data_reference::DataReference; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index 3961ed4..b94c8b9 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -4,8 +4,9 @@ pub mod datatypes; pub mod error; -pub mod ontology; -pub mod redaction; +pub mod fs; +pub mod io; +pub mod path; pub mod registry; #[doc(hidden)] diff --git a/crates/nvisy-core/src/path/mod.rs b/crates/nvisy-core/src/path/mod.rs new file mode 100644 index 0000000..08cb0c4 --- /dev/null +++ b/crates/nvisy-core/src/path/mod.rs @@ -0,0 +1,9 @@ +//! Path module for content source identification +//! +//! This module provides functionality for uniquely identifying content sources +//! throughout the nvisy system using UUIDv7-based identifiers. + +mod source; + +// Re-export core types +pub use source::ContentSource; diff --git a/crates/nvisy-core/src/path/source.rs b/crates/nvisy-core/src/path/source.rs new file mode 100644 index 0000000..49b2811 --- /dev/null +++ b/crates/nvisy-core/src/path/source.rs @@ -0,0 +1,287 @@ +//! Content source identification module +//! +//! This module provides the [`ContentSource`] struct for uniquely identifying +//! data sources throughout the nvisy system using `UUIDv7`. + +use std::fmt; + +use jiff::Zoned; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// Unique identifier for content sources in the system +/// +/// Uses `UUIDv7` for time-ordered, globally unique identification of data sources. +/// +/// This allows for efficient tracking and correlation of content throughout +/// the processing pipeline. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(Serialize, Deserialize)]
+pub struct ContentSource {
+    /// `UUIDv7` identifier
+    id: Uuid,
+}
+
+impl ContentSource {
+    /// Create a new content source with a fresh `UUIDv7`
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use nvisy_core::path::ContentSource;
+    ///
+    /// let source = ContentSource::new();
+    /// assert!(!source.as_uuid().is_nil());
+    /// ```
+    #[must_use]
+    pub fn new() -> Self {
+        let now = Zoned::now();
+        let timestamp = uuid::Timestamp::from_unix(
+            uuid::NoContext,
+            now.timestamp().as_second().unsigned_abs(),
+            now.timestamp().subsec_nanosecond().unsigned_abs(),
+        );
+
+        Self {
+            id: Uuid::new_v7(timestamp),
+        }
+    }
+
+    /// Create a content source from an existing UUID
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use nvisy_core::path::ContentSource;
+    /// use uuid::Uuid;
+    ///
+    /// let source = ContentSource::new();
+    /// let uuid = source.as_uuid();
+    /// let source2 = ContentSource::from_uuid(uuid);
+    /// assert_eq!(source2.as_uuid(), uuid);
+    /// ```
+    #[must_use]
+    pub fn from_uuid(id: Uuid) -> Self {
+        Self { id }
+    }
+
+    /// Get the underlying UUID
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use nvisy_core::path::ContentSource;
+    ///
+    /// let source = ContentSource::new();
+    /// let uuid = source.as_uuid();
+    /// assert_eq!(uuid.get_version_num(), 7);
+    /// ```
+    #[must_use]
+    pub fn as_uuid(&self) -> Uuid {
+        self.id
+    }
+
+    /// Parse a content source from a string
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the string is not a valid UUID format.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use nvisy_core::path::ContentSource;
+    ///
+    /// let source = ContentSource::new();
+    /// let id_str = source.to_string();
+    /// let parsed = ContentSource::parse(&id_str).unwrap();
+    /// assert_eq!(source, parsed);
+    /// ```
+    pub fn parse(s: &str) -> Result<Self, uuid::Error> {
+        let id = Uuid::parse_str(s)?;
+        Ok(Self { id })
+    }
+
+    /// Get the timestamp component from the `UUIDv7`
+    ///
+    /// Returns the Unix timestamp in milliseconds when this UUID was generated,
+    /// or None if this is not a `UUIDv7`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use nvisy_core::path::ContentSource;
+    /// use std::time::{SystemTime, UNIX_EPOCH};
+    ///
+    /// let source = ContentSource::new();
+    /// let timestamp = source.timestamp().expect("UUIDv7 should have timestamp");
+    /// let now = SystemTime::now()
+    ///     .duration_since(UNIX_EPOCH)
+    ///     .unwrap()
+    ///     .as_millis() as u64;
+    ///
+    /// // Should be very close to current time (within a few seconds)
+    /// assert!((timestamp as i64 - now as i64).abs() < 5000);
+    /// ```
+    #[must_use]
+    pub fn timestamp(&self) -> Option<u64> {
+        self.id.get_timestamp().map(|timestamp| {
+            let (seconds, nanos) = timestamp.to_unix();
+            seconds * 1000 + u64::from(nanos) / 1_000_000
+        })
+    }
+
+    /// Check if this content source was created before another
+    ///
+    /// Returns false if either UUID is not a `UUIDv7` and thus has no timestamp.
+ /// + /// # Example + /// + /// ``` + /// use nvisy_core::path::ContentSource; + /// use std::thread; + /// use std::time::Duration; + /// + /// let source1 = ContentSource::new(); + /// thread::sleep(Duration::from_millis(1)); + /// let source2 = ContentSource::new(); + /// + /// assert!(source1.created_before(&source2)); + /// assert!(!source2.created_before(&source1)); + /// ``` + #[must_use] + pub fn created_before(&self, other: &ContentSource) -> bool { + match (self.timestamp(), other.timestamp()) { + (Some(self_ts), Some(other_ts)) => self_ts < other_ts, + _ => false, + } + } + + /// Check if this content source was created after another + /// + /// Returns false if either UUID is not a `UUIDv7` and thus has no timestamp. + #[must_use] + pub fn created_after(&self, other: &ContentSource) -> bool { + match (self.timestamp(), other.timestamp()) { + (Some(self_ts), Some(other_ts)) => self_ts > other_ts, + _ => false, + } + } +} + +impl Default for ContentSource { + fn default() -> Self { + Self::new() + } +} + +impl fmt::Display for ContentSource { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.id) + } +} + +impl From<Uuid> for ContentSource { + fn from(id: Uuid) -> Self { + Self::from_uuid(id) + } +} + +impl From<ContentSource> for Uuid { + fn from(source: ContentSource) -> Self { + source.id + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::thread; + use std::time::Duration; + + use super::*; + + #[test] + fn test_new_content_source() { + let source = ContentSource::new(); + assert_eq!(source.as_uuid().get_version_num(), 7); + assert!(!source.as_uuid().is_nil()); + } + + #[test] + fn test_uniqueness() { + let mut sources = HashSet::new(); + + // Generate 1000 sources and ensure they're all unique + for _ in 0..1000 { + let source = ContentSource::new(); + assert!(sources.insert(source), "Duplicate content source found"); + } + } + + #[test] + fn test_string_conversion() { + let source = ContentSource::new(); + let string_repr = source.to_string(); + let parsed = ContentSource::parse(&string_repr).unwrap(); + assert_eq!(source, parsed); + } + + #[test] + fn test_invalid_string_parsing() { + let result = ContentSource::parse("invalid-uuid"); + assert!(result.is_err()); + } + + #[test] + fn test_ordering() { + let source1 = ContentSource::new(); + thread::sleep(Duration::from_millis(2)); + let source2 = ContentSource::new(); + + assert!(source1.created_before(&source2)); + assert!(source2.created_after(&source1)); + assert!(source1 < source2); // Test PartialOrd + } + + #[test] + fn test_display() { + let source = ContentSource::new(); + let display_str = format!("{source}"); + let uuid_str = source.as_uuid().to_string(); + assert_eq!(display_str, uuid_str); + } + + #[test] + fn test_serde_serialization() { + let source = ContentSource::new(); + let serialized = serde_json::to_string(&source).unwrap(); + let deserialized: ContentSource = serde_json::from_str(&serialized).unwrap(); + assert_eq!(source, deserialized); + } + + #[test] + fn test_hash_consistency() { + let source = ContentSource::new(); + let mut set = HashSet::new(); + + set.insert(source); + assert!(set.contains(&source)); + + // Same source should hash the same way + let cloned_source = source; + assert!(set.contains(&cloned_source)); + } +} diff --git a/crates/nvisy-core/src/prelude.rs b/crates/nvisy-core/src/prelude.rs index 78ab3e1..7c8c950 100644 --- a/crates/nvisy-core/src/prelude.rs +++ b/crates/nvisy-core/src/prelude.rs @@ -4,11 +4,11 @@ //! 
types without individual `use` statements. pub use crate::datatypes::blob::Blob; +pub use crate::datatypes::document::TabularData; pub use crate::datatypes::Data; pub use crate::error::{Error, ErrorKind, Result}; +pub use crate::fs::{ContentFile, ContentHandler, ContentKind, ContentMetadata, ContentRegistry}; +pub use crate::io::{AsyncContentRead, AsyncContentWrite, Content, ContentBytes, ContentData, DataReference}; +pub use crate::path::ContentSource; pub use crate::registry::action::Action; -pub use crate::registry::loader::Loader; pub use crate::registry::provider::{ConnectedInstance, ProviderFactory}; -pub use crate::registry::stream::{StreamSource, StreamTarget}; -pub use crate::ontology::entity::{DetectionMethod, EntityCategory}; -pub use crate::ontology::redaction::RedactionMethod; diff --git a/crates/nvisy-core/src/registry/loader.rs b/crates/nvisy-core/src/registry/loader.rs deleted file mode 100644 index 16a4f90..0000000 --- a/crates/nvisy-core/src/registry/loader.rs +++ /dev/null @@ -1,41 +0,0 @@ -//! The `Loader` trait for converting raw blobs into structured documents or images. - -use serde::de::DeserializeOwned; - -use crate::datatypes::blob::Blob; -use crate::datatypes::document::Document; -use crate::datatypes::document::ImageData; -use crate::error::Error; - -/// Output of a loader -- either a parsed document or an extracted image. -pub enum LoaderOutput { - /// A successfully parsed text document. - Document(Document), - /// An extracted or decoded image. - Image(ImageData), -} - -/// Converts raw [`Blob`] content into structured [`Document`]s or [`ImageData`]. -/// -/// Loaders declare which file extensions and MIME types they support. -/// The engine selects the appropriate loader based on the blob's -/// content type and extension. -#[async_trait::async_trait] -pub trait Loader: Send + Sync + 'static { - /// Strongly-typed parameters for this loader. - type Params: DeserializeOwned + Send; - - /// Unique identifier for this loader (e.g. `"csv"`, `"pdf"`). - fn id(&self) -> &str; - /// File extensions this loader handles (e.g. `["csv", "tsv"]`). - fn extensions(&self) -> &[&str]; - /// MIME types this loader handles (e.g. `["text/csv"]`). - fn content_types(&self) -> &[&str]; - - /// Parse the blob and return one or more documents or images. - async fn load( - &self, - blob: &Blob, - params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error>; -} diff --git a/crates/nvisy-core/src/registry/mod.rs b/crates/nvisy-core/src/registry/mod.rs index 816673d..825061c 100644 --- a/crates/nvisy-core/src/registry/mod.rs +++ b/crates/nvisy-core/src/registry/mod.rs @@ -1,9 +1,4 @@ //! Core traits defining the pipeline extension points. -//! -//! Actions, loaders, stream sources/targets, and provider factories -//! are the primary interfaces that plugins implement. pub mod action; -pub mod loader; pub mod provider; -pub mod stream; diff --git a/crates/nvisy-core/src/registry/stream.rs b/crates/nvisy-core/src/registry/stream.rs deleted file mode 100644 index abb6820..0000000 --- a/crates/nvisy-core/src/registry/stream.rs +++ /dev/null @@ -1,61 +0,0 @@ -//! Stream source and target traits for external I/O. - -use serde::de::DeserializeOwned; -use tokio::sync::mpsc; - -use crate::datatypes::blob::Blob; -use crate::error::Error; - -/// A source stream that reads blobs from an external system into the pipeline. -/// -/// Implementations connect to a storage backend (e.g. S3, local filesystem) -/// and emit blobs into the pipeline's input channel. 
-#[async_trait::async_trait] -pub trait StreamSource: Send + Sync + 'static { - /// Strongly-typed parameters for this stream source. - type Params: DeserializeOwned + Send; - /// The client type this stream requires. - type Client: Send + 'static; - - /// Unique identifier for this stream source (e.g. `"s3-read"`). - fn id(&self) -> &str; - /// Validate source parameters before execution. - fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; - - /// Read blobs from the external system and send them to `output`. - /// - /// Returns the number of blobs read. - async fn read( - &self, - output: mpsc::Sender<Blob>, - params: Self::Params, - client: Self::Client, - ) -> Result<u64, Error>; -} - -/// A target stream that writes blobs from the pipeline to an external system. -/// -/// Implementations receive processed blobs from the pipeline and persist -/// them to a storage backend. -#[async_trait::async_trait] -pub trait StreamTarget: Send + Sync + 'static { - /// Strongly-typed parameters for this stream target. - type Params: DeserializeOwned + Send; - /// The client type this stream requires. - type Client: Send + 'static; - - /// Unique identifier for this stream target (e.g. `"s3-write"`). - fn id(&self) -> &str; - /// Validate target parameters before execution. - fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; - - /// Receive blobs from `input` and write them to the external system. - /// - /// Returns the number of blobs written. - async fn write( - &self, - input: mpsc::Receiver<Blob>, - params: Self::Params, - client: Self::Client, - ) -> Result<u64, Error>; -} diff --git a/crates/nvisy-detect/Cargo.toml b/crates/nvisy-detect/Cargo.toml index 42facd0..c169374 100644 --- a/crates/nvisy-detect/Cargo.toml +++ b/crates/nvisy-detect/Cargo.toml @@ -24,6 +24,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -38,6 +39,7 @@ uuid = { workspace = true, features = ["v4"] } # Text processing regex = { workspace = true, features = [] } +aho-corasick = { workspace = true } # Observability tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-detect/README.md b/crates/nvisy-detect/README.md index 4b8bf54..5d620aa 100644 --- a/crates/nvisy-detect/README.md +++ b/crates/nvisy-detect/README.md @@ -1,3 +1,3 @@ # nvisy-detect -Detection and redaction plugin for the Nvisy runtime. Provides regex-based entity detection, checksum validation, policy evaluation, classification, audit emission, and file loaders for plaintext, CSV, and JSON formats. +Detection and redaction plugin for the Nvisy runtime. Provides regex-based entity detection, checksum validation, policy evaluation, classification, and audit emission. 
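+
+File-format loaders (plaintext, CSV, JSON, PDF, DOCX, and others) now live in the `nvisy-ingest` crate.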
diff --git a/crates/nvisy-detect/assets/dictionaries/first_names.txt b/crates/nvisy-detect/assets/dictionaries/first_names.txt new file mode 100644 index 0000000..08fee00 --- /dev/null +++ b/crates/nvisy-detect/assets/dictionaries/first_names.txt @@ -0,0 +1,50 @@ +James +Mary +Robert +Patricia +John +Jennifer +Michael +Linda +David +Elizabeth +William +Barbara +Richard +Susan +Joseph +Jessica +Thomas +Sarah +Christopher +Karen +Charles +Lisa +Daniel +Nancy +Matthew +Betty +Anthony +Margaret +Mark +Sandra +Donald +Ashley +Steven +Dorothy +Paul +Kimberly +Andrew +Emily +Joshua +Donna +Kenneth +Michelle +Kevin +Carol +Brian +Amanda +George +Melissa +Timothy +Deborah diff --git a/crates/nvisy-detect/assets/dictionaries/last_names.txt b/crates/nvisy-detect/assets/dictionaries/last_names.txt new file mode 100644 index 0000000..161ccd1 --- /dev/null +++ b/crates/nvisy-detect/assets/dictionaries/last_names.txt @@ -0,0 +1,50 @@ +Smith +Johnson +Williams +Brown +Jones +Garcia +Miller +Davis +Rodriguez +Martinez +Hernandez +Lopez +Gonzalez +Wilson +Anderson +Thomas +Taylor +Moore +Jackson +Martin +Lee +Perez +Thompson +White +Harris +Sanchez +Clark +Ramirez +Lewis +Robinson +Walker +Young +Allen +King +Wright +Scott +Torres +Nguyen +Hill +Flores +Green +Adams +Nelson +Baker +Hall +Rivera +Campbell +Mitchell +Carter +Roberts diff --git a/crates/nvisy-detect/assets/dictionaries/medical_terms.txt b/crates/nvisy-detect/assets/dictionaries/medical_terms.txt new file mode 100644 index 0000000..e0a5d04 --- /dev/null +++ b/crates/nvisy-detect/assets/dictionaries/medical_terms.txt @@ -0,0 +1,50 @@ +diabetes +hypertension +asthma +cancer +HIV +AIDS +hepatitis +tuberculosis +epilepsy +schizophrenia +bipolar +depression +anxiety +COPD +pneumonia +bronchitis +arthritis +osteoporosis +Alzheimer +Parkinson +dementia +leukemia +lymphoma +melanoma +carcinoma +chemotherapy +radiation +dialysis +transplant +amputation +prosthetic +insulin +metformin +lisinopril +atorvastatin +metoprolol +omeprazole +amlodipine +gabapentin +hydrocodone +oxycodone +morphine +fentanyl +naloxone +prednisone +warfarin +heparin +diagnosis +prognosis +pathology diff --git a/crates/nvisy-detect/src/actions/apply_redaction.rs b/crates/nvisy-detect/src/actions/apply_redaction.rs index 48e636a..56cabf3 100644 --- a/crates/nvisy-detect/src/actions/apply_redaction.rs +++ b/crates/nvisy-detect/src/actions/apply_redaction.rs @@ -6,8 +6,8 @@ use uuid::Uuid; use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::ontology::entity::Entity; -use nvisy_core::ontology::redaction::Redaction; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::ontology::redaction::Redaction; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; diff --git a/crates/nvisy-detect/src/actions/classify.rs b/crates/nvisy-detect/src/actions/classify.rs index d29059f..28855f6 100644 --- a/crates/nvisy-detect/src/actions/classify.rs +++ b/crates/nvisy-detect/src/actions/classify.rs @@ -3,7 +3,7 @@ use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::ontology::entity::Entity; +use nvisy_ontology::ontology::entity::Entity; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; @@ -78,7 +78,7 @@ fn compute_sensitivity_level(entities: &[Entity]) -> String { let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); let has_critical_types = entities.iter().any(|e| { - matches!(e.category, 
nvisy_core::ontology::entity::EntityCategory::Credentials) + matches!(e.category, nvisy_ontology::ontology::entity::EntityCategory::Credentials) || e.entity_type == "ssn" || e.entity_type == "credit_card" }); diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs index d1b60aa..25e2f85 100644 --- a/crates/nvisy-detect/src/actions/detect_checksum.rs +++ b/crates/nvisy-detect/src/actions/detect_checksum.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::ontology::entity::{DetectionMethod, Entity}; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity}; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; diff --git a/crates/nvisy-detect/src/actions/detect_dictionary.rs b/crates/nvisy-detect/src/actions/detect_dictionary.rs new file mode 100644 index 0000000..76ae21d --- /dev/null +++ b/crates/nvisy-detect/src/actions/detect_dictionary.rs @@ -0,0 +1,205 @@ +//! Aho-Corasick dictionary-based entity detection action. + +use aho_corasick::AhoCorasick; +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; + +use crate::dictionaries; + +/// Definition of a single dictionary for matching. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DictionaryDef { + /// Dictionary name — `"builtin:first_names"` for built-in, or a custom name. + pub name: String, + /// Entity category for matches from this dictionary. + pub category: EntityCategory, + /// Entity type label for matches (e.g. `"first_name"`, `"medical_term"`). + pub entity_type: String, + /// Custom values — empty when using a builtin dictionary. + #[serde(default)] + pub values: Vec<String>, + /// Whether matching should be case-sensitive. + #[serde(default)] + pub case_sensitive: bool, +} + +/// Typed parameters for [`DetectDictionaryAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectDictionaryParams { + /// One or more dictionaries to match against. + pub dictionaries: Vec<DictionaryDef>, + /// Confidence score assigned to dictionary matches. + #[serde(default = "default_confidence")] + pub confidence: f64, +} + +fn default_confidence() -> f64 { + 0.85 +} + +/// Scans document text and tabular cells against Aho-Corasick automata +/// built from user-provided word lists and/or built-in gazetteers. 
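+///
+/// Built-in gazetteers are referenced by the `builtin:` prefix
+/// (`builtin:first_names`, `builtin:last_names`, `builtin:medical_terms`);
+/// any other `name` is matched against the definition's own `values`.
+/// At least one dictionary definition is required, and matches are reported
+/// with the configured `confidence` (default `0.85`).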
+pub struct DetectDictionaryAction; + +#[async_trait::async_trait] +impl Action for DetectDictionaryAction { + type Params = DetectDictionaryParams; + + fn id(&self) -> &str { + "detect-dictionary" + } + + fn validate_params(&self, params: &Self::Params) -> Result<(), Error> { + if params.dictionaries.is_empty() { + return Err(Error::new( + ErrorKind::Validation, + "at least one dictionary definition is required", + )); + } + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + params: Self::Params, + ) -> Result<u64, Error> { + // Build automata for each dictionary + let automata = build_automata(¶ms.dictionaries)?; + let confidence = params.confidence; + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + // Scan documents + let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) + })?; + + for doc in &documents { + for (def, ac, values) in &automata { + for mat in ac.find_iter(&doc.content) { + let value = &values[mat.pattern().as_usize()]; + let entity = Entity::new( + def.category, + &def.entity_type, + value.as_str(), + DetectionMethod::Dictionary, + confidence, + EntityLocation { + start_offset: mat.start(), + end_offset: mat.end(), + element_id: None, + page_number: None, + bounding_box: None, + row_index: None, + column_index: None, + image_id: None, + }, + ) + .with_source_id(doc.data.id); + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add entity: {e}")) + })?; + count += 1; + } + } + } + + // Scan tabular data + let tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read tabular artifact: {e}")) + })?; + + for table in &tables { + for (row_idx, row) in table.rows.iter().enumerate() { + for (col_idx, cell) in row.iter().enumerate() { + if cell.is_empty() { + continue; + } + for (def, ac, values) in &automata { + for mat in ac.find_iter(cell) { + let value = &values[mat.pattern().as_usize()]; + let entity = Entity::new( + def.category, + &def.entity_type, + value.as_str(), + DetectionMethod::Dictionary, + confidence, + EntityLocation { + start_offset: mat.start(), + end_offset: mat.end(), + element_id: None, + page_number: None, + bounding_box: None, + row_index: Some(row_idx), + column_index: Some(col_idx), + image_id: None, + }, + ) + .with_source_id(table.data.id); + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to add entity: {e}"), + ) + })?; + count += 1; + } + } + } + } + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} + +/// Resolve dictionary values (builtin or custom) and build Aho-Corasick automata. 
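+///
+/// Definitions whose resolved value list is empty are skipped. Matching is
+/// ASCII case-insensitive unless `case_sensitive` is set on the definition.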
+fn build_automata( + defs: &[DictionaryDef], +) -> Result<Vec<(&DictionaryDef, AhoCorasick, Vec<String>)>, Error> { + let mut result = Vec::with_capacity(defs.len()); + + for def in defs { + let values: Vec<String> = if def.name.starts_with("builtin:") { + let builtin = dictionaries::get_builtin(&def.name).ok_or_else(|| { + Error::new( + ErrorKind::Validation, + format!("unknown builtin dictionary: {}", def.name), + ) + })?; + builtin.to_vec() + } else { + def.values.clone() + }; + + if values.is_empty() { + continue; + } + + let ac = aho_corasick::AhoCorasickBuilder::new() + .ascii_case_insensitive(!def.case_sensitive) + .build(&values) + .map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to build automaton: {e}")) + })?; + + result.push((def, ac, values)); + } + + Ok(result) +} diff --git a/crates/nvisy-detect/src/actions/detect_manual.rs b/crates/nvisy-detect/src/actions/detect_manual.rs new file mode 100644 index 0000000..7768323 --- /dev/null +++ b/crates/nvisy-detect/src/actions/detect_manual.rs @@ -0,0 +1,87 @@ +//! Manual annotation detection action. +//! +//! Converts user-provided [`ManualAnnotation`]s from the blob's +//! `"manual_entities"` artifact into full [`Entity`] objects. + +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_ontology::redaction::ManualAnnotation; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; + +/// Typed parameters for [`DetectManualAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectManualParams {} + +/// Reads `"manual_entities"` artifacts from the blob (injected by the +/// server from `RedactionContext.manual_entities`) and converts each +/// [`ManualAnnotation`] into a full [`Entity`] with +/// `DetectionMethod::Manual` and confidence 1.0. 
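+///
+/// Missing start/end offsets default to `0`; page, row, column, and bounding
+/// box information is carried over when present. Converted entities are
+/// appended to the blob's `"entities"` artifact alongside detector output.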
+pub struct DetectManualAction; + +#[async_trait::async_trait] +impl Action for DetectManualAction { + type Params = DetectManualParams; + + fn id(&self) -> &str { + "detect-manual" + } + + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + _params: Self::Params, + ) -> Result<u64, Error> { + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + let annotations: Vec<ManualAnnotation> = + blob.get_artifacts("manual_entities").map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to read manual_entities artifact: {e}"), + ) + })?; + + for ann in &annotations { + let entity = Entity::new( + ann.category, + &ann.entity_type, + &ann.value, + DetectionMethod::Manual, + 1.0, + EntityLocation { + start_offset: ann.start_offset.unwrap_or(0), + end_offset: ann.end_offset.unwrap_or(0), + element_id: None, + page_number: ann.page_number, + bounding_box: ann.bounding_box.clone(), + row_index: ann.row_index, + column_index: ann.column_index, + image_id: None, + }, + ); + + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add entity: {e}")) + })?; + count += 1; + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs index f631839..1603a39 100644 --- a/crates/nvisy-detect/src/actions/detect_regex.rs +++ b/crates/nvisy-detect/src/actions/detect_regex.rs @@ -6,7 +6,7 @@ use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; -use nvisy_core::ontology::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; @@ -103,6 +103,9 @@ impl Action for DetectRegexAction { element_id: None, page_number: None, bounding_box: None, + row_index: None, + column_index: None, + image_id: None, }, ); entity.source_id = Some(doc.data.id); diff --git a/crates/nvisy-detect/src/actions/detect_tabular.rs b/crates/nvisy-detect/src/actions/detect_tabular.rs new file mode 100644 index 0000000..8e43bd7 --- /dev/null +++ b/crates/nvisy-detect/src/actions/detect_tabular.rs @@ -0,0 +1,134 @@ +//! Column-based rule matching for tabular data. + +use regex::Regex; +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::TabularData; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; + +/// A rule that matches column headers to classify entire columns. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ColumnRule { + /// Regex pattern to match against column names. + pub column_name_pattern: String, + /// Entity category for matches in the column. + pub category: EntityCategory, + /// Entity type label for matches. + pub entity_type: String, +} + +/// Typed parameters for [`DetectTabularAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectTabularParams { + /// Column-matching rules. 
+ pub column_rules: Vec<ColumnRule>, +} + +/// Matches column headers against rules and marks every non-empty cell +/// in matched columns as an entity. +pub struct DetectTabularAction; + +#[async_trait::async_trait] +impl Action for DetectTabularAction { + type Params = DetectTabularParams; + + fn id(&self) -> &str { + "detect-tabular" + } + + fn validate_params(&self, params: &Self::Params) -> Result<(), Error> { + for rule in ¶ms.column_rules { + Regex::new(&rule.column_name_pattern).map_err(|e| { + Error::new( + ErrorKind::Validation, + format!("invalid column_name_pattern '{}': {e}", rule.column_name_pattern), + ) + })?; + } + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + params: Self::Params, + ) -> Result<u64, Error> { + // Compile column-name regexes + let compiled_rules: Vec<(Regex, &ColumnRule)> = params + .column_rules + .iter() + .filter_map(|r| Regex::new(&r.column_name_pattern).ok().map(|re| (re, r))) + .collect(); + + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + let tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read tabular artifact: {e}")) + })?; + + for table in &tables { + // For each column, check if any rule matches the column name + for (col_idx, col_name) in table.columns.iter().enumerate() { + for (regex, rule) in &compiled_rules { + if !regex.is_match(col_name) { + continue; + } + + // Mark every non-empty cell in this column + for (row_idx, row) in table.rows.iter().enumerate() { + if let Some(cell) = row.get(col_idx) { + if cell.is_empty() { + continue; + } + + let entity = Entity::new( + rule.category, + &rule.entity_type, + cell.as_str(), + DetectionMethod::Composite, + 0.9, + EntityLocation { + start_offset: 0, + end_offset: cell.len(), + element_id: None, + page_number: None, + bounding_box: None, + row_index: Some(row_idx), + column_index: Some(col_idx), + image_id: None, + }, + ) + .with_source_id(table.data.id); + + blob.add_artifact("entities", &entity).map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to add entity: {e}"), + ) + })?; + count += 1; + } + } + + // Only apply first matching rule per column + break; + } + } + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} diff --git a/crates/nvisy-detect/src/actions/emit_audit.rs b/crates/nvisy-detect/src/actions/emit_audit.rs index d92509c..63edb42 100644 --- a/crates/nvisy-detect/src/actions/emit_audit.rs +++ b/crates/nvisy-detect/src/actions/emit_audit.rs @@ -5,8 +5,8 @@ use tokio::sync::mpsc; use uuid::Uuid; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::ontology::audit::{Audit, AuditAction}; -use nvisy_core::ontology::redaction::Redaction; +use nvisy_ontology::ontology::audit::{Audit, AuditAction}; +use nvisy_ontology::ontology::redaction::Redaction; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; diff --git a/crates/nvisy-detect/src/actions/evaluate_policy.rs b/crates/nvisy-detect/src/actions/evaluate_policy.rs index 52dfa60..0ad81d1 100644 --- a/crates/nvisy-detect/src/actions/evaluate_policy.rs +++ b/crates/nvisy-detect/src/actions/evaluate_policy.rs @@ -4,9 +4,9 @@ use serde::Deserialize; use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; -use nvisy_core::ontology::entity::Entity; -use nvisy_core::redaction::policy::PolicyRule; -use nvisy_core::ontology::redaction::{Redaction, RedactionMethod}; +use 
nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::redaction::policy::PolicyRule; +use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; use nvisy_core::error::{Error, ErrorKind}; use nvisy_core::registry::action::Action; diff --git a/crates/nvisy-detect/src/actions/mod.rs b/crates/nvisy-detect/src/actions/mod.rs index 02cfb1a..af6d5f2 100644 --- a/crates/nvisy-detect/src/actions/mod.rs +++ b/crates/nvisy-detect/src/actions/mod.rs @@ -9,8 +9,14 @@ pub mod apply_redaction; pub mod classify; /// Validates detected entities using checksum algorithms (e.g. Luhn). pub mod detect_checksum; +/// Aho-Corasick dictionary-based entity detection. +pub mod detect_dictionary; +/// Converts user-provided manual annotations into entities. +pub mod detect_manual; /// Scans document text with compiled regex patterns to detect PII/PHI entities. pub mod detect_regex; +/// Column-based rule matching for tabular data. +pub mod detect_tabular; /// Emits audit trail records for every applied redaction. pub mod emit_audit; /// Evaluates policy rules against detected entities and produces redaction instructions. diff --git a/crates/nvisy-detect/src/dictionaries/mod.rs b/crates/nvisy-detect/src/dictionaries/mod.rs new file mode 100644 index 0000000..4ec9e16 --- /dev/null +++ b/crates/nvisy-detect/src/dictionaries/mod.rs @@ -0,0 +1,41 @@ +//! Built-in dictionary data for name and term matching. +//! +//! Dictionaries are embedded at compile time via `include_str!()` and +//! loaded lazily on first access. + +use std::sync::LazyLock; + +static FIRST_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| { + parse_dictionary(include_str!("../../assets/dictionaries/first_names.txt")) +}); + +static LAST_NAMES: LazyLock<Vec<String>> = LazyLock::new(|| { + parse_dictionary(include_str!("../../assets/dictionaries/last_names.txt")) +}); + +static MEDICAL_TERMS: LazyLock<Vec<String>> = LazyLock::new(|| { + parse_dictionary(include_str!("../../assets/dictionaries/medical_terms.txt")) +}); + +/// Load a built-in dictionary by name. +/// +/// Names are prefixed with `"builtin:"` — e.g. `"builtin:first_names"`, +/// `"builtin:last_names"`, `"builtin:medical_terms"`. +/// +/// Returns `None` if the name is not recognized. +pub fn get_builtin(name: &str) -> Option<&'static [String]> { + match name { + "builtin:first_names" => Some(&FIRST_NAMES), + "builtin:last_names" => Some(&LAST_NAMES), + "builtin:medical_terms" => Some(&MEDICAL_TERMS), + _ => None, + } +} + +fn parse_dictionary(text: &str) -> Vec<String> { + text.lines() + .map(|l| l.trim()) + .filter(|l| !l.is_empty()) + .map(|l| l.to_string()) + .collect() +} diff --git a/crates/nvisy-detect/src/lib.rs b/crates/nvisy-detect/src/lib.rs index 87ff6c9..955b5bb 100644 --- a/crates/nvisy-detect/src/lib.rs +++ b/crates/nvisy-detect/src/lib.rs @@ -1,9 +1,8 @@ -//! PII/PHI detection actions and loaders for the nvisy pipeline. +//! PII/PHI detection actions for the nvisy pipeline. //! //! This crate provides the detection, classification, policy evaluation, //! redaction, and audit-trail stages used by the nvisy runtime. It also -//! ships format-specific loaders (CSV, JSON, plaintext) and a built-in -//! set of regex patterns compiled from `assets/patterns.json`. +//! ships a built-in set of regex patterns compiled from `assets/patterns.json`. #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] @@ -11,8 +10,8 @@ /// Pipeline actions for detection, classification, policy, redaction, and audit. 
pub mod actions; -/// Format-specific blob loaders (CSV, JSON, plaintext). -pub mod loaders; +/// Built-in dictionary data for name and term matching. +pub mod dictionaries; /// Built-in regex pattern definitions and validation helpers. pub mod patterns; diff --git a/crates/nvisy-detect/src/loaders/mod.rs b/crates/nvisy-detect/src/loaders/mod.rs deleted file mode 100644 index aaa34a5..0000000 --- a/crates/nvisy-detect/src/loaders/mod.rs +++ /dev/null @@ -1,12 +0,0 @@ -//! Format-specific blob loaders. -//! -//! Each loader converts raw [`Blob`](nvisy_core::datatypes::blob::Blob) bytes -//! into one or more [`Document`](nvisy_core::datatypes::document::Document)s -//! that downstream actions can process. - -/// Loader for CSV files. -pub mod csv_loader; -/// Loader for JSON files. -pub mod json_loader; -/// Loader for plain-text files. -pub mod plaintext; diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-detect/src/patterns/mod.rs index 75a5bd0..2a3cc90 100644 --- a/crates/nvisy-detect/src/patterns/mod.rs +++ b/crates/nvisy-detect/src/patterns/mod.rs @@ -9,7 +9,7 @@ pub mod validators; use std::collections::HashMap; use std::sync::LazyLock; -use nvisy_core::ontology::entity::EntityCategory; +use nvisy_ontology::ontology::entity::EntityCategory; /// JSON representation of a pattern loaded from disk. #[derive(Debug, Clone, serde::Deserialize)] diff --git a/crates/nvisy-detect/src/prelude.rs b/crates/nvisy-detect/src/prelude.rs index 87f870a..e99bbe7 100644 --- a/crates/nvisy-detect/src/prelude.rs +++ b/crates/nvisy-detect/src/prelude.rs @@ -2,9 +2,9 @@ pub use crate::actions::apply_redaction::ApplyRedactionAction; pub use crate::actions::classify::ClassifyAction; pub use crate::actions::detect_checksum::DetectChecksumAction; +pub use crate::actions::detect_dictionary::DetectDictionaryAction; +pub use crate::actions::detect_manual::DetectManualAction; pub use crate::actions::detect_regex::DetectRegexAction; +pub use crate::actions::detect_tabular::DetectTabularAction; pub use crate::actions::emit_audit::EmitAuditAction; pub use crate::actions::evaluate_policy::EvaluatePolicyAction; -pub use crate::loaders::csv_loader::CsvLoader; -pub use crate::loaders::json_loader::JsonLoader; -pub use crate::loaders::plaintext::PlaintextLoader; diff --git a/crates/nvisy-ingest/Cargo.toml b/crates/nvisy-ingest/Cargo.toml new file mode 100644 index 0000000..cad8474 --- /dev/null +++ b/crates/nvisy-ingest/Cargo.toml @@ -0,0 +1,64 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-ingest" +description = "File-format loaders for the Nvisy multimodal redaction platform" +keywords = ["nvisy", "ingest", "loader", "pdf", "docx"] +categories = ["parser-implementations"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[features] +default = ["pdf", "docx", "html", "xlsx", "parquet", "image"] +pdf = ["dep:pdf-extract", "dep:lopdf"] +docx = ["dep:zip", "dep:quick-xml"] +html = ["dep:scraper"] +xlsx = ["dep:calamine"] +parquet = ["dep:parquet", "dep:arrow"] +image = ["dep:image"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# (De)serialization +serde = { workspace = true, features 
= ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } +bytes = { workspace = true } + +# Observability +tracing = { workspace = true, features = [] } + +# File type detection +infer = { workspace = true } + +# Document parsing (feature-gated) +pdf-extract = { workspace = true, optional = true } +lopdf = { workspace = true, optional = true } +zip = { workspace = true, optional = true } +quick-xml = { workspace = true, optional = true } +scraper = { workspace = true, optional = true } +calamine = { workspace = true, optional = true } +arrow = { workspace = true, optional = true } +parquet = { workspace = true, optional = true } +image = { workspace = true, optional = true } diff --git a/crates/nvisy-ingest/README.md b/crates/nvisy-ingest/README.md new file mode 100644 index 0000000..0cbc41b --- /dev/null +++ b/crates/nvisy-ingest/README.md @@ -0,0 +1,9 @@ +# nvisy-ingest + +File-format loaders for the Nvisy multimodal redaction platform. + +This crate provides loaders for PDF, DOCX, HTML, Image, Parquet, XLSX, +Audio, CSV, JSON, and plain-text files. Each loader implements the +[`Loader`](crate::loaders::Loader) trait and converts raw +blob bytes into structured `Document`, `ImageData`, or `TabularData` +artifacts. diff --git a/crates/nvisy-ingest/src/lib.rs b/crates/nvisy-ingest/src/lib.rs new file mode 100644 index 0000000..cd4b052 --- /dev/null +++ b/crates/nvisy-ingest/src/lib.rs @@ -0,0 +1,9 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +/// File-format loaders. +pub mod loaders; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-ingest/src/loaders/audio_loader.rs b/crates/nvisy-ingest/src/loaders/audio_loader.rs new file mode 100644 index 0000000..535e706 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/audio_loader.rs @@ -0,0 +1,58 @@ +//! Placeholder audio file loader. +//! +//! Returns a document with metadata only — audio redaction is not yet implemented. + +use serde::Deserialize; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::error::Error; +use super::{Loader, LoaderOutput}; + +/// Typed parameters for [`AudioLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct AudioLoaderParams {} + +/// Placeholder loader for audio files. Returns a metadata-only document. +pub struct AudioLoader; + +#[async_trait::async_trait] +impl Loader for AudioLoader { + type Params = AudioLoaderParams; + + fn id(&self) -> &str { + "audio" + } + + fn extensions(&self) -> &[&str] { + &["mp3", "wav", "flac", "ogg", "m4a"] + } + + fn content_types(&self) -> &[&str] { + &[ + "audio/mpeg", + "audio/wav", + "audio/flac", + "audio/ogg", + "audio/mp4", + ] + } + + async fn load( + &self, + blob: &Blob, + _params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let content_type = blob.content_type().unwrap_or("audio/unknown").to_string(); + let size = blob.content.len(); + + let doc = Document::new(format!( + "[Audio file: type={}, size={} bytes. 
Audio redaction not yet implemented.]", + content_type, size + )) + .with_source_format("audio"); + + Ok(vec![LoaderOutput::Document(doc)]) + } +} diff --git a/crates/nvisy-detect/src/loaders/csv_loader.rs b/crates/nvisy-ingest/src/loaders/csv_loader.rs similarity index 95% rename from crates/nvisy-detect/src/loaders/csv_loader.rs rename to crates/nvisy-ingest/src/loaders/csv_loader.rs index 79d868d..f9c4a3c 100644 --- a/crates/nvisy-detect/src/loaders/csv_loader.rs +++ b/crates/nvisy-ingest/src/loaders/csv_loader.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::registry::loader::{Loader, LoaderOutput}; +use super::{Loader, LoaderOutput}; /// Loads CSV blobs into a single [`Document`] containing the raw CSV text. /// diff --git a/crates/nvisy-ingest/src/loaders/docx_loader.rs b/crates/nvisy-ingest/src/loaders/docx_loader.rs new file mode 100644 index 0000000..d8d71ea --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/docx_loader.rs @@ -0,0 +1,166 @@ +//! DOCX (Office Open XML) file loader. + +use bytes::Bytes; +use serde::Deserialize; +use std::io::Cursor; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, Element, ElementType, ImageData}; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +/// Typed parameters for [`DocxLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DocxLoaderParams { + /// Whether to extract embedded images. + #[serde(default = "default_true")] + pub extract_images: bool, +} + +fn default_true() -> bool { + true +} + +/// Extracts text and optionally images from DOCX files. +pub struct DocxLoader; + +#[async_trait::async_trait] +impl Loader for DocxLoader { + type Params = DocxLoaderParams; + + fn id(&self) -> &str { + "docx" + } + + fn extensions(&self) -> &[&str] { + &["docx"] + } + + fn content_types(&self) -> &[&str] { + &["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] + } + + async fn load( + &self, + blob: &Blob, + params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let cursor = Cursor::new(blob.content.to_vec()); + let mut archive = zip::ZipArchive::new(cursor).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Failed to open DOCX ZIP: {e}")) + })?; + + let mut outputs = Vec::new(); + let mut elements = Vec::new(); + let mut full_text = String::new(); + + // Parse word/document.xml + if let Ok(mut entry) = archive.by_name("word/document.xml") { + let mut xml_content = String::new(); + std::io::Read::read_to_string(&mut entry, &mut xml_content).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Failed to read document.xml: {e}")) + })?; + + let mut reader = quick_xml::Reader::from_str(&xml_content); + let mut in_text = false; + let mut in_heading = false; + let mut current_text = String::new(); + let mut buf = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(ref e)) => { + match e.name().as_ref() { + b"w:t" => in_text = true, + b"w:pStyle" => { + for attr in e.attributes().flatten() { + if attr.key.as_ref() == b"w:val" { + let val = String::from_utf8_lossy(&attr.value); + if val.starts_with("Heading") { + in_heading = true; + } + } + } + } + _ => {} + } + } + Ok(quick_xml::events::Event::End(ref e)) => { + match e.name().as_ref() { + b"w:t" => in_text = false, + b"w:p" => { + if !current_text.is_empty() { + let element_type = if in_heading { + 
ElementType::Title + } else { + ElementType::NarrativeText + }; + elements.push(Element::new(element_type, ¤t_text)); + if !full_text.is_empty() { + full_text.push('\n'); + } + full_text.push_str(¤t_text); + current_text.clear(); + in_heading = false; + } + } + _ => {} + } + } + Ok(quick_xml::events::Event::Text(ref e)) => { + if in_text { + let text = e.unescape().unwrap_or_default(); + current_text.push_str(&text); + } + } + Ok(quick_xml::events::Event::Eof) => break, + Err(e) => { + tracing::warn!("DOCX XML parse error: {e}"); + break; + } + _ => {} + } + buf.clear(); + } + } + + let doc = Document::new(full_text) + .with_elements(elements) + .with_source_format("docx"); + + outputs.push(LoaderOutput::Document(doc)); + + // Extract images from word/media/ + if params.extract_images { + let media_names: Vec<String> = (0..archive.len()) + .filter_map(|i| { + let entry = archive.by_index(i).ok()?; + let name = entry.name().to_string(); + if name.starts_with("word/media/") { + Some(name) + } else { + None + } + }) + .collect(); + + for name in media_names { + if let Ok(mut entry) = archive.by_name(&name) { + let mut buf = Vec::new(); + std::io::Read::read_to_end(&mut entry, &mut buf).ok(); + if !buf.is_empty() { + let mime = infer::get(&buf) + .map(|t| t.mime_type().to_string()) + .unwrap_or_else(|| "image/png".to_string()); + let img = ImageData::new(Bytes::from(buf), mime) + .with_source_path(&name); + outputs.push(LoaderOutput::Image(img)); + } + } + } + } + + Ok(outputs) + } +} diff --git a/crates/nvisy-ingest/src/loaders/html_loader.rs b/crates/nvisy-ingest/src/loaders/html_loader.rs new file mode 100644 index 0000000..1c933f7 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/html_loader.rs @@ -0,0 +1,105 @@ +//! HTML file loader using the `scraper` crate. + +use serde::Deserialize; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, Element, ElementType}; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +/// Typed parameters for [`HtmlLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct HtmlLoaderParams {} + +/// Extracts text and structural elements from HTML documents. 
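+///
+/// Headings (`h1`–`h6`) become `Title` elements carrying their heading level,
+/// while `p`, `li`, `table`, `pre`/`code`, and a few other block tags map to
+/// their corresponding element types. Documents without any recognized
+/// structure fall back to the full `<body>` text.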
+pub struct HtmlLoader; + +#[async_trait::async_trait] +impl Loader for HtmlLoader { + type Params = HtmlLoaderParams; + + fn id(&self) -> &str { + "html" + } + + fn extensions(&self) -> &[&str] { + &["html", "htm"] + } + + fn content_types(&self) -> &[&str] { + &["text/html"] + } + + async fn load( + &self, + blob: &Blob, + _params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let html_str = String::from_utf8(blob.content.to_vec()).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("HTML is not valid UTF-8: {e}")) + })?; + + let document = scraper::Html::parse_document(&html_str); + let mut elements = Vec::new(); + let mut full_text = String::new(); + + // Map HTML tags to element types + let tag_mappings: &[(&str, ElementType)] = &[ + ("h1", ElementType::Title), + ("h2", ElementType::Title), + ("h3", ElementType::Title), + ("h4", ElementType::Title), + ("h5", ElementType::Title), + ("h6", ElementType::Title), + ("p", ElementType::NarrativeText), + ("li", ElementType::ListItem), + ("table", ElementType::Table), + ("pre", ElementType::CodeSnippet), + ("code", ElementType::CodeSnippet), + ("address", ElementType::Address), + ("header", ElementType::Header), + ("footer", ElementType::Footer), + ("figcaption", ElementType::FigureCaption), + ]; + + for (tag, element_type) in tag_mappings { + let selector = scraper::Selector::parse(tag).unwrap(); + for element in document.select(&selector) { + let text: String = element.text().collect::<Vec<_>>().join(" "); + let trimmed = text.trim(); + if trimmed.is_empty() { + continue; + } + let mut el = Element::new(*element_type, trimmed); + // Set heading level for h1-h6 + if tag.starts_with('h') && tag.len() == 2 { + if let Some(level) = tag[1..].parse::<u32>().ok() { + el = el.with_level(level); + } + } + if !full_text.is_empty() { + full_text.push('\n'); + } + full_text.push_str(trimmed); + elements.push(el); + } + } + + // If no structured elements found, extract all body text + if elements.is_empty() { + let body_selector = scraper::Selector::parse("body").unwrap(); + if let Some(body) = document.select(&body_selector).next() { + full_text = body.text().collect::<Vec<_>>().join(" "); + let trimmed = full_text.trim().to_string(); + full_text = trimmed; + } + } + + let doc = Document::new(full_text) + .with_elements(elements) + .with_source_format("html"); + + Ok(vec![LoaderOutput::Document(doc)]) + } +} diff --git a/crates/nvisy-ingest/src/loaders/image_loader.rs b/crates/nvisy-ingest/src/loaders/image_loader.rs new file mode 100644 index 0000000..468aa87 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/image_loader.rs @@ -0,0 +1,67 @@ +//! Image file loader using the `image` crate. + +use bytes::Bytes; +use serde::Deserialize; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::ImageData; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +/// Typed parameters for [`ImageLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ImageLoaderParams {} + +/// Decodes image files and returns an [`ImageData`] with dimensions. 
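+///
+/// The image is decoded only to read its dimensions; the original bytes are
+/// passed through unchanged, using the blob's MIME type or `image/png` when
+/// none is set.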
+pub struct ImageLoader; + +#[async_trait::async_trait] +impl Loader for ImageLoader { + type Params = ImageLoaderParams; + + fn id(&self) -> &str { + "image" + } + + fn extensions(&self) -> &[&str] { + &["jpg", "jpeg", "png", "tiff", "bmp", "webp"] + } + + fn content_types(&self) -> &[&str] { + &[ + "image/jpeg", + "image/png", + "image/tiff", + "image/bmp", + "image/webp", + ] + } + + async fn load( + &self, + blob: &Blob, + _params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let img = image::load_from_memory(&blob.content).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Image decode failed: {e}")) + })?; + + let width = img.width(); + let height = img.height(); + + // Detect MIME type from blob or infer + let mime_type = blob + .content_type() + .unwrap_or("image/png") + .to_string(); + + let image_data = ImageData::new( + Bytes::copy_from_slice(&blob.content), + mime_type, + ) + .with_dimensions(width, height); + + Ok(vec![LoaderOutput::Image(image_data)]) + } +} diff --git a/crates/nvisy-detect/src/loaders/json_loader.rs b/crates/nvisy-ingest/src/loaders/json_loader.rs similarity index 96% rename from crates/nvisy-detect/src/loaders/json_loader.rs rename to crates/nvisy-ingest/src/loaders/json_loader.rs index 827443d..68c5490 100644 --- a/crates/nvisy-detect/src/loaders/json_loader.rs +++ b/crates/nvisy-ingest/src/loaders/json_loader.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::registry::loader::{Loader, LoaderOutput}; +use super::{Loader, LoaderOutput}; /// Loads JSON blobs into a single [`Document`] containing the raw JSON text. /// diff --git a/crates/nvisy-ingest/src/loaders/mod.rs b/crates/nvisy-ingest/src/loaders/mod.rs new file mode 100644 index 0000000..1ee9d45 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/mod.rs @@ -0,0 +1,72 @@ +//! File-format loaders for multimodal document ingestion. + +use serde::de::DeserializeOwned; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::Document; +use nvisy_core::datatypes::document::ImageData; +use nvisy_core::datatypes::document::TabularData; +use nvisy_core::error::Error; + +/// Output of a loader -- either a parsed document, an extracted image, or tabular data. +pub enum LoaderOutput { + /// A successfully parsed text document. + Document(Document), + /// An extracted or decoded image. + Image(ImageData), + /// Tabular data extracted from a spreadsheet or data file. + Tabular(TabularData), +} + +/// Converts raw [`Blob`] content into structured [`Document`]s or [`ImageData`]. +/// +/// Loaders declare which file extensions and MIME types they support. +/// The engine selects the appropriate loader based on the blob's +/// content type and extension. +#[async_trait::async_trait] +pub trait Loader: Send + Sync + 'static { + /// Strongly-typed parameters for this loader. + type Params: DeserializeOwned + Send; + + /// Unique identifier for this loader (e.g. `"csv"`, `"pdf"`). + fn id(&self) -> &str; + /// File extensions this loader handles (e.g. `["csv", "tsv"]`). + fn extensions(&self) -> &[&str]; + /// MIME types this loader handles (e.g. `["text/csv"]`). + fn content_types(&self) -> &[&str]; + + /// Parse the blob and return one or more documents or images. + async fn load( + &self, + blob: &Blob, + params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error>; +} + +/// Loader for CSV files. +pub mod csv_loader; +/// Loader for JSON files. 
+pub mod json_loader; +/// Loader for plain-text files. +pub mod plaintext; + +/// Loader for PDF files. +#[cfg(feature = "pdf")] +pub mod pdf_loader; +/// Loader for DOCX (Office Open XML) files. +#[cfg(feature = "docx")] +pub mod docx_loader; +/// Loader for HTML files. +#[cfg(feature = "html")] +pub mod html_loader; +/// Loader for image files (PNG, JPEG, TIFF, etc.). +#[cfg(feature = "image")] +pub mod image_loader; +/// Loader for Apache Parquet files. +#[cfg(feature = "parquet")] +pub mod parquet_loader; +/// Loader for Excel XLSX/XLS files. +#[cfg(feature = "xlsx")] +pub mod xlsx_loader; +/// Placeholder loader for audio files. +pub mod audio_loader; diff --git a/crates/nvisy-ingest/src/loaders/parquet_loader.rs b/crates/nvisy-ingest/src/loaders/parquet_loader.rs new file mode 100644 index 0000000..3117eb4 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/parquet_loader.rs @@ -0,0 +1,131 @@ +//! Apache Parquet file loader. + +use bytes::Bytes; +use serde::Deserialize; +use std::sync::Arc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +use arrow::array::{Array, RecordBatchReader}; +use arrow::record_batch::RecordBatch; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + +/// Typed parameters for [`ParquetLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ParquetLoaderParams { + /// Maximum number of rows to read. `None` means all rows. + #[serde(default)] + pub max_rows: Option<usize>, +} + +/// Extracts tabular data from Parquet files plus a flattened text document +/// for regex/dictionary scanning. +pub struct ParquetLoader; + +#[async_trait::async_trait] +impl Loader for ParquetLoader { + type Params = ParquetLoaderParams; + + fn id(&self) -> &str { + "parquet" + } + + fn extensions(&self) -> &[&str] { + &["parquet"] + } + + fn content_types(&self) -> &[&str] { + &["application/x-parquet"] + } + + async fn load( + &self, + blob: &Blob, + params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let data = Bytes::copy_from_slice(&blob.content); + let builder = ParquetRecordBatchReaderBuilder::try_new(data) + .map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Parquet open failed: {e}")) + })?; + + let reader = builder.build().map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Parquet reader build failed: {e}")) + })?; + + let schema = reader.schema(); + let columns: Vec<String> = schema + .fields() + .iter() + .map(|f: &arrow::datatypes::FieldRef| f.name().clone()) + .collect(); + + let mut all_rows: Vec<Vec<String>> = Vec::new(); + + for batch_result in reader { + let batch: RecordBatch = batch_result.map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Parquet batch read failed: {e}")) + })?; + + for row_idx in 0..batch.num_rows() { + if let Some(max) = params.max_rows { + if all_rows.len() >= max { + break; + } + } + + let mut row = Vec::with_capacity(batch.num_columns()); + for col_idx in 0..batch.num_columns() { + let col: &Arc<dyn Array> = batch.column(col_idx); + let val = array_value_to_string(col.as_ref(), row_idx); + row.push(val); + } + all_rows.push(row); + } + + if let Some(max) = params.max_rows { + if all_rows.len() >= max { + break; + } + } + } + + let tabular = TabularData::new(columns.clone(), all_rows.clone()) + .with_source_format("parquet"); + + // Flatten to text for regex/dictionary scanning + let mut text_parts = Vec::new(); + for row in 
&all_rows { + text_parts.push(row.join("\t")); + } + let flat_text = text_parts.join("\n"); + let doc = Document::new(flat_text).with_source_format("parquet"); + + Ok(vec![ + LoaderOutput::Tabular(tabular), + LoaderOutput::Document(doc), + ]) + } +} + +fn array_value_to_string(array: &dyn Array, index: usize) -> String { + if array.is_null(index) { + return String::new(); + } + + // Use Arrow's display formatting + use std::fmt::Write; + let mut buf = String::new(); + let formatter = arrow::util::display::ArrayFormatter::try_new(array, &Default::default()); + match formatter { + Ok(f) => { + let _ = write!(buf, "{}", f.value(index)); + buf + } + Err(_) => String::new(), + } +} diff --git a/crates/nvisy-ingest/src/loaders/pdf_loader.rs b/crates/nvisy-ingest/src/loaders/pdf_loader.rs new file mode 100644 index 0000000..9847c87 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/pdf_loader.rs @@ -0,0 +1,168 @@ +//! PDF file loader using `pdf-extract` and `lopdf`. + +use bytes::Bytes; +use serde::Deserialize; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, ImageData}; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +/// Typed parameters for [`PdfLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct PdfLoaderParams { + /// Whether to extract embedded images from the PDF. + #[serde(default = "default_true")] + pub extract_images: bool, + /// Maximum number of pages to process. `None` means all pages. + #[serde(default)] + pub max_pages: Option<u32>, +} + +fn default_true() -> bool { + true +} + +/// Extracts text and optionally images from PDF files. +pub struct PdfLoader; + +#[async_trait::async_trait] +impl Loader for PdfLoader { + type Params = PdfLoaderParams; + + fn id(&self) -> &str { + "pdf" + } + + fn extensions(&self) -> &[&str] { + &["pdf"] + } + + fn content_types(&self) -> &[&str] { + &["application/pdf"] + } + + async fn load( + &self, + blob: &Blob, + params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let bytes = blob.content.to_vec(); + let mut outputs = Vec::new(); + + // Extract text + let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PDF text extraction failed: {e}")) + })?; + + let lop_doc = lopdf::Document::load_mem(&bytes).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PDF parsing failed: {e}")) + })?; + + let page_count = lop_doc.get_pages().len() as u32; + + let doc = Document::new(text) + .with_source_format("pdf") + .with_page_count(page_count); + + outputs.push(LoaderOutput::Document(doc)); + + // Extract embedded images + if params.extract_images { + let max_pages = params.max_pages.unwrap_or(page_count); + for (page_num, page_id) in lop_doc.get_pages() { + if page_num > max_pages { + break; + } + + let (resources_opt, _) = match lop_doc.get_page_resources(page_id) { + Ok(r) => r, + Err(_) => continue, + }; + + let resources = match resources_opt { + Some(res) => res, + None => continue, + }; + + let xobject_obj = match resources.get(b"XObject") { + Ok(obj) => obj, + Err(_) => continue, + }; + + let xobjects = match lop_doc.dereference(xobject_obj) { + Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(), + _ => continue, + }; + + for (_name, obj_ref) in xobjects.iter() { + let stream = match lop_doc.dereference(obj_ref) { + Ok((_, lopdf::Object::Stream(s))) => s.clone(), + _ => continue, + }; + + let is_image = stream + .dict + .get(b"Subtype") + .ok() + .and_then(|s| { 
+ if let lopdf::Object::Name(n) = s { + Some(n.as_slice() == b"Image") + } else { + None + } + }) + .unwrap_or(false); + + if !is_image { + continue; + } + + let image_bytes = stream.content.clone(); + if image_bytes.is_empty() { + continue; + } + + let width = stream + .dict + .get(b"Width") + .ok() + .and_then(|w| { + if let lopdf::Object::Integer(i) = w { + Some(*i as u32) + } else { + None + } + }); + + let height = stream + .dict + .get(b"Height") + .ok() + .and_then(|h| { + if let lopdf::Object::Integer(i) = h { + Some(*i as u32) + } else { + None + } + }); + + let mut img = ImageData::new( + Bytes::from(image_bytes), + "image/png", + ) + .with_page_number(page_num); + + if let (Some(w), Some(h)) = (width, height) { + img = img.with_dimensions(w, h); + } + + outputs.push(LoaderOutput::Image(img)); + } + } + } + + Ok(outputs) + } +} diff --git a/crates/nvisy-detect/src/loaders/plaintext.rs b/crates/nvisy-ingest/src/loaders/plaintext.rs similarity index 95% rename from crates/nvisy-detect/src/loaders/plaintext.rs rename to crates/nvisy-ingest/src/loaders/plaintext.rs index a885a33..9222d4e 100644 --- a/crates/nvisy-detect/src/loaders/plaintext.rs +++ b/crates/nvisy-ingest/src/loaders/plaintext.rs @@ -3,7 +3,7 @@ use nvisy_core::datatypes::blob::Blob; use nvisy_core::datatypes::document::Document; use nvisy_core::error::Error; -use nvisy_core::registry::loader::{Loader, LoaderOutput}; +use super::{Loader, LoaderOutput}; /// Loads plain-text blobs into a single [`Document`]. /// diff --git a/crates/nvisy-ingest/src/loaders/xlsx_loader.rs b/crates/nvisy-ingest/src/loaders/xlsx_loader.rs new file mode 100644 index 0000000..5fa6cf6 --- /dev/null +++ b/crates/nvisy-ingest/src/loaders/xlsx_loader.rs @@ -0,0 +1,116 @@ +//! Excel XLSX/XLS file loader using `calamine`. + +use serde::Deserialize; +use std::io::Cursor; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_core::error::{Error, ErrorKind}; +use super::{Loader, LoaderOutput}; + +use calamine::{Reader, open_workbook_auto_from_rs}; + +/// Typed parameters for [`XlsxLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct XlsxLoaderParams { + /// Maximum number of rows per sheet. `None` means all rows. + #[serde(default)] + pub max_rows: Option<usize>, + /// Specific sheet names to load. Empty means all sheets. + #[serde(default)] + pub sheets: Vec<String>, +} + +/// Extracts tabular data per sheet from XLSX/XLS files, plus a flattened +/// text document for regex/dictionary scanning. 
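+///
+/// The first row of each sheet is used as the header row. A params payload
+/// might look like the following sketch (both fields are optional; names
+/// follow the `camelCase` rename on [`XlsxLoaderParams`]):
+///
+/// ```ignore
+/// let params: XlsxLoaderParams = serde_json::from_str(
+///     r#"{ "maxRows": 10000, "sheets": ["Sheet1"] }"#,
+/// )?;
+/// ```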
+pub struct XlsxLoader; + +#[async_trait::async_trait] +impl Loader for XlsxLoader { + type Params = XlsxLoaderParams; + + fn id(&self) -> &str { + "xlsx" + } + + fn extensions(&self) -> &[&str] { + &["xlsx", "xls"] + } + + fn content_types(&self) -> &[&str] { + &[ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/vnd.ms-excel", + ] + } + + async fn load( + &self, + blob: &Blob, + params: &Self::Params, + ) -> Result<Vec<LoaderOutput>, Error> { + let cursor = Cursor::new(blob.content.to_vec()); + let mut workbook = open_workbook_auto_from_rs(cursor).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("XLSX open failed: {e}")) + })?; + + let sheet_names: Vec<String> = workbook.sheet_names().to_vec(); + let mut outputs = Vec::new(); + let mut all_text_parts = Vec::new(); + + for sheet_name in &sheet_names { + if !params.sheets.is_empty() + && !params.sheets.iter().any(|s| s == sheet_name) + { + continue; + } + + let range = match workbook.worksheet_range(sheet_name) { + Ok(r) => r, + Err(e) => { + tracing::warn!("Skipping sheet '{}': {}", sheet_name, e); + continue; + } + }; + + let mut rows_iter = range.rows(); + + // First row as headers + let columns: Vec<String> = match rows_iter.next() { + Some(header_row) => header_row + .iter() + .map(|c| c.to_string()) + .collect(), + None => continue, + }; + + let mut rows = Vec::new(); + for row in rows_iter { + if let Some(max) = params.max_rows { + if rows.len() >= max { + break; + } + } + let row_data: Vec<String> = row.iter().map(|c| c.to_string()).collect(); + all_text_parts.push(row_data.join("\t")); + rows.push(row_data); + } + + let tabular = TabularData::new(columns, rows) + .with_source_format("xlsx") + .with_sheet_name(sheet_name); + + outputs.push(LoaderOutput::Tabular(tabular)); + } + + // Create a flattened document for regex/dictionary scanning + if !all_text_parts.is_empty() { + let doc = Document::new(all_text_parts.join("\n")) + .with_source_format("xlsx"); + outputs.push(LoaderOutput::Document(doc)); + } + + Ok(outputs) + } +} diff --git a/crates/nvisy-ingest/src/prelude.rs b/crates/nvisy-ingest/src/prelude.rs new file mode 100644 index 0000000..caf6edb --- /dev/null +++ b/crates/nvisy-ingest/src/prelude.rs @@ -0,0 +1,19 @@ +//! Convenience re-exports. 
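+//!
+//! Loaders gated behind optional Cargo features are only re-exported when
+//! the corresponding feature is enabled.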
+ +pub use crate::loaders::csv_loader::CsvLoader; +pub use crate::loaders::json_loader::JsonLoader; +pub use crate::loaders::plaintext::PlaintextLoader; + +#[cfg(feature = "pdf")] +pub use crate::loaders::pdf_loader::PdfLoader; +#[cfg(feature = "docx")] +pub use crate::loaders::docx_loader::DocxLoader; +#[cfg(feature = "html")] +pub use crate::loaders::html_loader::HtmlLoader; +#[cfg(feature = "image")] +pub use crate::loaders::image_loader::ImageLoader; +#[cfg(feature = "parquet")] +pub use crate::loaders::parquet_loader::ParquetLoader; +#[cfg(feature = "xlsx")] +pub use crate::loaders::xlsx_loader::XlsxLoader; +pub use crate::loaders::audio_loader::AudioLoader; diff --git a/crates/nvisy-media/Cargo.toml b/crates/nvisy-media/Cargo.toml new file mode 100644 index 0000000..b93c7fd --- /dev/null +++ b/crates/nvisy-media/Cargo.toml @@ -0,0 +1,49 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-media" +description = "Pixel-level image redaction, tabular redaction, and PDF reassembly for Nvisy" +keywords = ["nvisy", "media", "redaction", "image", "pdf"] +categories = ["multimedia::images"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Async runtime +tokio = { workspace = true, features = ["sync"] } +async-trait = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["v4"] } +bytes = { workspace = true } + +# Image processing +image = { workspace = true } +imageproc = { workspace = true } + +# PDF manipulation +lopdf = { workspace = true } + +# Observability +tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-media/README.md b/crates/nvisy-media/README.md new file mode 100644 index 0000000..df45d19 --- /dev/null +++ b/crates/nvisy-media/README.md @@ -0,0 +1,7 @@ +# nvisy-media + +Pixel-level image redaction, tabular redaction, and PDF reassembly. + +This crate provides media processing actions for the Nvisy redaction +pipeline, including image blur/block overlays, tabular cell redaction, +and PDF content stream replacement. diff --git a/crates/nvisy-media/src/actions/apply_audio_redaction.rs b/crates/nvisy-media/src/actions/apply_audio_redaction.rs new file mode 100644 index 0000000..7c185ae --- /dev/null +++ b/crates/nvisy-media/src/actions/apply_audio_redaction.rs @@ -0,0 +1,54 @@ +//! Placeholder audio redaction action. + +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; + +/// Typed parameters for [`ApplyAudioRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyAudioRedactionParams { + /// Time segments to mute, as `(start_seconds, end_seconds)` pairs. + #[serde(default)] + pub mute_segments: Vec<(f64, f64)>, +} + +/// Placeholder action for audio redaction. 
+///
+/// Logs a warning and passes blobs through unchanged; audio redaction is
+/// not yet implemented.
+pub struct ApplyAudioRedactionAction;
+
+#[async_trait::async_trait]
+impl Action for ApplyAudioRedactionAction {
+    type Params = ApplyAudioRedactionParams;
+
+    fn id(&self) -> &str {
+        "apply-audio-redaction"
+    }
+
+    fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> {
+        Ok(())
+    }
+
+    async fn execute(
+        &self,
+        mut input: mpsc::Receiver<Blob>,
+        output: mpsc::Sender<Blob>,
+        _params: Self::Params,
+    ) -> Result<u64, Error> {
+        // Pass through blobs unchanged — audio redaction is not implemented
+        while let Some(blob) = input.recv().await {
+            tracing::warn!("Audio redaction not yet implemented, passing through unchanged");
+            if output.send(blob).await.is_err() {
+                return Err(Error::new(
+                    ErrorKind::Runtime,
+                    "output channel closed while passing audio blob through",
+                ));
+            }
+        }
+        Ok(0)
+    }
+}
diff --git a/crates/nvisy-media/src/actions/apply_image_redaction.rs b/crates/nvisy-media/src/actions/apply_image_redaction.rs
new file mode 100644
index 0000000..d7e4e93
--- /dev/null
+++ b/crates/nvisy-media/src/actions/apply_image_redaction.rs
@@ -0,0 +1,159 @@
+//! Image redaction action — applies blur or block overlay to image regions.
+
+use bytes::Bytes;
+use serde::Deserialize;
+use tokio::sync::mpsc;
+
+use nvisy_core::datatypes::blob::Blob;
+use nvisy_core::datatypes::document::ImageData;
+use nvisy_ontology::ontology::entity::{BoundingBox, Entity};
+use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod};
+use nvisy_core::error::{Error, ErrorKind};
+use nvisy_core::registry::action::Action;
+
+use crate::render::{blur, block};
+
+/// Typed parameters for [`ApplyImageRedactionAction`].
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct ApplyImageRedactionParams {
+    /// Sigma value for gaussian blur.
+    #[serde(default = "default_sigma")]
+    pub blur_sigma: f32,
+    /// RGBA color for block overlays.
+    #[serde(default = "default_color")]
+    pub block_color: [u8; 4],
+}
+
+fn default_sigma() -> f32 {
+    15.0
+}
+fn default_color() -> [u8; 4] {
+    [0, 0, 0, 255]
+}
+
+/// Applies blur or block redaction to image regions identified by entities
+/// with bounding boxes.
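+///
+/// Entities without a bounding box are ignored, redaction methods other
+/// than `Blur` and `Block` fall back to a block overlay, and redacted
+/// images are re-encoded as PNG.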
+pub struct ApplyImageRedactionAction; + +#[async_trait::async_trait] +impl Action for ApplyImageRedactionAction { + type Params = ApplyImageRedactionParams; + + fn id(&self) -> &str { + "apply-image-redaction" + } + + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + params: Self::Params, + ) -> Result<u64, Error> { + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read images: {e}")) + })?; + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities: {e}")) + })?; + let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read redactions: {e}")) + })?; + + // Build entity->redaction map + let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions + .iter() + .filter(|r| !r.applied) + .map(|r| (r.entity_id, r)) + .collect(); + + // Collect entities with bounding boxes, grouped by redaction method + let mut blur_regions: Vec<BoundingBox> = Vec::new(); + let mut block_regions: Vec<BoundingBox> = Vec::new(); + + for entity in &entities { + if let Some(bbox) = &entity.location.bounding_box { + if let Some(redaction) = redaction_map.get(&entity.data.id) { + match redaction.method { + RedactionMethod::Blur => blur_regions.push(bbox.clone()), + RedactionMethod::Block => block_regions.push(bbox.clone()), + // Default non-image methods to block for images + _ => block_regions.push(bbox.clone()), + } + } + } + } + + if !blur_regions.is_empty() || !block_regions.is_empty() { + // Process each image + let mut new_images = Vec::new(); + for img in &images { + let dyn_img = image::load_from_memory(&img.image_data).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) + })?; + + let mut result = dyn_img; + if !blur_regions.is_empty() { + result = blur::apply_gaussian_blur(&result, &blur_regions, params.blur_sigma); + } + if !block_regions.is_empty() { + let color = image::Rgba(params.block_color); + result = block::apply_block_overlay(&result, &block_regions, color); + } + + // Encode back to PNG + let mut buf = std::io::Cursor::new(Vec::new()); + result + .write_to(&mut buf, image::ImageFormat::Png) + .map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) + })?; + + let new_img = ImageData::new( + Bytes::from(buf.into_inner()), + "image/png", + ) + .with_dimensions(result.width(), result.height()); + + new_images.push(new_img); + count += 1; + } + + // Replace images artifact + blob.artifacts.remove("images"); + for img in &new_images { + blob.add_artifact("images", img).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add image: {e}")) + })?; + } + + // Mark redactions as applied + let mut updated_redactions: Vec<Redaction> = redactions.clone(); + for r in &mut updated_redactions { + if redaction_map.contains_key(&r.entity_id) { + r.applied = true; + } + } + blob.artifacts.remove("redactions"); + for r in &updated_redactions { + blob.add_artifact("redactions", r).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add redaction: {e}")) + })?; + } + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} diff --git 
a/crates/nvisy-media/src/actions/apply_pdf_redaction.rs b/crates/nvisy-media/src/actions/apply_pdf_redaction.rs
new file mode 100644
index 0000000..7593382
--- /dev/null
+++ b/crates/nvisy-media/src/actions/apply_pdf_redaction.rs
@@ -0,0 +1,153 @@
+//! PDF reassembly action — writes redacted content back to PDF bytes.
+
+use bytes::Bytes;
+use serde::Deserialize;
+use tokio::sync::mpsc;
+
+use nvisy_core::datatypes::blob::Blob;
+use nvisy_core::datatypes::document::{Document, ImageData};
+use nvisy_core::error::{Error, ErrorKind};
+use nvisy_core::registry::action::Action;
+
+/// Typed parameters for [`ApplyPdfRedactionAction`].
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct ApplyPdfRedactionParams {}
+
+/// Reassembles redacted images back into the original PDF.
+///
+/// Uses `lopdf` to replace embedded image XObjects with redacted image data
+/// and write the modified PDF back to `blob.content`. Content-stream text
+/// replacement is not yet implemented; redacted `Document` artifacts are
+/// read but not written back.
+pub struct ApplyPdfRedactionAction;
+
+#[async_trait::async_trait]
+impl Action for ApplyPdfRedactionAction {
+    type Params = ApplyPdfRedactionParams;
+
+    fn id(&self) -> &str {
+        "apply-pdf-redaction"
+    }
+
+    fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> {
+        Ok(())
+    }
+
+    async fn execute(
+        &self,
+        mut input: mpsc::Receiver<Blob>,
+        output: mpsc::Sender<Blob>,
+        _params: Self::Params,
+    ) -> Result<u64, Error> {
+        let mut count = 0u64;
+
+        while let Some(mut blob) = input.recv().await {
+            let _documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| {
+                Error::new(ErrorKind::Runtime, format!("failed to read documents: {e}"))
+            })?;
+            let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| {
+                Error::new(ErrorKind::Runtime, format!("failed to read images: {e}"))
+            })?;
+
+            // Only process if the blob is actually a PDF
+            let is_pdf = blob
+                .content_type()
+                .map(|ct| ct == "application/pdf")
+                .unwrap_or(false);
+
+            if !is_pdf {
+                if output.send(blob).await.is_err() {
+                    return Ok(count);
+                }
+                continue;
+            }
+
+            let mut pdf_doc = lopdf::Document::load_mem(&blob.content).map_err(|e| {
+                Error::new(ErrorKind::Runtime, format!("PDF load failed: {e}"))
+            })?;
+
+            // Replace embedded image XObjects with redacted versions
+            if !images.is_empty() {
+                let pages: Vec<(u32, lopdf::ObjectId)> =
+                    pdf_doc.get_pages().into_iter().collect();
+                let mut image_idx = 0;
+
+                for (_page_num, page_id) in &pages {
+                    let (resources_opt, _) = match pdf_doc.get_page_resources(*page_id) {
+                        Ok(r) => r,
+                        Err(_) => continue,
+                    };
+
+                    let resources = match resources_opt {
+                        Some(res) => res.clone(),
+                        None => continue,
+                    };
+
+                    let xobject_obj = match resources.get(b"XObject") {
+                        Ok(obj) => obj.clone(),
+                        Err(_) => continue,
+                    };
+
+                    let xobjects = match pdf_doc.dereference(&xobject_obj) {
+                        Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(),
+                        _ => continue,
+                    };
+
+                    for (_name, obj_ref) in xobjects.iter() {
+                        let stream_id = match obj_ref {
+                            lopdf::Object::Reference(id) => Some(*id),
+                            _ => None,
+                        };
+
+                        let is_image = match pdf_doc.dereference(obj_ref) {
+                            Ok((_, lopdf::Object::Stream(s))) => s
+                                .dict
+                                .get(b"Subtype")
+                                .ok()
+                                .and_then(|st| {
+                                    if let lopdf::Object::Name(n) = st {
+                                        Some(n.as_slice() == b"Image")
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .unwrap_or(false),
+                            _ => false,
+                        };
+
+                        if is_image {
+                            if let (Some(sid), Some(redacted_img)) =
+                                (stream_id, images.get(image_idx))
+                            {
+                                let new_stream = lopdf::Stream::new(
+                                    lopdf::Dictionary::new(),
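+                                    // NOTE: the replacement stream is written with an
+                                    // empty dictionary, so the original XObject's Width,
+                                    // Height, ColorSpace, and Filter entries are not
+                                    // carried over.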
redacted_img.image_data.to_vec(), + ); + pdf_doc + .objects + .insert(sid, lopdf::Object::Stream(new_stream)); + image_idx += 1; + } + } + } + } + } + + // Write the modified PDF to a buffer + let mut output_buf = Vec::new(); + pdf_doc.save_to(&mut output_buf).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PDF save failed: {e}")) + })?; + + blob.content = Bytes::from(output_buf); + count += 1; + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} diff --git a/crates/nvisy-media/src/actions/apply_tabular_redaction.rs b/crates/nvisy-media/src/actions/apply_tabular_redaction.rs new file mode 100644 index 0000000..ead39f9 --- /dev/null +++ b/crates/nvisy-media/src/actions/apply_tabular_redaction.rs @@ -0,0 +1,150 @@ +//! Tabular data redaction action — applies redaction to specific cells. + +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::TabularData; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; + +/// Typed parameters for [`ApplyTabularRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyTabularRedactionParams {} + +/// Applies pending redactions to tabular data cells. +/// +/// For entities with `row_index` and `column_index`, the corresponding cell +/// value is redacted according to the redaction method (mask, replace, +/// remove, hash). +pub struct ApplyTabularRedactionAction; + +#[async_trait::async_trait] +impl Action for ApplyTabularRedactionAction { + type Params = ApplyTabularRedactionParams; + + fn id(&self) -> &str { + "apply-tabular-redaction" + } + + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + _params: Self::Params, + ) -> Result<u64, Error> { + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + let mut tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read tabular: {e}")) + })?; + let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read entities: {e}")) + })?; + let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to read redactions: {e}")) + })?; + + // Build entity->redaction map + let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions + .iter() + .filter(|r| !r.applied) + .map(|r| (r.entity_id, r)) + .collect(); + + let mut modified = false; + + for entity in &entities { + if let (Some(row_idx), Some(col_idx)) = + (entity.location.row_index, entity.location.column_index) + { + if let Some(redaction) = redaction_map.get(&entity.data.id) { + // Apply to all matching tables + for table in &mut tables { + if let Some(row) = table.rows.get_mut(row_idx) { + if let Some(cell) = row.get_mut(col_idx) { + *cell = apply_cell_redaction( + cell, + redaction.method, + &redaction.replacement_value, + ); + modified = true; + count += 1; + } + } + } + } + } + } + + if modified { + // Replace tabular artifact + blob.artifacts.remove("tabular"); + for table in &tables { + blob.add_artifact("tabular", table).map_err(|e| { + Error::new(ErrorKind::Runtime, 
format!("failed to add tabular: {e}")) + })?; + } + + // Mark redactions as applied + let mut updated_redactions: Vec<Redaction> = redactions.clone(); + for r in &mut updated_redactions { + if redaction_map.contains_key(&r.entity_id) { + r.applied = true; + } + } + blob.artifacts.remove("redactions"); + for r in &updated_redactions { + blob.add_artifact("redactions", r).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add redaction: {e}")) + })?; + } + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} + +fn apply_cell_redaction( + cell: &str, + method: RedactionMethod, + replacement: &str, +) -> String { + match method { + RedactionMethod::Mask => { + // Mask all but last 4 characters + if cell.len() > 4 { + format!("{}{}", "*".repeat(cell.len() - 4), &cell[cell.len() - 4..]) + } else { + "*".repeat(cell.len()) + } + } + RedactionMethod::Replace => replacement.to_string(), + RedactionMethod::Remove => String::new(), + RedactionMethod::Hash => { + // Simple hash representation + format!("[HASH:{:x}]", hash_string(cell)) + } + _ => replacement.to_string(), + } +} + +fn hash_string(s: &str) -> u64 { + use std::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + s.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/nvisy-media/src/actions/mod.rs b/crates/nvisy-media/src/actions/mod.rs new file mode 100644 index 0000000..5e37f74 --- /dev/null +++ b/crates/nvisy-media/src/actions/mod.rs @@ -0,0 +1,10 @@ +//! Pipeline actions for applying redactions to media (images, tabular data, PDFs). + +/// Applies image redactions (blur, block) to image artifacts. +pub mod apply_image_redaction; +/// Applies redactions to tabular data cells. +pub mod apply_tabular_redaction; +/// Reassembles redacted content into PDF files. +pub mod apply_pdf_redaction; +/// Placeholder for audio redaction. +pub mod apply_audio_redaction; diff --git a/crates/nvisy-media/src/lib.rs b/crates/nvisy-media/src/lib.rs new file mode 100644 index 0000000..cf43651 --- /dev/null +++ b/crates/nvisy-media/src/lib.rs @@ -0,0 +1,11 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +/// Image rendering primitives (blur, block overlay). +pub mod render; +/// Pipeline actions for applying redactions to media. +pub mod actions; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-media/src/prelude.rs b/crates/nvisy-media/src/prelude.rs new file mode 100644 index 0000000..b450238 --- /dev/null +++ b/crates/nvisy-media/src/prelude.rs @@ -0,0 +1,5 @@ +//! Convenience re-exports. +pub use crate::actions::apply_image_redaction::ApplyImageRedactionAction; +pub use crate::actions::apply_tabular_redaction::ApplyTabularRedactionAction; +pub use crate::actions::apply_pdf_redaction::ApplyPdfRedactionAction; +pub use crate::actions::apply_audio_redaction::ApplyAudioRedactionAction; diff --git a/crates/nvisy-media/src/render/block.rs b/crates/nvisy-media/src/render/block.rs new file mode 100644 index 0000000..a707914 --- /dev/null +++ b/crates/nvisy-media/src/render/block.rs @@ -0,0 +1,36 @@ +//! Solid color block overlay for image regions. + +use image::{DynamicImage, Rgba, RgbaImage}; +use nvisy_ontology::ontology::entity::BoundingBox; + +/// Apply a solid color block overlay to the specified regions of an image. +/// +/// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) +/// that will be covered with an opaque rectangle of the given `color`. 
+pub fn apply_block_overlay( + image: &DynamicImage, + regions: &[BoundingBox], + color: Rgba<u8>, +) -> DynamicImage { + let mut result = image.to_rgba8(); + let img_w = result.width(); + let img_h = result.height(); + + for region in regions { + let x = region.x.round() as u32; + let y = region.y.round() as u32; + let w = region.width.round() as u32; + let h = region.height.round() as u32; + + if x >= img_w || y >= img_h { + continue; + } + let w = w.min(img_w - x); + let h = h.min(img_h - y); + + let block = RgbaImage::from_pixel(w, h, color); + image::imageops::overlay(&mut result, &block, x as i64, y as i64); + } + + DynamicImage::ImageRgba8(result) +} diff --git a/crates/nvisy-media/src/render/blur.rs b/crates/nvisy-media/src/render/blur.rs new file mode 100644 index 0000000..4c7e56d --- /dev/null +++ b/crates/nvisy-media/src/render/blur.rs @@ -0,0 +1,43 @@ +//! Gaussian blur for image regions. + +use image::DynamicImage; +use imageproc::filter::gaussian_blur_f32; +use nvisy_ontology::ontology::entity::BoundingBox; + +/// Apply gaussian blur to the specified regions of an image. +/// +/// Each [`BoundingBox`] describes a rectangular region (in pixel coordinates) +/// that will be blurred with the given `sigma` value. +pub fn apply_gaussian_blur( + image: &DynamicImage, + regions: &[BoundingBox], + sigma: f32, +) -> DynamicImage { + let mut result = image.clone(); + + for region in regions { + let x = region.x.round() as u32; + let y = region.y.round() as u32; + let w = region.width.round() as u32; + let h = region.height.round() as u32; + + // Clamp to image bounds + let img_w = result.width(); + let img_h = result.height(); + if x >= img_w || y >= img_h { + continue; + } + let w = w.min(img_w - x); + let h = h.min(img_h - y); + if w == 0 || h == 0 { + continue; + } + + // Crop the region, blur it, paste it back + let sub = result.crop_imm(x, y, w, h); + let blurred = DynamicImage::ImageRgba8(gaussian_blur_f32(&sub.to_rgba8(), sigma)); + image::imageops::overlay(&mut result, &blurred, x as i64, y as i64); + } + + result +} diff --git a/crates/nvisy-media/src/render/mod.rs b/crates/nvisy-media/src/render/mod.rs new file mode 100644 index 0000000..1796d48 --- /dev/null +++ b/crates/nvisy-media/src/render/mod.rs @@ -0,0 +1,6 @@ +//! Image rendering primitives for redaction overlays. + +/// Gaussian blur for image regions. +pub mod blur; +/// Solid color block overlay for image regions. 
+pub mod block; diff --git a/crates/nvisy-object/Cargo.toml b/crates/nvisy-object/Cargo.toml index bd5903a..dd265af 100644 --- a/crates/nvisy-object/Cargo.toml +++ b/crates/nvisy-object/Cargo.toml @@ -37,9 +37,11 @@ async-trait = { workspace = true, features = [] } uuid = { workspace = true, features = ["v4"] } bytes = { workspace = true, features = [] } -# AWS SDK -aws-sdk-s3 = { workspace = true, features = [] } -aws-config = { workspace = true, features = [] } +# S3-compatible object storage +minio = { workspace = true, features = [] } + +# Async streams +futures = { workspace = true, features = [] } # Error handling thiserror = { workspace = true, features = [] } diff --git a/crates/nvisy-object/src/prelude.rs b/crates/nvisy-object/src/prelude.rs index 38fde55..3fc3355 100644 --- a/crates/nvisy-object/src/prelude.rs +++ b/crates/nvisy-object/src/prelude.rs @@ -2,3 +2,4 @@ pub use crate::providers::s3::S3ProviderFactory; pub use crate::streams::read::ObjectReadStream; pub use crate::streams::write::ObjectWriteStream; +pub use crate::streams::{StreamSource, StreamTarget}; diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs index f5f7574..d264112 100644 --- a/crates/nvisy-object/src/providers/s3.rs +++ b/crates/nvisy-object/src/providers/s3.rs @@ -1,30 +1,35 @@ -//! AWS S3 (and S3-compatible) provider implementation. +//! S3-compatible provider implementation using the MinIO Rust SDK. //! //! Provides [`S3ObjectStoreClient`] which implements [`ObjectStoreClient`] and //! [`S3ProviderFactory`] which plugs into the engine's provider system. +//! +//! Works with MinIO, AWS S3, and any S3-compatible service. -use aws_config::BehaviorVersion; -use aws_sdk_s3::Client as S3Client; use bytes::Bytes; use serde::Deserialize; +use minio::s3::creds::StaticProvider; +use minio::s3::http::BaseUrl; +use minio::s3::types::{S3Api, ToStream}; +use minio::s3::{Client as MinioClient, ClientBuilder as MinioClientBuilder}; + use nvisy_core::error::Error; use nvisy_core::registry::provider::{ConnectedInstance, ProviderFactory}; use crate::client::{GetResult, ListResult, ObjectStoreBox, ObjectStoreClient}; /// S3-compatible object store client. /// -/// Wraps the AWS SDK [`S3Client`] and scopes all operations to a single bucket. +/// Wraps the MinIO [`MinioClient`] and scopes all operations to a single bucket. pub struct S3ObjectStoreClient { - /// Underlying AWS SDK client. - client: S3Client, + /// Underlying MinIO client. + client: MinioClient, /// Target S3 bucket name. bucket: String, } impl S3ObjectStoreClient { /// Create a new client bound to the given `bucket`. 
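+    ///
+    /// The underlying MinIO client is normally built by the provider
+    /// factory's `connect` implementation, which also supplies the bucket
+    /// from the validated credentials.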
- pub fn new(client: S3Client, bucket: String) -> Self { + pub fn new(client: MinioClient, bucket: String) -> Self { Self { client, bucket } } } @@ -32,70 +37,74 @@ impl S3ObjectStoreClient { #[async_trait::async_trait] impl ObjectStoreClient for S3ObjectStoreClient { async fn list(&self, prefix: &str, cursor: Option<&str>) -> Result<ListResult, Box<dyn std::error::Error + Send + Sync>> { - let mut req = self.client - .list_objects_v2() - .bucket(&self.bucket) - .prefix(prefix); + use futures::StreamExt; + + let mut builder = self.client + .list_objects(&self.bucket) + .recursive(true) + .use_api_v1(false); + + if !prefix.is_empty() { + builder = builder.prefix(Some(prefix.to_string())); + } if let Some(token) = cursor { - req = req.continuation_token(token); + builder = builder.continuation_token(Some(token.to_string())); } - let resp = req.send().await?; + let mut stream = builder.to_stream().await; - let keys: Vec<String> = resp - .contents() - .iter() - .filter_map(|obj| obj.key().map(|k| k.to_string())) - .collect(); + // Fetch one page + if let Some(result) = stream.next().await { + let resp = result?; + let keys: Vec<String> = resp.contents + .iter() + .filter(|entry| !entry.is_prefix) + .map(|entry| entry.name.clone()) + .collect(); - let next_cursor = resp.next_continuation_token().map(|s| s.to_string()); + let next_cursor = resp.next_continuation_token.clone(); - Ok(ListResult { keys, next_cursor }) + Ok(ListResult { keys, next_cursor }) + } else { + Ok(ListResult { keys: vec![], next_cursor: None }) + } } async fn get(&self, key: &str) -> Result<GetResult, Box<dyn std::error::Error + Send + Sync>> { let resp = self.client - .get_object() - .bucket(&self.bucket) - .key(key) + .get_object(&self.bucket, key) .send() .await?; - let content_type = resp.content_type().map(|s| s.to_string()); - let body = resp.body.collect().await?; - let data = body.into_bytes(); + let data = resp.content.to_segmented_bytes().await?.to_bytes(); - Ok(GetResult { data, content_type }) + Ok(GetResult { data, content_type: None }) } async fn put(&self, key: &str, data: Bytes, content_type: Option<&str>) -> Result<(), Box<dyn std::error::Error + Send + Sync>> { - let mut req = self.client - .put_object() - .bucket(&self.bucket) - .key(key) - .body(data.into()); + let content = minio::s3::builders::ObjectContent::from(data); + let mut builder = self.client + .put_object_content(&self.bucket, key, content); if let Some(ct) = content_type { - req = req.content_type(ct); + builder = builder.content_type(ct.to_string()); } - req.send().await?; + builder.send().await?; Ok(()) } async fn delete(&self, key: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> { self.client - .delete_object() - .bucket(&self.bucket) - .key(key) + .delete_object(&self.bucket, key) .send() .await?; Ok(()) } } -/// Typed credentials for S3 provider. +/// Typed credentials for S3-compatible provider. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct S3Credentials { @@ -104,16 +113,17 @@ pub struct S3Credentials { /// AWS region (defaults to `us-east-1`). #[serde(default = "default_region")] pub region: String, - /// Custom endpoint URL for S3-compatible services. + /// Endpoint URL (e.g. `http://localhost:9000` for MinIO). + /// Required for non-AWS S3-compatible services. #[serde(default)] pub endpoint: Option<String>, - /// AWS access key ID for static credentials. + /// Access key ID for static credentials. 
#[serde(default)] pub access_key_id: Option<String>, - /// AWS secret access key for static credentials. + /// Secret access key for static credentials. #[serde(default)] pub secret_access_key: Option<String>, - /// AWS session token for temporary credentials. + /// Session token for temporary credentials. #[serde(default)] pub session_token: Option<String>, } @@ -131,41 +141,38 @@ impl ProviderFactory for S3ProviderFactory { fn id(&self) -> &str { "s3" } fn validate_credentials(&self, _creds: &Self::Credentials) -> Result<(), Error> { - // Bucket is required by the struct, so if we got here it's present. Ok(()) } async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error> { self.validate_credentials(creds)?; - // Could do a HeadBucket call here for verification Ok(()) } async fn connect(&self, creds: &Self::Credentials) -> Result<ConnectedInstance<Self::Client>, Error> { - let mut config_loader = aws_config::defaults(BehaviorVersion::latest()) - .region(aws_sdk_s3::config::Region::new(creds.region.clone())); + let endpoint = creds.endpoint.as_deref().unwrap_or("https://s3.amazonaws.com"); + + let mut base_url: BaseUrl = endpoint.parse().map_err(|e| { + Error::runtime(format!("invalid endpoint URL: {e}"), "s3/connect", true) + })?; + base_url.region = creds.region.clone(); + + let mut builder = MinioClientBuilder::new(base_url); // If access_key and secret_key provided, use static credentials if let (Some(access_key), Some(secret_key)) = (&creds.access_key_id, &creds.secret_access_key) { - config_loader = config_loader.credentials_provider( - aws_sdk_s3::config::Credentials::new( - access_key, - secret_key, - creds.session_token.clone(), - None, - "nvisy-s3", - ), + let provider = StaticProvider::new( + access_key, + secret_key, + creds.session_token.as_deref(), ); + builder = builder.provider(Some(Box::new(provider))); } - let config = config_loader.load().await; - let mut s3_config = aws_sdk_s3::config::Builder::from(&config); - - if let Some(ref ep) = creds.endpoint { - s3_config = s3_config.endpoint_url(ep).force_path_style(true); - } + let client = builder.build().map_err(|e| { + Error::runtime(format!("failed to build MinIO client: {e}"), "s3/connect", true) + })?; - let client = S3Client::from_conf(s3_config.build()); let store_client = S3ObjectStoreClient::new(client, creds.bucket.clone()); Ok(ConnectedInstance { diff --git a/crates/nvisy-object/src/streams/mod.rs b/crates/nvisy-object/src/streams/mod.rs index f69ff65..6542776 100644 --- a/crates/nvisy-object/src/streams/mod.rs +++ b/crates/nvisy-object/src/streams/mod.rs @@ -1,4 +1,64 @@ //! Streaming read and write adapters for object stores. +use serde::de::DeserializeOwned; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::error::Error; + +/// A source stream that reads blobs from an external system into the pipeline. +/// +/// Implementations connect to a storage backend (e.g. S3, local filesystem) +/// and emit blobs into the pipeline's input channel. +#[async_trait::async_trait] +pub trait StreamSource: Send + Sync + 'static { + /// Strongly-typed parameters for this stream source. + type Params: DeserializeOwned + Send; + /// The client type this stream requires. + type Client: Send + 'static; + + /// Unique identifier for this stream source (e.g. `"s3-read"`). + fn id(&self) -> &str; + /// Validate source parameters before execution. + fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; + + /// Read blobs from the external system and send them to `output`. 
+ /// + /// Returns the number of blobs read. + async fn read( + &self, + output: mpsc::Sender<Blob>, + params: Self::Params, + client: Self::Client, + ) -> Result<u64, Error>; +} + +/// A target stream that writes blobs from the pipeline to an external system. +/// +/// Implementations receive processed blobs from the pipeline and persist +/// them to a storage backend. +#[async_trait::async_trait] +pub trait StreamTarget: Send + Sync + 'static { + /// Strongly-typed parameters for this stream target. + type Params: DeserializeOwned + Send; + /// The client type this stream requires. + type Client: Send + 'static; + + /// Unique identifier for this stream target (e.g. `"s3-write"`). + fn id(&self) -> &str; + /// Validate target parameters before execution. + fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; + + /// Receive blobs from `input` and write them to the external system. + /// + /// Returns the number of blobs written. + async fn write( + &self, + input: mpsc::Receiver<Blob>, + params: Self::Params, + client: Self::Client, + ) -> Result<u64, Error>; +} + pub mod read; pub mod write; diff --git a/crates/nvisy-object/src/streams/read.rs b/crates/nvisy-object/src/streams/read.rs index 02d7222..e4d5ee1 100644 --- a/crates/nvisy-object/src/streams/read.rs +++ b/crates/nvisy-object/src/streams/read.rs @@ -5,7 +5,7 @@ use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::error::Error; -use nvisy_core::registry::stream::StreamSource; +use super::StreamSource; use crate::client::ObjectStoreBox; /// Typed parameters for [`ObjectReadStream`]. diff --git a/crates/nvisy-object/src/streams/write.rs b/crates/nvisy-object/src/streams/write.rs index 1de42a2..51b9964 100644 --- a/crates/nvisy-object/src/streams/write.rs +++ b/crates/nvisy-object/src/streams/write.rs @@ -5,7 +5,7 @@ use tokio::sync::mpsc; use nvisy_core::datatypes::blob::Blob; use nvisy_core::error::Error; -use nvisy_core::registry::stream::StreamTarget; +use super::StreamTarget; use crate::client::ObjectStoreBox; /// Typed parameters for [`ObjectWriteStream`]. 
diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml new file mode 100644 index 0000000..ddbba21 --- /dev/null +++ b/crates/nvisy-ontology/Cargo.toml @@ -0,0 +1,43 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-ontology" +description = "Detection ontology and redaction policy types for the Nvisy platform" +keywords = ["nvisy", "ontology", "redaction", "policy"] +categories = ["data-structures"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[features] +schema = ["dep:schemars"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# JSON Schema generation +schemars = { workspace = true, optional = true } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } + +# Primitive datatypes +uuid = { workspace = true, features = ["serde", "v4"] } +chrono = { workspace = true, features = ["serde"] } + +# Error handling +derive_more = { workspace = true, features = ["display"] } diff --git a/crates/nvisy-ontology/README.md b/crates/nvisy-ontology/README.md new file mode 100644 index 0000000..00ccb86 --- /dev/null +++ b/crates/nvisy-ontology/README.md @@ -0,0 +1,3 @@ +# nvisy-ontology + +Detection ontology and redaction policy types for the Nvisy platform. Defines entities, redaction methods, audit records, and policy rules that all detection and redaction crates depend on. diff --git a/crates/nvisy-ontology/src/lib.rs b/crates/nvisy-ontology/src/lib.rs new file mode 100644 index 0000000..0fafae8 --- /dev/null +++ b/crates/nvisy-ontology/src/lib.rs @@ -0,0 +1,9 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +pub mod ontology; +pub mod redaction; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-core/src/ontology/audit.rs b/crates/nvisy-ontology/src/ontology/audit.rs similarity index 98% rename from crates/nvisy-core/src/ontology/audit.rs rename to crates/nvisy-ontology/src/ontology/audit.rs index 3e91148..bb55ab7 100644 --- a/crates/nvisy-core/src/ontology/audit.rs +++ b/crates/nvisy-ontology/src/ontology/audit.rs @@ -3,8 +3,8 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::datatypes::Data; -use crate::datatypes::Metadata; +use nvisy_core::datatypes::Data; +use nvisy_core::datatypes::Metadata; /// Kind of auditable action recorded in an [`Audit`] entry. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] diff --git a/crates/nvisy-core/src/ontology/entity.rs b/crates/nvisy-ontology/src/ontology/entity.rs similarity index 87% rename from crates/nvisy-core/src/ontology/entity.rs rename to crates/nvisy-ontology/src/ontology/entity.rs index 70fc576..14e69a5 100644 --- a/crates/nvisy-core/src/ontology/entity.rs +++ b/crates/nvisy-ontology/src/ontology/entity.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::datatypes::Data; +use nvisy_core::datatypes::Data; /// Category of sensitive data an entity belongs to. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -36,6 +36,10 @@ pub enum DetectionMethod { Checksum, /// Multiple methods combined to produce a single detection. Composite, + /// OCR text extraction with bounding boxes. + Ocr, + /// User-provided annotations. + Manual, } /// Axis-aligned bounding box for image-based entity locations. @@ -69,6 +73,18 @@ pub struct EntityLocation { /// Bounding box for image-based detections. #[serde(skip_serializing_if = "Option::is_none")] pub bounding_box: Option<BoundingBox>, + /// Tabular row index (0-based). + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub row_index: Option<usize>, + /// Tabular column index (0-based). + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub column_index: Option<usize>, + /// Links this entity to a specific [`ImageData`](nvisy_core::datatypes::document::ImageData). + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub image_id: Option<Uuid>, } /// A detected sensitive data occurrence within a document. diff --git a/crates/nvisy-core/src/ontology/mod.rs b/crates/nvisy-ontology/src/ontology/mod.rs similarity index 100% rename from crates/nvisy-core/src/ontology/mod.rs rename to crates/nvisy-ontology/src/ontology/mod.rs diff --git a/crates/nvisy-core/src/ontology/redaction.rs b/crates/nvisy-ontology/src/ontology/redaction.rs similarity index 98% rename from crates/nvisy-core/src/ontology/redaction.rs rename to crates/nvisy-ontology/src/ontology/redaction.rs index 679736b..e7c9fe3 100644 --- a/crates/nvisy-core/src/ontology/redaction.rs +++ b/crates/nvisy-ontology/src/ontology/redaction.rs @@ -2,7 +2,7 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::datatypes::Data; +use nvisy_core::datatypes::Data; /// Strategy used to redact or obfuscate a detected entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs new file mode 100644 index 0000000..1b91419 --- /dev/null +++ b/crates/nvisy-ontology/src/prelude.rs @@ -0,0 +1,9 @@ +//! Convenience re-exports for common nvisy-ontology types. + +pub use crate::ontology::{ + Audit, AuditAction, BoundingBox, DetectionMethod, Entity, EntityCategory, + EntityLocation, Redaction, RedactionMethod, +}; +pub use crate::redaction::{ + EntityRedactionRule, ManualAnnotation, Policy, PolicyRule, RedactionContext, +}; diff --git a/crates/nvisy-core/src/redaction/context.rs b/crates/nvisy-ontology/src/redaction/context.rs similarity index 74% rename from crates/nvisy-core/src/redaction/context.rs rename to crates/nvisy-ontology/src/redaction/context.rs index d2c6064..05404fe 100644 --- a/crates/nvisy-core/src/redaction/context.rs +++ b/crates/nvisy-ontology/src/redaction/context.rs @@ -1,7 +1,7 @@ //! Request-scoped redaction context for per-invocation control. use serde::{Deserialize, Serialize}; -use crate::ontology::entity::EntityCategory; +use crate::ontology::entity::{BoundingBox, EntityCategory}; use crate::ontology::redaction::RedactionMethod; /// Per-entity-type override for the redaction method. @@ -20,6 +20,41 @@ pub struct EntityRedactionRule { pub replacement: Option<String>, } +/// A user-provided annotation identifying a sensitive region. +/// +/// Manual annotations bypass automated detection — each is converted +/// directly into an [`Entity`](crate::ontology::entity::Entity) with +/// `DetectionMethod::Manual` and confidence 1.0. 
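+///
+/// A minimal construction sketch (the values are illustrative; only the
+/// fields relevant to a text annotation are populated):
+///
+/// ```ignore
+/// let annotation = ManualAnnotation {
+///     category: EntityCategory::Pii,
+///     entity_type: "ssn".to_string(),
+///     value: "123-45-6789".to_string(),
+///     page_number: Some(1),
+///     bounding_box: None,
+///     start_offset: Some(120),
+///     end_offset: Some(131),
+///     row_index: None,
+///     column_index: None,
+/// };
+/// ```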
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +pub struct ManualAnnotation { + /// Broad classification of the annotated data. + pub category: EntityCategory, + /// Specific type label (e.g. `"ssn"`, `"name"`). + pub entity_type: String, + /// The matched or annotated text value. + #[serde(default)] + pub value: String, + /// 1-based page number, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, + /// Bounding box for image-based annotations. + #[serde(skip_serializing_if = "Option::is_none")] + pub bounding_box: Option<BoundingBox>, + /// Start byte offset in text. + #[serde(skip_serializing_if = "Option::is_none")] + pub start_offset: Option<usize>, + /// End byte offset in text. + #[serde(skip_serializing_if = "Option::is_none")] + pub end_offset: Option<usize>, + /// Tabular row index (0-based). + #[serde(skip_serializing_if = "Option::is_none")] + pub row_index: Option<usize>, + /// Tabular column index (0-based). + #[serde(skip_serializing_if = "Option::is_none")] + pub column_index: Option<usize>, +} + /// Request-scoped description of what to redact. /// /// Acts as the per-request equivalent of a stored [`Policy`](super::policy::Policy), @@ -49,6 +84,9 @@ pub struct RedactionContext { /// Free-form labels (e.g. "gdpr-request"). #[serde(default)] pub labels: Vec<String>, + /// User-provided manual annotations to treat as detected entities. + #[serde(default)] + pub manual_entities: Vec<ManualAnnotation>, } fn default_method() -> RedactionMethod { @@ -69,6 +107,7 @@ impl Default for RedactionContext { min_confidence: 0.5, detect_images: false, labels: Vec::new(), + manual_entities: Vec::new(), } } } diff --git a/crates/nvisy-core/src/redaction/mod.rs b/crates/nvisy-ontology/src/redaction/mod.rs similarity index 59% rename from crates/nvisy-core/src/redaction/mod.rs rename to crates/nvisy-ontology/src/redaction/mod.rs index f523c68..a75b33c 100644 --- a/crates/nvisy-core/src/redaction/mod.rs +++ b/crates/nvisy-ontology/src/redaction/mod.rs @@ -3,5 +3,5 @@ pub mod context; pub mod policy; -pub use context::{EntityRedactionRule, RedactionContext}; +pub use context::{EntityRedactionRule, ManualAnnotation, RedactionContext}; pub use policy::{Policy, PolicyRule}; diff --git a/crates/nvisy-core/src/redaction/policy.rs b/crates/nvisy-ontology/src/redaction/policy.rs similarity index 99% rename from crates/nvisy-core/src/redaction/policy.rs rename to crates/nvisy-ontology/src/redaction/policy.rs index 584bc3a..19391ba 100644 --- a/crates/nvisy-core/src/redaction/policy.rs +++ b/crates/nvisy-ontology/src/redaction/policy.rs @@ -1,7 +1,7 @@ //! Redaction policies and rules. 
use serde::{Deserialize, Serialize}; -use crate::datatypes::Data; +use nvisy_core::datatypes::Data; use crate::ontology::entity::EntityCategory; use crate::ontology::redaction::RedactionMethod; diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 0b1a89b..f93488a 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -24,6 +24,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index 16f4514..aca3c12 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -1,8 +1,12 @@ -//! Pipeline actions that perform AI-powered named-entity recognition. +//! Pipeline actions that perform AI-powered named-entity recognition and OCR. //! -//! Two actions are provided: +//! Three actions are provided: //! - [`DetectNerAction`] -- runs NER over text documents. //! - [`DetectNerImageAction`] -- runs NER over images (OCR + entity detection). +//! - [`OcrDetectAction`] -- performs OCR on images to extract text regions. + +/// OCR detection pipeline action. +pub mod ocr; use serde::Deserialize; use tokio::sync::mpsc; diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs new file mode 100644 index 0000000..7670a0f --- /dev/null +++ b/crates/nvisy-python/src/actions/ocr.rs @@ -0,0 +1,136 @@ +//! OCR detection pipeline action. + +use serde::Deserialize; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::datatypes::document::{Document, ImageData}; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::registry::action::Action; +use crate::bridge::PythonBridge; +use crate::ocr::{self, OcrConfig}; + +/// Typed parameters for [`OcrDetectAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct OcrDetectParams { + /// Language hint (default `"eng"`). + #[serde(default = "default_language")] + pub language: String, + /// OCR engine to use. + #[serde(default = "default_engine")] + pub engine: String, + /// Minimum confidence threshold. + #[serde(default = "default_confidence")] + pub confidence_threshold: f64, +} + +fn default_language() -> String { + "eng".to_string() +} +fn default_engine() -> String { + "tesseract".to_string() +} +fn default_confidence() -> f64 { + 0.5 +} + +/// Pipeline action that performs OCR on images and produces entities +/// with bounding boxes, plus a `Document` artifact from concatenated +/// OCR text so downstream regex/dictionary/NER can process it. +pub struct OcrDetectAction { + /// Python bridge used to call the OCR backend. 
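+    /// Calls are dispatched on a blocking Tokio task and acquire the
+    /// Python GIL for the duration of each OCR request.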
+ pub bridge: PythonBridge, +} + +#[async_trait::async_trait] +impl Action for OcrDetectAction { + type Params = OcrDetectParams; + + fn id(&self) -> &str { + "detect-ocr" + } + + fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { + Ok(()) + } + + async fn execute( + &self, + mut input: mpsc::Receiver<Blob>, + output: mpsc::Sender<Blob>, + params: Self::Params, + ) -> Result<u64, Error> { + let config = OcrConfig { + language: params.language, + engine: params.engine, + confidence_threshold: params.confidence_threshold, + }; + let mut count = 0u64; + + while let Some(mut blob) = input.recv().await { + let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to read images artifact: {e}"), + ) + })?; + + let mut all_ocr_text = Vec::new(); + + if images.is_empty() { + // Treat blob content as a single image + let mime_type = blob + .content_type() + .unwrap_or("application/octet-stream") + .to_string(); + let entities = + ocr::detect_ocr(&self.bridge, &blob.content, &mime_type, &config).await?; + for entity in &entities { + all_ocr_text.push(entity.value.clone()); + blob.add_artifact("entities", entity).map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to add entity: {e}"), + ) + })?; + count += 1; + } + } else { + for img in &images { + let entities = + ocr::detect_ocr(&self.bridge, &img.image_data, &img.mime_type, &config) + .await?; + for entity in &entities { + all_ocr_text.push(entity.value.clone()); + blob.add_artifact("entities", entity).map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to add entity: {e}"), + ) + })?; + count += 1; + } + } + } + + // Create a Document from concatenated OCR text for downstream processing + if !all_ocr_text.is_empty() { + let ocr_doc = Document::new(all_ocr_text.join("\n")) + .with_source_format("ocr"); + blob.add_artifact("documents", &ocr_doc).map_err(|e| { + Error::new( + ErrorKind::Runtime, + format!("failed to add OCR document: {e}"), + ) + })?; + } + + if output.send(blob).await.is_err() { + return Ok(count); + } + } + + Ok(count) + } +} diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index c3b8fca..97cc933 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -14,6 +14,7 @@ pub mod actions; pub mod bridge; pub mod error; pub mod ner; +pub mod ocr; pub mod provider; #[doc(hidden)] diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index d8a7bdf..8259e04 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -6,9 +6,9 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; -use nvisy_core::ontology::entity::{Entity, EntityLocation}; +use nvisy_ontology::ontology::entity::{Entity, EntityLocation}; use nvisy_core::error::Error; -use nvisy_core::ontology::entity::{DetectionMethod, EntityCategory}; +use nvisy_ontology::ontology::entity::{DetectionMethod, EntityCategory}; use crate::bridge::PythonBridge; use crate::error::from_pyerr; @@ -172,6 +172,9 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve element_id: None, page_number: None, bounding_box: None, + row_index: None, + column_index: None, + image_id: None, }, ); diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs new file mode 100644 index 0000000..643c831 --- /dev/null +++ b/crates/nvisy-python/src/ocr/mod.rs @@ -0,0 +1,151 @@ +//! 
OCR text extraction via the Python backend. +//! +//! Calls `nvisy_ai.detect_ocr()` through the Python bridge to perform +//! optical character recognition on images, returning text regions with +//! bounding boxes. + +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; + +use nvisy_ontology::ontology::entity::{BoundingBox, Entity, EntityLocation}; +use nvisy_core::error::Error; +use nvisy_ontology::ontology::entity::{DetectionMethod, EntityCategory}; +use crate::bridge::PythonBridge; +use crate::error::from_pyerr; + +/// Configuration for OCR detection. +#[derive(Debug, Clone)] +pub struct OcrConfig { + /// Language hint (e.g. `"eng"` for English). + pub language: String, + /// OCR engine to use (`"tesseract"`, `"google-vision"`, `"aws-textract"`). + pub engine: String, + /// Minimum confidence threshold for OCR results. + pub confidence_threshold: f64, +} + +/// Call Python `detect_ocr()` via GIL + `spawn_blocking`. +/// +/// Returns a list of entities with `DetectionMethod::Ocr`, each carrying +/// a bounding box indicating where the text was found in the image. +pub async fn detect_ocr( + bridge: &PythonBridge, + image_data: &[u8], + mime_type: &str, + config: &OcrConfig, +) -> Result<Vec<Entity>, Error> { + let module_name = bridge.module_name().to_string(); + let image_data = image_data.to_vec(); + let mime_type = mime_type.to_string(); + let config = config.clone(); + + tokio::task::spawn_blocking(move || { + Python::with_gil(|py| { + let module = py.import(&module_name).map_err(from_pyerr)?; + + let kwargs = PyDict::new(py); + kwargs.set_item("image_bytes", &image_data[..]).map_err(from_pyerr)?; + kwargs.set_item("mime_type", &mime_type).map_err(from_pyerr)?; + kwargs.set_item("language", &config.language).map_err(from_pyerr)?; + kwargs.set_item("engine", &config.engine).map_err(from_pyerr)?; + kwargs.set_item("confidence_threshold", config.confidence_threshold).map_err(from_pyerr)?; + + let result = module + .call_method("detect_ocr", (), Some(&kwargs)) + .map_err(from_pyerr)?; + + parse_ocr_results(result) + }) + }) + .await + .map_err(|e| Error::python(format!("Task join error: {}", e)))? +} + +/// Parse Python list[dict] OCR response into Vec<Entity>. +/// +/// Expected Python response format: +/// ```python +/// [ +/// { +/// "text": "John Doe", +/// "x": 100.0, +/// "y": 200.0, +/// "width": 150.0, +/// "height": 30.0, +/// "confidence": 0.95 +/// }, +/// ... +/// ] +/// ``` +fn parse_ocr_results(result: Bound<'_, PyAny>) -> Result<Vec<Entity>, Error> { + let list: &Bound<'_, PyList> = result.downcast().map_err(|e| { + Error::python(format!("Expected list from Python OCR: {}", e)) + })?; + + let mut entities = Vec::new(); + + for item in list.iter() { + let dict: &Bound<'_, PyDict> = item.downcast().map_err(|e| { + Error::python(format!("Expected dict in OCR list: {}", e)) + })?; + + let text: String = dict + .get_item("text") + .map_err(from_pyerr)? + .ok_or_else(|| Error::python("Missing 'text' in OCR result"))? + .extract() + .map_err(from_pyerr)?; + + let x: f64 = dict + .get_item("x") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0.0); + + let y: f64 = dict + .get_item("y") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0.0); + + let width: f64 = dict + .get_item("width") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0.0); + + let height: f64 = dict + .get_item("height") + .map_err(from_pyerr)? 
+ .and_then(|v| v.extract().ok()) + .unwrap_or(0.0); + + let confidence: f64 = dict + .get_item("confidence") + .map_err(from_pyerr)? + .and_then(|v| v.extract().ok()) + .unwrap_or(0.0); + + let entity = Entity::new( + EntityCategory::Pii, + "ocr_text", + &text, + DetectionMethod::Ocr, + confidence, + EntityLocation { + start_offset: 0, + end_offset: text.len(), + element_id: None, + page_number: None, + bounding_box: Some(BoundingBox { x, y, width, height }), + row_index: None, + column_index: None, + image_id: None, + }, + ); + + entities.push(entity); + } + + Ok(entities) +} diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index 1b6aca3..f88b3dd 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -1,4 +1,5 @@ //! Convenience re-exports. pub use crate::actions::{DetectNerAction, DetectNerImageAction}; +pub use crate::actions::ocr::OcrDetectAction; pub use crate::bridge::PythonBridge; pub use crate::provider::AiProviderFactory; diff --git a/crates/nvisy-server/Cargo.toml b/crates/nvisy-server/Cargo.toml index 539ef45..c095a2b 100644 --- a/crates/nvisy-server/Cargo.toml +++ b/crates/nvisy-server/Cargo.toml @@ -28,7 +28,12 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = ["schema"] } +nvisy-detect = { workspace = true } nvisy-engine = { workspace = true, features = ["schema"] } +nvisy-ingest = { workspace = true } +nvisy-media = { workspace = true } +nvisy-ontology = { workspace = true } +nvisy-python = { workspace = true } # JSON Schema generation schemars = { workspace = true } @@ -41,7 +46,7 @@ serde_json = { workspace = true, features = [] } tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } # HTTP server -axum = { workspace = true, features = ["http2", "macros"] } +axum = { workspace = true, features = ["http2", "macros", "multipart"] } tower = { workspace = true, features = ["full"] } tower-http = { workspace = true, features = ["cors", "trace", "request-id", "limit"] } @@ -57,6 +62,12 @@ chrono = { workspace = true, features = [] } tracing = { workspace = true, features = [] } tracing-subscriber = { workspace = true, features = ["fmt", "ansi", "json", "env-filter"] } +# Encoding +base64 = { workspace = true } + +# Binary data +bytes = { workspace = true } + # Error handling thiserror = { workspace = true, features = [] } anyhow = { workspace = true, features = ["backtrace"] } diff --git a/crates/nvisy-server/src/handler/mod.rs b/crates/nvisy-server/src/handler/mod.rs index 584c2a5..1b68946 100644 --- a/crates/nvisy-server/src/handler/mod.rs +++ b/crates/nvisy-server/src/handler/mod.rs @@ -26,7 +26,7 @@ use utoipa::OpenApi; audit::get_audit_by_run, ), components(schemas( - redact::RedactRequest, + redact::RedactResponse, policies::CreatePolicyRequest, policies::UpdatePolicyRequest, )) diff --git a/crates/nvisy-server/src/handler/redact.rs b/crates/nvisy-server/src/handler/redact.rs index 0e44be6..ef1aa78 100644 --- a/crates/nvisy-server/src/handler/redact.rs +++ b/crates/nvisy-server/src/handler/redact.rs @@ -1,52 +1,222 @@ use axum::{ Router, - extract::State, + extract::{Multipart, Query, State}, routing::post, Json, + http::{StatusCode, HeaderMap, header}, + response::IntoResponse, }; +use bytes::Bytes; use std::sync::Arc; -use nvisy_core::redaction::RedactionContext; +use nvisy_ontology::redaction::RedactionContext; use nvisy_engine::runs::RunManager; +use nvisy_detect::actions::detect_dictionary::DictionaryDef; 
use crate::service::AppState; +use crate::service::pipeline; -#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] -pub(crate) struct RedactRequest { - source: serde_json::Value, +/// Query parameters for the redact endpoint. +#[derive(Debug, serde::Deserialize)] +pub(crate) struct RedactQuery { + /// Response format: `"json"` (default) or `"binary"`. #[serde(default)] - #[schema(value_type = Option<Object>)] - context: Option<RedactionContext>, - #[serde(default)] - output: Option<serde_json::Value>, - #[serde(rename = "policyId")] - #[serde(default)] - policy_id: Option<String>, + pub format: Option<String>, +} + +/// JSON response for the redact endpoint. +#[derive(Debug, serde::Serialize, schemars::JsonSchema, utoipa::ToSchema)] +pub(crate) struct RedactResponse { + /// Unique run identifier. + pub run_id: String, + /// Base64-encoded redacted file content. + pub file: String, + /// Output file name. + pub file_name: String, + /// Content type of the output. + pub content_type: String, + /// Pipeline execution summary. + pub summary: pipeline::PipelineSummary, + /// Audit trail entries. + pub audit_trail: Vec<serde_json::Value>, } -/// Submit a redaction request. +/// Submit a file for redaction via multipart upload. +/// +/// Parts: +/// - `file` (binary, required): The file to redact +/// - `context` (JSON, optional): RedactionContext with categories, rules, etc. +/// - `dictionaries` (JSON, optional): Array of DictionaryDef for dictionary matching #[utoipa::path( post, path = "/api/v1/redact", - request_body = RedactRequest, + request_body(content_type = "multipart/form-data"), + params( + ("format" = Option<String>, Query, description = "Response format: json (default) or binary") + ), responses( - (status = 202, description = "Redaction accepted") + (status = 200, description = "Redaction completed", body = RedactResponse), + (status = 400, description = "Bad request"), + (status = 500, description = "Internal server error") ) )] async fn redact( State(run_manager): State<Arc<RunManager>>, - Json(_body): Json<RedactRequest>, -) -> (axum::http::StatusCode, Json<serde_json::Value>) { + Query(query): Query<RedactQuery>, + mut multipart: Multipart, +) -> Result<impl IntoResponse, (StatusCode, Json<serde_json::Value>)> { let (run_id, _cancel_token) = run_manager.create_run().await; run_manager.set_running(run_id).await; - // TODO: build redaction graph from body and execute + let mut file_bytes: Option<Bytes> = None; + let mut file_name = String::from("upload"); + let mut content_type = String::from("application/octet-stream"); + let mut context = RedactionContext::default(); + let mut dictionaries: Vec<DictionaryDef> = Vec::new(); + + // Parse multipart parts + while let Ok(Some(field)) = multipart.next_field().await { + let name = field.name().unwrap_or("").to_string(); + + match name.as_str() { + "file" => { + if let Some(fname) = field.file_name() { + file_name = fname.to_string(); + } + if let Some(ct) = field.content_type() { + content_type = ct.to_string(); + } + let data = field.bytes().await.map_err(|e| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": format!("Failed to read file: {e}") })), + ) + })?; + file_bytes = Some(data); + } + "context" => { + let data = field.bytes().await.map_err(|e| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": format!("Failed to read context: {e}") })), + ) + })?; + context = serde_json::from_slice(&data).map_err(|e| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": 
format!("Invalid context JSON: {e}") })), + ) + })?; + } + "dictionaries" => { + let data = field.bytes().await.map_err(|e| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": format!("Failed to read dictionaries: {e}") })), + ) + })?; + dictionaries = serde_json::from_slice(&data).map_err(|e| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": format!("Invalid dictionaries JSON: {e}") })), + ) + })?; + } + _ => { + // Skip unknown fields + } + } + } - ( - axum::http::StatusCode::ACCEPTED, - Json(serde_json::json!({ - "runId": run_id.to_string(), - "status": "accepted" - })), + let file_bytes = file_bytes.ok_or_else(|| { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ "error": "Missing 'file' part in multipart upload" })), + ) + })?; + + // Detect content type from file extension if not provided + if content_type == "application/octet-stream" { + if let Some(ext) = file_name.rsplit('.').next() { + content_type = match ext.to_lowercase().as_str() { + "pdf" => "application/pdf", + "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "html" | "htm" => "text/html", + "csv" => "text/csv", + "json" => "application/json", + "txt" => "text/plain", + "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "xls" => "application/vnd.ms-excel", + "parquet" => "application/x-parquet", + "jpg" | "jpeg" => "image/jpeg", + "png" => "image/png", + "tiff" => "image/tiff", + "bmp" => "image/bmp", + "webp" => "image/webp", + "mp3" => "audio/mpeg", + "wav" => "audio/wav", + _ => "application/octet-stream", + } + .to_string(); + } + } + + // Execute the pipeline + let result = pipeline::execute_pipeline( + file_bytes, + &file_name, + &content_type, + &context, + &dictionaries, ) + .await + .map_err(|e| { + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({ "error": format!("Pipeline failed: {e}") })), + ) + })?; + + // Return binary or JSON based on format query param + if query.format.as_deref() == Some("binary") { + let mut headers = HeaderMap::new(); + headers.insert( + header::CONTENT_TYPE, + result.content_type.parse().unwrap_or(header::HeaderValue::from_static("application/octet-stream")), + ); + headers.insert( + header::CONTENT_DISPOSITION, + format!("attachment; filename=\"{}\"", result.file_name) + .parse() + .unwrap_or(header::HeaderValue::from_static("attachment")), + ); + headers.insert( + "x-nvisy-run-id", + run_id.to_string().parse().unwrap(), + ); + headers.insert( + "x-nvisy-total-entities", + result.summary.total_entities.to_string().parse().unwrap(), + ); + headers.insert( + "x-nvisy-total-redactions", + result.summary.total_redactions.to_string().parse().unwrap(), + ); + + Ok((StatusCode::OK, headers, result.content).into_response()) + } else { + use base64::Engine; + let encoded = base64::engine::general_purpose::STANDARD.encode(&result.content); + + let response = RedactResponse { + run_id: run_id.to_string(), + file: encoded, + file_name: result.file_name, + content_type: result.content_type, + summary: result.summary, + audit_trail: result.audit_trail, + }; + + Ok((StatusCode::OK, Json(response)).into_response()) + } } pub fn router() -> Router<AppState> { diff --git a/crates/nvisy-server/src/service/mod.rs b/crates/nvisy-server/src/service/mod.rs index 171a036..c026656 100644 --- a/crates/nvisy-server/src/service/mod.rs +++ b/crates/nvisy-server/src/service/mod.rs @@ -6,6 +6,7 @@ pub mod audit_store; pub mod config; +pub mod pipeline; pub mod policy_store; pub mod state; 
diff --git a/crates/nvisy-server/src/service/pipeline.rs b/crates/nvisy-server/src/service/pipeline.rs new file mode 100644 index 0000000..7f8c811 --- /dev/null +++ b/crates/nvisy-server/src/service/pipeline.rs @@ -0,0 +1,332 @@ +//! Pipeline builder and executor for the `/redact` endpoint. +//! +//! Auto-detects file type and constructs the correct action sequence, +//! then executes actions sequentially via mpsc channels. + +use bytes::Bytes; +use std::collections::HashMap; +use tokio::sync::mpsc; + +use nvisy_core::datatypes::blob::Blob; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::ontology::redaction::Redaction; +use nvisy_ontology::redaction::RedactionContext; +use nvisy_core::registry::action::Action; +use nvisy_ingest::loaders::{Loader, LoaderOutput}; + +use nvisy_detect::actions::detect_dictionary::{DetectDictionaryAction, DetectDictionaryParams, DictionaryDef}; +use nvisy_detect::actions::detect_manual::{DetectManualAction, DetectManualParams}; +use nvisy_detect::actions::detect_regex::{DetectRegexAction, DetectRegexParams}; +use nvisy_detect::actions::detect_tabular::{DetectTabularAction, DetectTabularParams}; +use nvisy_detect::actions::detect_checksum::DetectChecksumParams; +use nvisy_detect::actions::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; +use nvisy_detect::actions::emit_audit::EmitAuditParams; + +use nvisy_media::actions::apply_image_redaction::{ApplyImageRedactionAction, ApplyImageRedactionParams}; +use nvisy_media::actions::apply_tabular_redaction::{ApplyTabularRedactionAction, ApplyTabularRedactionParams}; +use nvisy_media::actions::apply_pdf_redaction::{ApplyPdfRedactionAction, ApplyPdfRedactionParams}; + +/// Result of a pipeline execution. +#[derive(Debug, serde::Serialize)] +pub struct PipelineResult { + /// Redacted file content. + #[serde(skip)] + pub content: Bytes, + /// Output file name. + pub file_name: String, + /// Content type of the output. + pub content_type: String, + /// Execution summary. + pub summary: PipelineSummary, + /// Audit trail entries. + pub audit_trail: Vec<serde_json::Value>, +} + +/// Summary statistics for a pipeline run. +#[derive(Debug, serde::Serialize, schemars::JsonSchema, utoipa::ToSchema)] +pub struct PipelineSummary { + pub total_entities: usize, + pub total_redactions: usize, + pub entities_by_category: HashMap<String, usize>, + pub processing_time_ms: u64, +} + +/// Execute the full redaction pipeline for a file. 
+pub async fn execute_pipeline( + file_bytes: Bytes, + file_name: &str, + content_type: &str, + context: &RedactionContext, + dictionaries: &[DictionaryDef], +) -> Result<PipelineResult, Error> { + let start = std::time::Instant::now(); + + // Create blob + let mut blob = Blob::new(file_name, file_bytes); + blob = blob.with_content_type(content_type); + + // Step 1: Load file + blob = run_loader(&blob, content_type, file_name).await?; + + // Step 2: Inject manual entities if present + if !context.manual_entities.is_empty() { + for ann in &context.manual_entities { + blob.add_artifact("manual_entities", ann).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add manual entity: {e}")) + })?; + } + } + + // Step 3: Run detection actions + blob = run_action(&DetectRegexAction, blob, DetectRegexParams { + confidence_threshold: context.min_confidence, + patterns: None, + }).await?; + + // Dictionary detection + if !dictionaries.is_empty() { + blob = run_action(&DetectDictionaryAction, blob, DetectDictionaryParams { + dictionaries: dictionaries.to_vec(), + confidence: 0.85, + }).await?; + } + + // Tabular detection if we have tabular data + let has_tabular = blob.has_artifacts("tabular"); + if has_tabular { + blob = run_action(&DetectTabularAction, blob, DetectTabularParams { + column_rules: vec![], + }).await?; + } + + // Manual entity detection + if !context.manual_entities.is_empty() { + blob = run_action(&DetectManualAction, blob, DetectManualParams {}).await?; + } + + // Checksum validation + blob = run_action(&nvisy_detect::actions::detect_checksum::DetectChecksumAction, blob, DetectChecksumParams { + drop_invalid: true, + confidence_boost: 0.05, + }).await?; + + // Classification + blob = run_action(&nvisy_detect::actions::classify::ClassifyAction, blob, ()).await?; + + // Step 4: Policy evaluation + blob = run_action(&EvaluatePolicyAction, blob, EvaluatePolicyParams { + rules: context.rules.iter().map(|r| { + nvisy_ontology::redaction::PolicyRule { + id: r.entity_type.clone(), + name: r.entity_type.clone(), + categories: vec![], + entity_types: vec![r.entity_type.clone()], + confidence_threshold: context.min_confidence, + method: r.method, + replacement_template: r.replacement.clone().unwrap_or_default(), + enabled: true, + priority: 0, + } + }).collect(), + default_method: context.default_method, + default_confidence_threshold: context.min_confidence, + }).await?; + + // Step 5: Apply redactions + blob = run_action(&nvisy_detect::actions::apply_redaction::ApplyRedactionAction, blob, ()).await?; + + // Apply image redaction if we have images + let has_images = blob.has_artifacts("images"); + if has_images { + blob = run_action(&ApplyImageRedactionAction, blob, ApplyImageRedactionParams { + blur_sigma: 15.0, + block_color: [0, 0, 0, 255], + }).await?; + } + + // Apply tabular redaction + if has_tabular { + blob = run_action(&ApplyTabularRedactionAction, blob, ApplyTabularRedactionParams {}).await?; + } + + // Apply PDF reassembly if this is a PDF + if content_type == "application/pdf" { + blob = run_action(&ApplyPdfRedactionAction, blob, ApplyPdfRedactionParams {}).await?; + } + + // Step 6: Audit + blob = run_action(&nvisy_detect::actions::emit_audit::EmitAuditAction, blob, EmitAuditParams { + run_id: None, + actor: None, + }).await?; + + // Collect results + let entities: Vec<Entity> = blob.get_artifacts("entities").unwrap_or_default(); + let redactions: Vec<Redaction> = blob.get_artifacts("redactions").unwrap_or_default(); + let audit_trail: Vec<serde_json::Value> = 
blob.get_artifacts("audit").unwrap_or_default(); + + let mut entities_by_category: HashMap<String, usize> = HashMap::new(); + for entity in &entities { + *entities_by_category + .entry(format!("{:?}", entity.category).to_lowercase()) + .or_insert(0) += 1; + } + + let elapsed = start.elapsed(); + + let output_file_name = format!("redacted_{}", file_name); + + Ok(PipelineResult { + content: blob.content, + file_name: output_file_name, + content_type: content_type.to_string(), + summary: PipelineSummary { + total_entities: entities.len(), + total_redactions: redactions.len(), + entities_by_category, + processing_time_ms: elapsed.as_millis() as u64, + }, + audit_trail, + }) +} + +/// Run a file loader based on content type and extension. +async fn run_loader(blob: &Blob, content_type: &str, file_name: &str) -> Result<Blob, Error> { + let mut result_blob = blob.clone(); + let ext = file_name + .rsplit('.') + .next() + .unwrap_or("") + .to_lowercase(); + + let outputs: Vec<LoaderOutput> = match (content_type, ext.as_str()) { + ("application/pdf", _) | (_, "pdf") => { + let loader = nvisy_ingest::loaders::pdf_loader::PdfLoader; + let params = nvisy_ingest::loaders::pdf_loader::PdfLoaderParams { + extract_images: true, + max_pages: None, + }; + loader.load(blob, ¶ms).await? + } + (ct, _) if ct.contains("wordprocessingml") => { + let loader = nvisy_ingest::loaders::docx_loader::DocxLoader; + let params = nvisy_ingest::loaders::docx_loader::DocxLoaderParams { + extract_images: true, + }; + loader.load(blob, ¶ms).await? + } + (_, "docx") => { + let loader = nvisy_ingest::loaders::docx_loader::DocxLoader; + let params = nvisy_ingest::loaders::docx_loader::DocxLoaderParams { + extract_images: true, + }; + loader.load(blob, ¶ms).await? + } + ("text/html", _) | (_, "html") | (_, "htm") => { + let loader = nvisy_ingest::loaders::html_loader::HtmlLoader; + let params = nvisy_ingest::loaders::html_loader::HtmlLoaderParams {}; + loader.load(blob, ¶ms).await? + } + (ct, _) if ct.starts_with("image/") => { + let loader = nvisy_ingest::loaders::image_loader::ImageLoader; + let params = nvisy_ingest::loaders::image_loader::ImageLoaderParams {}; + loader.load(blob, ¶ms).await? + } + (_, "jpg") | (_, "jpeg") | (_, "png") | (_, "tiff") | (_, "bmp") | (_, "webp") => { + let loader = nvisy_ingest::loaders::image_loader::ImageLoader; + let params = nvisy_ingest::loaders::image_loader::ImageLoaderParams {}; + loader.load(blob, ¶ms).await? + } + (_, "parquet") => { + let loader = nvisy_ingest::loaders::parquet_loader::ParquetLoader; + let params = nvisy_ingest::loaders::parquet_loader::ParquetLoaderParams { + max_rows: None, + }; + loader.load(blob, ¶ms).await? + } + (ct, _) if ct.contains("spreadsheetml") || ct.contains("ms-excel") => { + let loader = nvisy_ingest::loaders::xlsx_loader::XlsxLoader; + let params = nvisy_ingest::loaders::xlsx_loader::XlsxLoaderParams { + max_rows: None, + sheets: vec![], + }; + loader.load(blob, ¶ms).await? + } + (_, "xlsx") | (_, "xls") => { + let loader = nvisy_ingest::loaders::xlsx_loader::XlsxLoader; + let params = nvisy_ingest::loaders::xlsx_loader::XlsxLoaderParams { + max_rows: None, + sheets: vec![], + }; + loader.load(blob, ¶ms).await? + } + ("text/csv", _) | (_, "csv") => { + let loader = nvisy_ingest::loaders::csv_loader::CsvLoader; + loader.load(blob, &()).await? + } + ("application/json", _) | (_, "json") => { + let loader = nvisy_ingest::loaders::json_loader::JsonLoader; + loader.load(blob, &()).await? 
+ } + (ct, _) if ct.starts_with("audio/") => { + let loader = nvisy_ingest::loaders::audio_loader::AudioLoader; + let params = nvisy_ingest::loaders::audio_loader::AudioLoaderParams {}; + loader.load(blob, ¶ms).await? + } + (_, "mp3") | (_, "wav") | (_, "flac") | (_, "ogg") | (_, "m4a") => { + let loader = nvisy_ingest::loaders::audio_loader::AudioLoader; + let params = nvisy_ingest::loaders::audio_loader::AudioLoaderParams {}; + loader.load(blob, ¶ms).await? + } + // Default: treat as plain text + _ => { + let loader = nvisy_ingest::loaders::plaintext::PlaintextLoader; + loader.load(blob, &()).await? + } + }; + + // Add loader outputs as artifacts + for output in outputs { + match output { + LoaderOutput::Document(doc) => { + result_blob.add_artifact("documents", &doc).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add document: {e}")) + })?; + } + LoaderOutput::Image(img) => { + result_blob.add_artifact("images", &img).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add image: {e}")) + })?; + } + LoaderOutput::Tabular(tab) => { + result_blob.add_artifact("tabular", &tab).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("failed to add tabular: {e}")) + })?; + } + } + } + + Ok(result_blob) +} + +/// Run a single action on a blob, returning the processed blob. +async fn run_action<A: Action>( + action: &A, + blob: Blob, + params: A::Params, +) -> Result<Blob, Error> { + let (tx_in, rx_in) = mpsc::channel(1); + let (tx_out, mut rx_out) = mpsc::channel(1); + + tx_in.send(blob).await.map_err(|_| { + Error::new(ErrorKind::Runtime, "failed to send blob to action") + })?; + drop(tx_in); + + action.execute(rx_in, tx_out, params).await?; + + rx_out.recv().await.ok_or_else(|| { + Error::new(ErrorKind::Runtime, "action produced no output") + }) +} From 78dc8f5bf202f0cbb78e8a7ba0769bea44821bac Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Thu, 12 Feb 2026 11:58:25 +0100 Subject: [PATCH 11/17] docs: add platform requirements as white paper series Replace previous ARCHITECTURE.md and DEVELOPMENT.md with a structured document set covering ingestion, detection, redaction, compliance, infrastructure, and developer experience. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- docs/ARCHITECTURE.md | 187 ----------------------------------------- docs/COMPLIANCE.md | 80 ++++++++++++++++++ docs/DETECTION.md | 60 +++++++++++++ docs/DEVELOPER.md | 65 ++++++++++++++ docs/DEVELOPMENT.md | 64 -------------- docs/INFRASTRUCTURE.md | 81 ++++++++++++++++++ docs/INGESTION.md | 69 +++++++++++++++ docs/README.md | 114 ++++++++----------------- docs/REDACTION.md | 61 ++++++++++++++ 9 files changed, 453 insertions(+), 328 deletions(-) delete mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/COMPLIANCE.md create mode 100644 docs/DETECTION.md create mode 100644 docs/DEVELOPER.md delete mode 100644 docs/DEVELOPMENT.md create mode 100644 docs/INFRASTRUCTURE.md create mode 100644 docs/INGESTION.md create mode 100644 docs/REDACTION.md diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md deleted file mode 100644 index c17324f..0000000 --- a/docs/ARCHITECTURE.md +++ /dev/null @@ -1,187 +0,0 @@ -# Nvisy Runtime — Architecture - -**Technical architecture for the Nvisy Runtime data protection platform.** - ---- - -## 1. Overview - -Nvisy Runtime is a Rust-native, DAG-based data protection platform. It detects, classifies, and redacts sensitive data across documents, images, and streams. 
The system is structured as a Cargo workspace of composable crates, with Python extensions for AI-powered detection. - ---- - -## 2. Crate Structure - -``` -crates/ - nvisy-core/ Core types, traits, plugin registry, errors - nvisy-detect/ Regex patterns, checksum validation, policy evaluation, redaction - nvisy-engine/ Graph schema, DAG compiler, executor, run management - nvisy-object/ Object storage client and connectors (S3) - nvisy-python/ PyO3 bridge for Python AI modules - nvisy-server/ Axum HTTP server, handlers, middleware - -packages/ - nvisy-ai/ Python: LLM-based NER detection - nvisy-exif/ Python: EXIF metadata reading/stripping -``` - -### Dependency graph - -``` - nvisy-server - / | \ - ▼ ▼ ▼ - nvisy-engine nvisy-detect nvisy-python - \ | / - ▼ ▼ ▼ - nvisy-core - ▲ - | - nvisy-object -``` - -Every crate depends on `nvisy-core`. Plugin crates (`nvisy-detect`, `nvisy-object`, `nvisy-python`) are independent of each other. The server imports everything and wires plugins into the engine at startup. - ---- - -## 3. Core (`nvisy-core`) - -### 3.1 Type system - -All data flowing through a graph is represented as a `DataValue` — a discriminated union of typed primitives: `Document`, `Blob`, `Entity`, `Redaction`, `Policy`, `Audit`, `Image`. Each carries a `DataItem` with UUID, parent lineage, and metadata. - -### 3.2 Traits - -Extension points are defined as async traits: -- **Action** — transforms data (detect, redact, classify, emit audit) -- **Loader** — parses blobs into documents (plaintext, CSV, JSON) -- **ProviderFactory** — creates authenticated client connections -- **StreamSource / StreamTarget** — reads from / writes to external systems - -### 3.3 Plugin registry - -`PluginDescriptor` bundles actions, providers, sources, targets, and loaders under a namespace. `Registry` stores them keyed by `"plugin_id/item_id"` and resolves references at graph compilation time. - -### 3.4 Errors - -`Error` struct with `ErrorKind` enum (Validation, Connection, Timeout, Cancellation, Policy, Runtime, Python, Other). Carries optional source component, retryable flag, and boxed source error. `Result<T>` type alias for convenience. - ---- - -## 4. Detection (`nvisy-detect`) - -### 4.1 Pattern detection - -Regex patterns are loaded from `assets/patterns.json` at startup. Each pattern defines: name, category, entity type, regex, confidence score, and optional validator reference. Validators (SSN format check, Luhn checksum) are registered in Rust code and resolved by name. - -### 4.2 Actions - -- **detect-regex** — scans documents against all or selected patterns, emits entities -- **detect-checksum** — validates entities with checksum algorithms (Luhn), boosts confidence -- **evaluate-policy** — filters entities against policy rules -- **apply-redaction** — applies redaction methods (mask, replace, hash, etc.) -- **classify** — categorizes documents based on detected entities -- **emit-audit** — produces audit records for compliance - -### 4.3 Loaders - -- **plaintext** — loads text files -- **csv** — loads CSV with header detection -- **json** — loads JSON documents - ---- - -## 5. Engine (`nvisy-engine`) - -### 5.1 Graph schema - -Graphs are JSON structures with typed nodes (Source, Action, Target) and edges. Each node declares its provider/action reference, parameters, and optional retry/timeout policies. 
- -### 5.2 Compilation - -The compiler validates graph structure: parses JSON against the schema, checks for cycles via topological sort, verifies all node references resolve against the registry, and validates type compatibility between connected nodes. - -### 5.3 Execution - -The executor runs nodes in topological order. Data flows between nodes via `tokio::sync::mpsc` channels. Each node runs as a spawned task. The executor tracks per-node progress and aggregates results into a `RunResult`. - -### 5.4 Run management - -`RunManager` tracks all in-flight runs with status (pending, running, success, partial failure, failure, cancelled), progress per node, and cancellation tokens. - -### 5.5 Policies - -Retry policies (fixed, exponential, jitter backoff) and timeout policies are configurable per node. - ---- - -## 6. Server (`nvisy-server`) - -### 6.1 Role - -Short-lived Axum HTTP server. Accepts graph definitions, compiles and executes them, reports status. Designed for containerized deployment. - -### 6.2 REST API - -| Method | Path | Description | -|----------|-----------------------------|--------------------------------------| -| `GET` | `/health` | Liveness probe | -| `GET` | `/ready` | Readiness probe | -| `POST` | `/api/v1/graphs/execute` | Submit graph for execution | -| `POST` | `/api/v1/graphs/validate` | Validate graph without executing | -| `GET` | `/api/v1/graphs` | List runs | -| `GET` | `/api/v1/graphs/{runId}` | Get run status | -| `DELETE` | `/api/v1/graphs/{runId}` | Cancel run | -| `POST` | `/api/v1/redact` | Submit redaction request | -| `POST` | `/api/v1/policies` | Create policy | -| `GET` | `/api/v1/policies` | List policies | -| `GET` | `/api/v1/policies/{id}` | Get policy | -| `PUT` | `/api/v1/policies/{id}` | Update policy | -| `DELETE` | `/api/v1/policies/{id}` | Delete policy | -| `GET` | `/api/v1/audit` | Query audit records | -| `GET` | `/api/v1/audit/{runId}` | Get audit records for a run | - -### 6.3 Middleware - -- Request ID injection (`X-Request-Id`) -- Request/response tracing via `tower-http` -- CORS - -### 6.4 Service layer - -- `PolicyStore` — in-memory policy CRUD -- `AuditStore` — in-memory audit record storage -- `AppState` — shared state (registry, run manager, stores) -- `ServerConfig` — configuration from environment variables - ---- - -## 7. Python Extensions - -### 7.1 PyO3 bridge - -`PythonBridge` manages Python interpreter access via `pyo3`. Functions run on `spawn_blocking` threads to avoid blocking the async runtime. The GIL is acquired per-call. - -### 7.2 AI detection - -The `nvisy-ai` Python package provides LLM-based NER for text and images. Called from Rust via the bridge, it returns entity dicts that are parsed into `Entity` structs. - -### 7.3 EXIF handling - -The `nvisy-exif` Python package reads and strips EXIF metadata from images using Pillow. - ---- - -## 8. Error Handling - -Errors carry an `ErrorKind`, message, optional source component, retryable flag, and optional boxed source error. The runtime distinguishes transient failures (retry with backoff) from terminal failures (fail immediately). Downstream nodes dependent on a failed node are skipped. - ---- - -## 9. 
Security - -- Credentials resolved from environment variables, never stored in graph definitions -- TLS termination and CORS via middleware -- Detection patterns configurable per deployment -- Audit trail for all detection and redaction operations diff --git a/docs/COMPLIANCE.md b/docs/COMPLIANCE.md new file mode 100644 index 0000000..3bd0c1a --- /dev/null +++ b/docs/COMPLIANCE.md @@ -0,0 +1,80 @@ +# Compliance & Audit + +## 1. Overview + +Enterprises do not purchase redaction tools; they purchase compliance guarantees. The value of automated redaction is realized only when the organization can demonstrate — to regulators, auditors, and legal counsel — that sensitive data was identified, handled, and redacted in accordance with applicable policy. + +This requires two complementary capabilities: a policy engine that encodes regulatory and organizational rules into executable redaction policies, and an audit system that records every decision, action, and outcome with sufficient detail to reconstruct the chain of custody for any piece of content. + +## 2. Policy Engine + +### 2.1 Policy Definition + +The platform must provide a policy builder that enables administrators to define redaction rules without writing code. Policies should express conditions over entity types, document classifications, confidence thresholds, and organizational context. + +### 2.2 Regulation Packs + +Prebuilt policy packs aligned to common regulatory frameworks should be available out of the box: + +- **HIPAA**: Protected health information in medical records, communications, and claims. +- **GDPR**: Personal data of EU residents across all modalities. +- **PCI-DSS**: Payment card data in documents, images, and structured records. +- **CJIS**: Criminal justice information in law enforcement contexts. +- **CCPA**: Personal information of California residents, including the right to deletion and opt-out of sale. +- **FERPA**: Student educational records and related identifiers. + +### 2.3 Policy Simulation + +Before a policy is applied to production data, administrators must be able to simulate its effect — previewing what would be redacted across a representative sample. This "dry run" capability reduces the risk of over-redaction or under-redaction in production. + +### 2.4 Policy Versioning and Approval + +Policies must be versioned, with a full history of changes. Modifications to active policies should require approval through a configurable workflow before taking effect. + +## 3. Explainability + +Every redaction decision must be explainable. The system must record and surface: + +- **What was redacted**: The specific content span, region, or audio segment. +- **Why it was redacted**: The triggering rule, pattern, or model prediction. +- **Which model version**: The exact version of any ML model involved in the decision. +- **Confidence level**: The detection confidence associated with the decision. +- **Who reviewed it**: The identity of any human reviewer who approved, rejected, or modified the decision. +- **When it was processed**: Timestamps for each stage of the pipeline. + +## 4. Audit Trails + +### 4.1 Immutability + +Audit logs must be append-only and tamper-evident. Once a record is written, it cannot be modified or deleted. + +### 4.2 Chain of Custody + +The audit system must maintain a complete chain of custody for every piece of content: from ingestion, through detection and redaction, to export. Every access event — who viewed the content and when — must be recorded. 
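+
+As a rough illustration (not a normative schema), a single audit record could carry the explainability fields from Section 3 together with the content identifier that anchors the chain of custody described above. The sketch below assumes a Rust implementation with serde's derive feature; all type and field names are assumptions.
+
+```rust
+use serde::Serialize;
+
+/// Illustrative audit record; field names and types are assumptions, not a prescribed schema.
+#[derive(Debug, Serialize)]
+struct AuditRecord {
+    /// Content item this record belongs to (chain of custody).
+    content_id: String,
+    /// What was redacted: the span, region, or audio segment reference.
+    target: String,
+    /// Why it was redacted: the triggering rule, pattern, or model prediction.
+    reason: String,
+    /// Exact version of any ML model involved in the decision, if applicable.
+    model_version: Option<String>,
+    /// Detection confidence associated with the decision.
+    confidence: f64,
+    /// Identity of the human reviewer who approved, rejected, or modified the decision.
+    reviewed_by: Option<String>,
+    /// RFC 3339 timestamp for the pipeline stage that produced this record.
+    recorded_at: String,
+}
+```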
+
+### 4.3 Reporting
+
+The platform must generate compliance reports suitable for submission to regulators and internal audit teams. Reports should include:
+
+- Redaction statistics by entity type, document category, and time period
+- Policy adherence metrics
+- Reviewer activity and approval rates
+- Exceptions and overrides
+
+### 4.4 SOC 2 Readiness
+
+Logging infrastructure must meet the requirements of SOC 2 Type II certification, including continuous monitoring, access controls, and retention policies.
+
+## 5. Data Retention Policies
+
+### 5.1 Original Content
+
+The platform must enforce configurable retention policies for original (pre-redaction) content. Organizations must be able to specify maximum retention periods after which originals are permanently deleted. Zero-retention mode — in which originals are discarded immediately after processing — must be available for environments where persistent storage of sensitive content is prohibited.
+
+### 5.2 Redacted Output
+
+Redacted artifacts may be retained independently of originals, subject to their own retention schedule. The platform must track the lifecycle of each artifact and enforce automated deletion at expiry.
+
+### 5.3 Audit Logs
+
+Audit log retention must be configurable separately from content retention. Regulatory frameworks often require audit records to be retained for longer periods than the underlying data (e.g., six years for HIPAA, seven years for SOX). Audit logs must never be deleted before their configured retention period expires, regardless of content deletion status.
diff --git a/docs/DETECTION.md b/docs/DETECTION.md
new file mode 100644
index 0000000..59d6d51
--- /dev/null
+++ b/docs/DETECTION.md
@@ -0,0 +1,60 @@
+# Sensitive Data Detection
+
+## 1. Overview
+
+The detection engine is the core intellectual property of the platform. It is responsible for identifying sensitive content across all supported modalities with high precision and recall. Detection must operate through multiple complementary strategies — deterministic pattern matching, learned models, and computer vision — to achieve robust coverage across diverse content types and regulatory categories.
+
+## 2. Language Coverage
+
+The platform must support detection across multiple languages and writing systems. Real-world data frequently contains non-English text, multilingual documents, and code-switched content (multiple languages within a single document or conversation). Detection models must handle at minimum the major European languages, CJK (Chinese, Japanese, Korean), and Arabic script. Deterministic patterns must be parameterized by locale — national identifier formats, date conventions, and address structures vary by jurisdiction.
+
+For audio, speech-to-text and subsequent NER must support the same language set, including language identification and mid-utterance language switching.
+
+## 3. Deterministic Detection
+
+Deterministic methods provide high-precision, low-latency detection for well-defined patterns:
+
+- **Regular expressions**: Pattern matching for structured identifiers such as Social Security numbers, credit card numbers, passport numbers, and other nationally defined formats.
+- **Checksum validation**: Algorithmic verification (e.g., Luhn algorithm for credit card numbers) to reduce false positives from pattern matching alone.
+- **Custom pattern libraries**: User-defined pattern sets that extend detection to organization-specific sensitive categories such as internal project identifiers, proprietary terms, or custom reference numbers. + +## 4. Machine Learning and NLP-Based Detection + +Learned models address the detection of sensitive content that cannot be captured by fixed patterns: + +- **Named entity recognition (NER)**: Identification of person names, locations, organizations, and other entity types in unstructured text. +- **Domain-specific entity models**: Specialized models trained on financial data, medical records (HIPAA-relevant entities), legal identifiers, and biometric references. +- **Contextual detection**: Inference of sensitivity from surrounding context rather than explicit entity presence. Phrases such as "the patient" or "my lawyer" may indicate sensitive content even in the absence of a named entity. This capability requires models that reason over discourse context rather than isolated tokens. + +## 5. Computer Vision Detection + +Visual content requires detection methods that operate on pixel-level and spatial features: + +- **Face detection and recognition**: Identification of human faces in images and video frames for subsequent obfuscation. +- **Document and identifier detection**: Recognition of identity documents, license plates, and other visual identifiers. +- **Handwritten text detection**: Extraction and analysis of handwritten content in scanned documents and images. +- **Screen capture analysis**: Detection of sensitive text rendered in screenshots, application windows, and other digital captures. + +## 6. Audio Detection + +Audio content introduces temporal and speaker-level dimensions to detection: + +- **Transcript-based NER**: Application of named entity recognition to speech-to-text output, with alignment back to audio timestamps. +- **Direct waveform redaction**: Replacement of sensitive audio segments with silence, tones, or noise at the waveform level. +- **Speaker-specific redaction**: Selective redaction of content from identified speakers while preserving contributions from others, enabled by speaker diarization. + +## 7. Detection Orchestration + +Individual detection strategies — deterministic, ML-based, vision, and audio — must be composed into a coherent pipeline rather than operating in isolation. + +### 7.1 Tiered Execution + +Detection should proceed in tiers ordered by cost and specificity. Deterministic patterns (regex, checksums) execute first, providing high-precision results at minimal computational cost. ML and vision models execute subsequently, targeting content that deterministic methods cannot address. This tiered architecture avoids unnecessary GPU inference for content that can be resolved through pattern matching alone. + +### 7.2 Result Merging + +When multiple detection strategies identify overlapping or adjacent sensitive regions within the same content, the platform must merge results into a unified set of detection annotations. Overlapping detections should be consolidated rather than duplicated. Each merged annotation must retain provenance — which strategies contributed to the detection and at what confidence level. + +### 7.3 Conflict Resolution + +When detection strategies disagree — for example, a regex match identifies a number as a credit card while an NER model classifies the surrounding context as non-sensitive — the platform must apply configurable conflict resolution rules. 
Default behavior should favor the higher-confidence or higher-sensitivity classification, but administrators must be able to override this through policy. diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md new file mode 100644 index 0000000..20776dc --- /dev/null +++ b/docs/DEVELOPER.md @@ -0,0 +1,65 @@ +# Developer Experience & Advanced Capabilities + +## 1. Overview + +Platform adoption scales with developer experience. Organizations evaluate redaction platforms not only on detection accuracy but on how quickly they can integrate the platform into existing systems, automate workflows, and extend capabilities to meet domain-specific requirements. A first-class developer experience reduces time-to-value and expands the platform's addressable market beyond compliance teams to engineering organizations. + +## 2. Core Interfaces + +### 2.1 REST API + +A comprehensive REST API must expose all platform capabilities — content submission, policy management, redaction retrieval, and audit log access — as documented, versioned endpoints. The API is the primary integration surface and must be treated as a first-class product. + +API versioning must follow a clear strategy (URI-based or header-based) with documented deprecation timelines. Breaking changes must not be introduced without a major version increment and a migration period. + +### 2.2 SDKs + +Official client libraries for Python and JavaScript (at minimum) should wrap the REST API with idiomatic interfaces, type safety, and built-in error handling. SDKs lower the integration barrier and reduce the likelihood of misuse. + +### 2.3 Authentication and Rate Limiting + +All API access must be authenticated. The platform should support API key authentication for machine-to-machine integrations and OAuth 2.0 for user-facing applications. API keys must be scoped to specific permissions and rotatable without downtime. + +Rate limiting must be enforced per client and per tenant to prevent abuse and ensure fair resource allocation. Rate limit headers must be included in API responses so that clients can implement backoff strategies. Configurable rate tiers should be available for different client classes (e.g., higher limits for batch processing clients, lower limits for interactive use). + +### 2.4 Webhooks and Events + +An event-driven notification system must allow consumers to subscribe to processing lifecycle events — content ingested, detection complete, redaction applied, review approved — without polling. Webhook delivery should be reliable, with retry logic and delivery confirmation. + +## 3. Tooling + +### 3.1 CLI + +A command-line interface should support all common operations — submitting content, querying status, downloading results, managing policies — for scripting, automation, and developer workflows. + +### 3.2 Infrastructure as Code + +Terraform modules (or equivalent) should be provided for provisioning and configuring the platform in cloud environments, enabling reproducible deployments managed through version-controlled infrastructure definitions. + +### 3.3 Sample Policies and Synthetic Data + +A library of sample redaction policies and a synthetic data generator should be available to accelerate development and testing. Developers should be able to exercise the full pipeline against realistic but non-sensitive data without access to production content. + +## 4. Advanced Capabilities + +The following capabilities extend the platform beyond standard redaction into a category-defining position. 
Each represents an opportunity to increase the platform's value density — reducing the distance between raw ingestion and actionable, compliant output. + +### 4.1 Risk Scoring + +Documents and datasets should be scored by aggregate privacy exposure level, enabling organizations to prioritize review effort and allocate resources toward the highest-risk content. + +### 4.2 Smart Redaction Suggestion + +Rather than applying maximal redaction, the platform should be capable of suggesting the minimal set of redactions required to satisfy a given regulatory standard. This preserves data utility while meeting compliance obligations. + +### 4.3 Data Lineage Visualization + +A visual representation of the processing pipeline — from ingestion through detection, redaction, and export — should be available for each piece of content. Data lineage supports debugging, audit preparation, and stakeholder communication. + +### 4.4 Semantic Redaction + +Beyond named entity redaction, the platform should support redaction of semantic categories — for example, references to rare diseases, specific legal proceedings, or proprietary methodologies — that carry sensitivity not through the presence of a specific identifier but through their meaning in context. + +### 4.5 Synthetic Data Replacement + +Rather than replacing sensitive content with black bars or placeholder tokens, the platform should support replacement with realistic synthetic alternatives — generated names, addresses, dates, and other values that preserve the statistical and structural properties of the original data while eliminating re-identification risk. diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md deleted file mode 100644 index 470dc62..0000000 --- a/docs/DEVELOPMENT.md +++ /dev/null @@ -1,64 +0,0 @@ -# Nvisy Runtime — Development - -**Technology choices and development roadmap.** - ---- - -## Technology Choices - -| Concern | Choice | Rationale | -|---------|--------|-----------| -| Language | Rust | Performance, memory safety, zero-cost abstractions | -| Python extensions | PyO3 | AI/ML model inference where Python ecosystem dominates | -| Async runtime | Tokio | Industry-standard async I/O for Rust | -| HTTP framework | Axum | Tower-based, ergonomic, high performance | -| Serialization | Serde | De facto standard for Rust serialization | -| Graph library | Petgraph | DAG construction, cycle detection, topological sort | -| OpenAPI | utoipa | Compile-time OpenAPI spec generation | -| JSON Schema | schemars | Derive-based JSON Schema for all types | -| Testing | cargo test | Built-in test framework | -| Linting | clippy | Standard Rust linter | -| Formatting | rustfmt | Standard Rust formatter | -| Build | Cargo workspaces | Monorepo management | -| CI | GitHub Actions | Rust toolchain with cargo check, clippy, test, build | -| Python packaging | uv | Fast Python package management | -| Container | Docker | Multi-stage Rust build with Python runtime | - ---- - -## Development Roadmap - -### Phase 1 — Foundation (complete) - -- **`nvisy-core`** — Type system, traits, plugin registry, error handling -- **`nvisy-detect`** — Regex detection, checksum validation, policy evaluation, redaction -- **`nvisy-engine`** — Graph schema, DAG compiler, executor, run management -- **`nvisy-object`** — S3 object storage connector -- **`nvisy-python`** — PyO3 bridge, AI NER actions -- **`nvisy-server`** — Axum server, REST API, middleware -- **`nvisy-ai`** — Python LLM-based NER -- **`nvisy-exif`** — Python EXIF metadata handling - -### Phase 2 — 
Breadth - -- Additional detection patterns (IBAN, passport, driver's license) -- Image-based detection (face detection, license plates, document OCR) -- Additional storage connectors (GCS, Azure Blob) -- SQL connectors (PostgreSQL, MySQL) for audit persistence -- Webhook-based event triggers - -### Phase 3 — Production Hardening - -- Performance benchmarks and optimization -- Backpressure and memory management -- Graceful shutdown and in-flight run draining -- Secret provider integrations (AWS Secrets Manager, HashiCorp Vault) -- Rate limiting per connector -- Resumable execution with checkpoints - -### Phase 4 — Ecosystem - -- Plugin SDK documentation -- Community connector contribution guide -- Published crates on crates.io -- Dashboard UI for run monitoring and audit inspection diff --git a/docs/INFRASTRUCTURE.md b/docs/INFRASTRUCTURE.md new file mode 100644 index 0000000..8bc2ac3 --- /dev/null +++ b/docs/INFRASTRUCTURE.md @@ -0,0 +1,81 @@ +# Infrastructure + +## 1. Overview + +The regulated industries that require multimodal redaction — healthcare, legal, government, and financial services — impose stringent requirements on where and how data is processed. The platform must accommodate diverse deployment models, scale to meet variable workloads, and maintain rigorous security controls throughout. + +## 2. Deployment Models + +### 2.1 Cloud and On-Premises + +The platform must support deployment across multiple environments: + +- **Cloud-hosted**: Managed deployment in the vendor's infrastructure for organizations that accept cloud processing. +- **VPC deployment**: Installation within the customer's own virtual private cloud, ensuring data never leaves their network boundary. +- **On-premises**: Full deployment on customer-owned hardware for organizations with strict data sovereignty requirements. +- **Air-gapped**: Operation without network connectivity, required by certain government and defense use cases. +- **Edge processing**: Lightweight deployment at the point of data capture, relevant for law enforcement body cameras, field operations, and other latency-sensitive scenarios. + +### 2.2 Architecture + +The platform must be API-first, supporting both batch and streaming processing modes. An API-first design ensures that all platform capabilities are accessible programmatically, enabling integration into existing enterprise workflows without dependence on the platform's own user interface. + +## 3. Performance and Scale + +### 3.1 Workload Requirements + +The platform must handle workloads that span orders of magnitude in volume and latency sensitivity: + +- Large document sets (thousands to millions of PDFs) +- Long-form video and audio files +- Real-time stream redaction with sub-second latency targets +- Concurrent processing across multiple tenants or projects + +### 3.2 Scaling + +Horizontal scaling must be supported, allowing compute capacity to expand proportionally with workload volume. GPU acceleration should be available for ML inference workloads where throughput or latency requirements exceed CPU capacity. + +### 3.3 Cost Optimization + +The platform should optimize processing cost by routing content through the appropriate detection tier. Simple deterministic pattern matches should not incur the computational cost of ML inference. A tiered processing architecture — regex first, ML models only when deterministic methods are insufficient — reduces cost without sacrificing detection coverage. + +## 4. 
Security + +### 4.1 Data Protection + +Given that the platform processes the most sensitive data an organization holds, security must be foundational rather than additive: + +- **Encryption**: All data must be encrypted in transit (TLS) and at rest (AES-256 or equivalent). Field-level encryption should be available for particularly sensitive attributes. +- **Key management**: Integration with enterprise key management systems (AWS KMS, Azure Key Vault, HashiCorp Vault) for encryption key lifecycle management. +- **Zero-retention processing**: An operating mode in which no content persists on the platform after processing is complete. Content is held in memory only for the duration of the pipeline execution. +- **Ephemeral compute**: Processing environments that are created for each job and destroyed upon completion, leaving no residual data on disk. + +### 4.2 Access Control + +- **Role-based access control (RBAC)**: Fine-grained permissions governing who can configure policies, submit content, review redactions, and access audit logs. +- **Single sign-on (SSO) and SCIM**: Integration with enterprise identity providers for authentication and automated user provisioning. +- **Data residency controls**: Configuration to ensure that content is processed and stored only within specified geographic regions, in compliance with data sovereignty requirements. + +## 5. Multi-Tenancy + +### 5.1 Tenant Isolation + +The platform must support multi-tenant deployment with strict isolation between tenants. Content, policies, audit logs, detection models, and configuration must be segregated such that no tenant can access another tenant's data or influence another tenant's processing. Isolation must be enforced at the data layer (separate storage namespaces or encryption keys per tenant), the compute layer (dedicated or partitioned processing resources), and the API layer (tenant-scoped authentication and authorization). + +### 5.2 Tenant-Specific Configuration + +Each tenant must be able to configure its own detection policies, redaction rules, retention periods, and export formats independently. Platform-wide defaults may be set by the operator, but tenants must be able to override them within their permitted scope. + +## 6. Observability + +### 6.1 Metrics + +The platform must expose operational metrics covering ingestion throughput, detection latency, redaction processing time, queue depth, error rates, and resource utilization. Metrics must be available in a format compatible with standard monitoring systems (Prometheus, OpenTelemetry, or equivalent). + +### 6.2 Distributed Tracing + +Each piece of content should carry a trace identifier through every stage of the pipeline — ingestion, detection, redaction, review, and export. Distributed tracing enables operators to diagnose latency bottlenecks, identify failed processing stages, and correlate events across services. + +### 6.3 Alerting + +Configurable alerts must be available for operational anomalies: elevated error rates, processing latency exceeding thresholds, queue backpressure, model inference failures, and storage capacity warnings. Alerts must be deliverable through standard channels (email, webhook, PagerDuty, or equivalent). diff --git a/docs/INGESTION.md b/docs/INGESTION.md new file mode 100644 index 0000000..f7fb4f9 --- /dev/null +++ b/docs/INGESTION.md @@ -0,0 +1,69 @@ +# Ingestion & Transformation + +## 1. 
Overview + +The ingestion layer is responsible for accepting content from heterogeneous sources and normalizing it into a unified internal representation suitable for downstream detection and redaction. The transformation layer handles the inverse concern: producing redacted output in the appropriate format while preserving the structural integrity of the original document. + +The quality of the ingestion layer is a critical success factor. Redaction platforms that cannot reliably parse and extract content from real-world documents — scanned forms, embedded tables, multi-speaker audio — will produce incomplete redaction results regardless of the sophistication of their detection models. + +## 2. Supported Input Formats + +The platform must support ingestion across the following modalities: + +- **Documents**: PDF (native and scanned), DOCX, HTML, plain text +- **Images**: JPG, PNG, TIFF, and other common raster formats +- **Video**: Standard container formats with frame-level extraction +- **Audio**: WAV, MP3, and other common audio formats +- **Structured data**: CSV, JSON, and database connectors +- **Communications**: Email (with attachments), chat logs (Slack, Teams, WhatsApp exports) + +## 3. Extraction Capabilities + +Each modality requires specialized extraction techniques: + +- **Optical character recognition (OCR)**: Layout-aware OCR that preserves spatial relationships between text regions, table cells, headers, and form fields. +- **Speech-to-text**: Transcription with speaker diarization, enabling attribution of spoken content to individual speakers. +- **Video frame extraction**: Decomposition of video streams into individual frames for visual analysis, with temporal alignment to audio tracks. +- **Document structure parsing**: Identification of semantic document elements — headings, paragraphs, tables, lists, and form fields — beyond raw text extraction. +- **Metadata extraction**: Capture of authorship, timestamps, geolocation, and other embedded metadata that may itself constitute sensitive information. + +## 4. Transformation & Output + +Following redaction, the transformation layer must produce output that meets downstream requirements while maintaining fidelity to the original format. + +### 4.1 Format Preservation + +Redacted output should preserve the structural characteristics of the source document. Tables must remain aligned, page layouts must be maintained, and non-redacted content must remain unaltered. + +### 4.2 Export Formats + +The platform should support export as: + +- Redacted PDF with visual redaction markers +- Structured JSON with redaction metadata +- Masked CSV for tabular data +- Anonymized datasets for analytics consumption + +### 4.3 Masking Strategies + +Multiple masking strategies should be available, selected according to the use case: + +- **Tokenization and pseudonymization**: Replacement of sensitive values with consistent tokens that preserve referential integrity across documents. +- **Reversible masking**: Vault-based masking where original values can be recovered by authorized parties through a secure key exchange. +- **De-identification with re-linking key**: Removal of direct identifiers with a separately stored mapping that enables re-identification under controlled conditions. + +## 5. Validation and Error Handling + +Ingestion must account for real-world content that is malformed, incomplete, or unsupported. 
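+
+As a rough sketch of how such failures might be represented (an illustration only; the type and field names are assumptions, and the normative requirements follow in the subsections below):
+
+```rust
+/// Illustrative failure categories, mirroring the failure types listed in 5.3.
+#[derive(Debug)]
+enum IngestFailure {
+    UnsupportedFormat(String),
+    CorruptData,
+    ExtractionTimeout,
+    CodecUnavailable,
+}
+
+/// Structured error record produced for every ingestion failure.
+#[derive(Debug)]
+struct IngestError {
+    content_id: String,
+    failure: IngestFailure,
+    stage: &'static str,
+}
+
+/// Minimal pre-processing validation: supported format and non-zero size.
+/// Zero-size input is treated as a corruption indicator in this sketch.
+fn validate(content_id: &str, file_name: &str, bytes: &[u8]) -> Result<(), IngestError> {
+    if bytes.is_empty() {
+        return Err(IngestError {
+            content_id: content_id.to_string(),
+            failure: IngestFailure::CorruptData,
+            stage: "validation",
+        });
+    }
+    let supported = ["pdf", "docx", "html", "png", "jpg", "wav", "csv", "json"];
+    let ext = file_name.rsplit('.').next().unwrap_or("").to_lowercase();
+    if !supported.contains(&ext.as_str()) {
+        return Err(IngestError {
+            content_id: content_id.to_string(),
+            failure: IngestFailure::UnsupportedFormat(ext),
+            stage: "validation",
+        });
+    }
+    Ok(())
+}
+```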
+ +### 5.1 Input Validation + +Before processing begins, the platform must validate that submitted content meets minimum requirements: supported file format, non-zero size, and absence of corruption indicators. Invalid submissions must be rejected with actionable error messages that identify the specific validation failure. + +### 5.2 Partial Extraction + +When a document is partially parseable — a multi-page PDF with a corrupt page, an audio file with a damaged segment, or an image with an unreadable region — the platform should extract what it can and flag the remainder as incomplete. Partial extraction results must be clearly annotated so that downstream detection operates only on successfully extracted content. + +### 5.3 Error Reporting + +Every ingestion failure must produce a structured error record that includes the content identifier, the failure type (unsupported format, corrupt data, extraction timeout, codec unavailable), and the processing stage at which the failure occurred. These records must be available through the same audit infrastructure described in [COMPLIANCE.md](COMPLIANCE.md). diff --git a/docs/README.md b/docs/README.md index 4bbfcc8..35a4546 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,93 +1,53 @@ -# Nvisy Runtime - -**A data protection runtime for AI pipelines.** - ---- +# Multimodal Redaction & Privacy Platform ## Abstract -AI-powered products handle sensitive data at every stage — ingestion, transformation, enrichment, and storage. PII in documents, faces in images, credentials in logs, and financial data in spreadsheets all require detection, classification, and redaction before downstream consumption. - -**Nvisy Runtime** is a Rust-native data protection platform that treats sensitive data detection as a first-class pipeline primitive. It provides a DAG-based execution engine, typed data primitives with lineage tracking, regex and AI-powered entity detection, configurable redaction policies, and a pluggable connector system — all designed for throughput, correctness, and auditability. - ---- - -## Problem Statement - -### 1. Sensitive data is everywhere in AI pipelines - -Documents, images, API responses, and model outputs all carry PII, PHI, financial data, and credentials. Manual redaction doesn't scale. Teams need automated, configurable detection and redaction that runs inline with their data pipelines. - -### 2. Detection requires multiple methods - -Regex patterns catch structured data (SSNs, emails, credit cards). AI-powered NER catches unstructured entities (names, addresses, medical terms). Checksum validation reduces false positives. A production system needs all three, composable in a single pipeline. - -### 3. Redaction must be auditable - -Compliance (GDPR, HIPAA, PCI-DSS) requires proof of what was detected, what was redacted, and how. Every detection and redaction action must produce an audit trail with full lineage. - -### 4. Performance matters - -Data protection runs on every record. The runtime must handle high throughput without becoming a bottleneck. Rust provides the performance foundation; Python extensions handle AI workloads where model quality matters more than latency. - ---- - -## Design Principles - -### Typed data primitives - -Every data object flowing through a graph is typed: `Document`, `Blob`, `Entity`, `Redaction`, `Policy`, `Audit`, `Image`. Primitives carry metadata and enforce structural contracts at compile time (Rust) and runtime (serde validation). 
- -### DAG-based execution - -Graphs are directed acyclic graphs of nodes (sources, actions, targets). The engine resolves dependencies, manages concurrency, handles retries, and tracks execution state. - -### Regex + AI detection - -Built-in regex patterns detect structured sensitive data. Python-based NER (via PyO3) detects unstructured entities. Both produce the same `Entity` type, composable in a single pipeline. - -### Plugin architecture - -Connectors, actions, and loaders register through a plugin system. Each plugin bundles its capabilities under a namespace. The engine resolves references at compilation time. - -### Audit-first - -Every detection and redaction produces an `Audit` record. Policies define what to detect and how to redact. The audit trail provides full lineage from source document to redacted output. - ---- - -## Core Concepts - -### Entities - -An **Entity** is a detected piece of sensitive data: its category (PII, PHI, financial, credentials), type (SSN, email, face), value, confidence score, detection method, and location within the source document or image. - -### Policies - -A **Policy** defines detection and redaction rules: which entity categories to scan, minimum confidence thresholds, and per-type redaction methods (mask, replace, hash, encrypt, remove, blur, block, synthesize). +As organizations contend with an ever-growing volume of unstructured and multimodal data, the challenge of identifying and redacting sensitive information has become a critical concern. Regulatory frameworks such as GDPR, HIPAA, CCPA, and PCI-DSS impose strict obligations on how personally identifiable information (PII), protected health information (PHI), and other sensitive content must be handled across documents, images, audio, and video. -### Graphs +This document series presents the architectural and functional requirements for a multimodal redaction platform capable of extracting content from heterogeneous sources, detecting sensitive data through deterministic and learned methods, applying context-aware redaction, and producing auditable evidence of compliance. -A **Graph** is a DAG of nodes. Source nodes read data, action nodes detect/redact/classify, and target nodes write results. Graphs are defined as JSON and compiled into execution plans. +The guiding principle is: **extract everything, understand context, redact precisely, prove compliance.** -### Connectors +## Documents -Connectors implement the source and target interfaces. The object storage connector (S3) handles file ingestion and output. Additional connectors register through the plugin system. +| Document | Scope | +| --- | --- | +| [Ingestion & Transformation](INGESTION.md) | Multimodal content extraction and post-redaction output | +| [Detection](DETECTION.md) | Sensitive data detection across modalities | +| [Redaction & Review](REDACTION.md) | Context-aware redaction and human-in-the-loop workflows | +| [Compliance & Audit](COMPLIANCE.md) | Policy engine, explainability, and audit trails | +| [Infrastructure](INFRASTRUCTURE.md) | Deployment, performance, and security | +| [Developer Experience](DEVELOPER.md) | APIs, SDKs, tooling, and advanced capabilities | ---- +## Strategic Positioning -## Deployment +Three viable product directions exist for platforms in this space: -The server (`nvisy-server`) is a short-lived Axum HTTP server. It accepts graph definitions, executes them, and reports status. Designed for containerized deployment — the main server spins it up, feeds work, waits for completion. +1. 
**Compliance-first platform** — targets enterprise procurement cycles driven by regulatory mandates. +2. **Developer-first redaction API** — prioritizes integration speed, SDK quality, and self-serve adoption. +3. **AI-native multimodal privacy engine** — leads with model sophistication, context understanding, and semantic redaction. ---- +The strongest long-term defensibility lies in context-aware, explainable, policy-driven multimodal redaction — a convergence of all three directions. -## Project Status +## Target Verticals -Active development. The Rust runtime, detection engine, and server are implemented. AI-powered detection runs via Python extensions. +The platform is designed to serve regulated industries where sensitive data handling is a legal and operational requirement: ---- +- **Healthcare**: HIPAA-governed medical records, clinical communications, insurance claims, and patient intake forms. +- **Legal**: Court filings, discovery documents, attorney-client communications, and case management systems. +- **Government and defense**: Law enforcement records, intelligence reports, FOIA responses, and classified material processing. +- **Financial services**: Transaction records, customer onboarding documents, fraud investigation files, and PCI-scoped payment data. +- **Education**: Student records, admissions documents, and FERPA-governed institutional data. -## License +## Glossary -Apache License 2.0. See [LICENSE.txt](../LICENSE.txt). +| Term | Definition | +| --- | --- | +| **PII** | Personally identifiable information — any data that can identify a specific individual | +| **PHI** | Protected health information — health data covered under HIPAA | +| **NER** | Named entity recognition — ML technique for identifying entities (names, locations, organizations) in text | +| **OCR** | Optical character recognition — extraction of text from images and scanned documents | +| **RBAC** | Role-based access control — permissions model based on user roles | +| **SSO** | Single sign-on — authentication mechanism allowing one set of credentials across multiple systems | +| **SCIM** | System for Cross-domain Identity Management — protocol for automating user provisioning | +| **KMS** | Key management service — system for managing cryptographic keys | diff --git a/docs/REDACTION.md b/docs/REDACTION.md new file mode 100644 index 0000000..6753e24 --- /dev/null +++ b/docs/REDACTION.md @@ -0,0 +1,61 @@ +# Redaction & Review + +## 1. Overview + +Detection identifies what is sensitive; redaction determines what to do about it. The distinction between a basic redaction tool and a production-grade platform lies in the ability to apply redaction with contextual awareness — understanding not just that a name appears in a document, but whose name it is, why it matters, and whether the surrounding regulatory and organizational policy requires its removal. + +Equally important is the human-in-the-loop review process. Automated redaction at scale demands human oversight to maintain trust, catch edge cases, and provide a feedback signal for continuous model improvement. + +## 2. Context-Aware Redaction + +### 2.1 Instance-Level Precision + +The platform must distinguish between occurrences of the same entity across different contexts. Redacting "John Smith" in one document should not require redacting every occurrence of the name across an entire corpus. Redaction decisions must be scoped to the relevant instance, document, or case. 
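
One way to make that scoping explicit is to attach a scope to every redaction decision and evaluate each detected occurrence against it. The sketch below is a minimal illustration, assuming hypothetical type and field names rather than the platform's actual data model:

```rust
// Illustrative sketch only: these types are assumptions for this document,
// not the platform's actual data model.

/// How far a single redaction decision applies.
#[derive(Debug, Clone)]
enum RedactionScope {
    /// Only this one occurrence (document id plus character span).
    Instance { document_id: String, start: usize, end: usize },
    /// Every occurrence of the entity within one document.
    Document { document_id: String },
    /// Every occurrence within all documents linked to a case.
    Case { case_id: String },
}

/// One detected occurrence of an entity.
#[derive(Debug, Clone)]
struct Occurrence {
    document_id: String,
    case_id: String,
    start: usize,
    end: usize,
}

/// A decision to redact "John Smith" in one place should not silently apply
/// to every other "John Smith" in the corpus; the scope makes the blast
/// radius of the decision explicit.
fn applies(scope: &RedactionScope, occ: &Occurrence) -> bool {
    match scope {
        RedactionScope::Instance { document_id, start, end } => {
            occ.document_id == *document_id && occ.start == *start && occ.end == *end
        }
        RedactionScope::Document { document_id } => occ.document_id == *document_id,
        RedactionScope::Case { case_id } => occ.case_id == *case_id,
    }
}

fn main() {
    let scope = RedactionScope::Document { document_id: "doc-42".into() };
    let occ = Occurrence {
        document_id: "doc-42".into(),
        case_id: "case-7".into(),
        start: 120,
        end: 130,
    };
    println!("scope applies: {}", applies(&scope, &occ));
}
```

Keeping the scope on the decision, rather than on the entity itself, means the same entity can carry different decisions in different documents or cases without any global side effects.
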
+ +### 2.2 Role-Based and Conditional Redaction + +Redaction rules must support conditional logic: + +- **Role-based rules**: Redact all references to minors while preserving references to adults. +- **Document-type conditions**: Apply medical redaction policies only when the document type is classified as a health record. +- **Temporal conditions**: Redact specific time segments in audio or video content. + +### 2.3 Relationship-Aware Redaction + +Advanced redaction scenarios require reasoning over relationships between entities. For example, redacting all names associated with a specific case identifier, or redacting all communications involving a particular individual across a document set. + +### 2.4 Policy Templates + +Predefined redaction templates aligned to regulatory frameworks (HIPAA, GDPR, CCPA) enable rapid deployment and reduce the burden of manual policy configuration. + +## 3. Human-in-the-Loop Review + +### 3.1 Review Interface + +The platform must provide a review interface that enables human reviewers to inspect, approve, reject, or modify automated redaction decisions. This interface should present the original and redacted content side by side, with clear visual indicators of each redaction and its triggering rule or model. + +### 3.2 Confidence Scoring + +Each automated redaction decision should carry a confidence score derived from the underlying detection model. Reviewers can then prioritize their attention on low-confidence decisions, improving throughput without sacrificing accuracy. + +### 3.3 Bulk Operations + +For large document sets, the review interface must support bulk approval, rejection, and modification of redaction decisions, filtered by confidence threshold, entity type, or document category. + +### 3.4 Access Control + +Reviewer permissions must be configurable through role-based access control. Not all reviewers should have access to all document types or sensitivity levels. + +### 3.5 Active Learning + +Reviewer corrections — accepted, rejected, or modified redactions — should feed back into the detection models as training signal. Over time, this active learning loop reduces the volume of decisions requiring human review and improves model accuracy on organization-specific content. + +## 4. Redaction Versioning and Rollback + +### 4.1 Versioned Redaction State + +The platform must maintain versioned snapshots of redaction state for each piece of content. Each modification — whether automated or manual — produces a new version. Prior versions must remain accessible for comparison, audit, and rollback. + +### 4.2 Rollback + +Before export, any redaction decision must be reversible. Reviewers must be able to roll back individual redactions or restore an entire document to a previous redaction state. After export, rollback is no longer available — the exported artifact is final, and any corrections require re-processing from the original content. From b6492774bc1981314410630453a7cb4efec7ae10 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Thu, 12 Feb 2026 11:58:49 +0100 Subject: [PATCH 12/17] refactor: dissolve nvisy-detect, nvisy-media, nvisy-server; add nvisy-pattern, nvisy-pipeline Remove nvisy-detect (actions, dictionaries, patterns), nvisy-media (render, media redaction actions), and nvisy-server (handlers, services, middleware). Restructure nvisy-core by removing datatypes and registry modules. Reorganize nvisy-ingest from flat loaders into modality-specific modules (audio, binary, image, tabular, text). 
Add nvisy-pattern and nvisy-pipeline crates. Update all workspace dependencies accordingly. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 387 ++---------------- Cargo.toml | 28 +- crates/nvisy-core/Cargo.toml | 11 +- crates/nvisy-core/src/datatypes/blob.rs | 137 ------- crates/nvisy-core/src/datatypes/mod.rs | 61 --- crates/nvisy-core/src/error.rs | 1 - crates/nvisy-core/src/fs/content_handler.rs | 14 +- crates/nvisy-core/src/fs/content_metadata.rs | 7 +- crates/nvisy-core/src/io/content_data.rs | 37 +- crates/nvisy-core/src/lib.rs | 2 - crates/nvisy-core/src/path/source.rs | 36 +- crates/nvisy-core/src/prelude.rs | 5 - crates/nvisy-core/src/registry/action.rs | 37 -- crates/nvisy-core/src/registry/mod.rs | 4 - crates/nvisy-detect/README.md | 3 - .../src/actions/apply_redaction.rs | 149 ------- crates/nvisy-detect/src/actions/classify.rs | 96 ----- .../src/actions/detect_checksum.rs | 120 ------ .../nvisy-detect/src/actions/detect_manual.rs | 87 ---- .../nvisy-detect/src/actions/detect_regex.rs | 143 ------- .../src/actions/detect_tabular.rs | 134 ------ crates/nvisy-detect/src/actions/emit_audit.rs | 104 ----- crates/nvisy-detect/src/lib.rs | 19 - crates/nvisy-detect/src/prelude.rs | 10 - crates/nvisy-engine/Cargo.toml | 11 +- crates/nvisy-engine/src/compiler/graph.rs | 6 +- crates/nvisy-engine/src/connections/mod.rs | 2 +- crates/nvisy-engine/src/executor/context.rs | 10 +- crates/nvisy-engine/src/executor/runner.rs | 14 +- crates/nvisy-engine/src/policies/retry.rs | 4 +- crates/nvisy-engine/src/runs/mod.rs | 28 +- crates/nvisy-ingest/Cargo.toml | 11 +- crates/nvisy-ingest/src/audio/mod.rs | 4 + crates/nvisy-ingest/src/audio/mp3.rs | 51 +++ crates/nvisy-ingest/src/audio/wav.rs | 51 +++ .../docx_loader.rs => binary/docx.rs} | 58 +-- crates/nvisy-ingest/src/binary/mod.rs | 7 + .../{loaders/pdf_loader.rs => binary/pdf.rs} | 60 ++- crates/nvisy-ingest/src/document.rs | 299 ++++++++++++++ .../src/element.rs} | 256 +----------- crates/nvisy-ingest/src/handler.rs | 372 +++++++++++++++++ crates/nvisy-ingest/src/image/image.rs | 58 +++ crates/nvisy-ingest/src/image/mod.rs | 4 + crates/nvisy-ingest/src/lib.rs | 10 +- .../nvisy-ingest/src/loaders/audio_loader.rs | 58 --- crates/nvisy-ingest/src/loaders/csv_loader.rs | 44 -- .../nvisy-ingest/src/loaders/image_loader.rs | 67 --- .../nvisy-ingest/src/loaders/json_loader.rs | 48 --- crates/nvisy-ingest/src/loaders/mod.rs | 72 ---- crates/nvisy-ingest/src/loaders/plaintext.rs | 47 --- crates/nvisy-ingest/src/prelude.rs | 46 ++- crates/nvisy-ingest/src/tabular/mod.rs | 7 + .../parquet_loader.rs => tabular/parquet.rs} | 53 ++- .../xlsx_loader.rs => tabular/xlsx.rs} | 59 ++- crates/nvisy-ingest/src/text/csv.rs | 38 ++ .../{loaders/html_loader.rs => text/html.rs} | 45 +- crates/nvisy-ingest/src/text/json.rs | 42 ++ crates/nvisy-ingest/src/text/mod.rs | 8 + crates/nvisy-ingest/src/text/plaintext.rs | 41 ++ crates/nvisy-media/Cargo.toml | 49 --- crates/nvisy-media/README.md | 7 - .../src/actions/apply_audio_redaction.rs | 54 --- .../src/actions/apply_image_redaction.rs | 159 ------- .../src/actions/apply_pdf_redaction.rs | 153 ------- .../src/actions/apply_tabular_redaction.rs | 150 ------- crates/nvisy-media/src/actions/mod.rs | 10 - crates/nvisy-media/src/lib.rs | 11 - crates/nvisy-media/src/prelude.rs | 5 - crates/nvisy-object/Cargo.toml | 1 + crates/nvisy-object/src/prelude.rs | 2 +- crates/nvisy-object/src/providers/s3.rs | 8 +- crates/nvisy-object/src/streams/mod.rs | 28 +- crates/nvisy-object/src/streams/read.rs | 17 +- 
crates/nvisy-object/src/streams/write.rs | 23 +- crates/nvisy-ontology/Cargo.toml | 11 +- crates/nvisy-ontology/src/ontology/audit.rs | 24 +- crates/nvisy-ontology/src/ontology/entity.rs | 30 +- .../nvisy-ontology/src/ontology/redaction.rs | 12 +- .../nvisy-ontology/src/redaction/context.rs | 6 +- crates/nvisy-ontology/src/redaction/policy.rs | 12 +- crates/nvisy-pattern/Cargo.toml | 30 ++ .../assets/dictionaries/first_names.txt | 0 .../assets/dictionaries/last_names.txt | 0 .../assets/dictionaries/medical_terms.txt | 0 .../assets/patterns.json | 0 .../src/dictionaries/mod.rs | 0 crates/nvisy-pattern/src/lib.rs | 16 + .../src/patterns/mod.rs | 0 .../src/patterns/validators.rs | 2 +- crates/nvisy-pattern/src/prelude.rs | 4 + .../Cargo.toml | 23 +- crates/nvisy-pipeline/src/action.rs | 35 ++ .../src/actions/apply_audio_redaction.rs | 47 +++ .../src/actions/apply_image_redaction.rs | 129 ++++++ .../src/actions/apply_pdf_redaction.rs | 148 +++++++ .../src/actions/apply_redaction.rs | 133 ++++++ .../src/actions/apply_tabular_redaction.rs | 108 +++++ crates/nvisy-pipeline/src/actions/classify.rs | 83 ++++ .../src/actions/detect_checksum.rs | 103 +++++ .../src/actions/detect_dictionary.rs | 88 ++-- .../src/actions/detect_manual.rs | 68 +++ .../src/actions/detect_regex.rs | 113 +++++ .../src/actions/detect_tabular.rs | 132 ++++++ .../nvisy-pipeline/src/actions/emit_audit.rs | 91 ++++ .../src/actions/evaluate_policy.rs | 112 +++-- .../src/actions/mod.rs | 12 +- crates/nvisy-pipeline/src/lib.rs | 22 + crates/nvisy-pipeline/src/prelude.rs | 21 + .../src}/provider.rs | 11 +- .../src/render/block.rs | 0 .../src/render/blur.rs | 0 .../src/render/mod.rs | 0 crates/nvisy-python/Cargo.toml | 2 + crates/nvisy-python/src/actions/mod.rs | 172 ++++---- crates/nvisy-python/src/actions/ocr.rs | 121 +++--- crates/nvisy-python/src/lib.rs | 4 +- crates/nvisy-python/src/prelude.rs | 2 +- crates/nvisy-python/src/provider/mod.rs | 6 +- crates/nvisy-server/Cargo.toml | 73 ---- crates/nvisy-server/README.md | 3 - crates/nvisy-server/src/app/mod.rs | 46 --- crates/nvisy-server/src/handler/audit.rs | 69 ---- crates/nvisy-server/src/handler/graphs.rs | 121 ------ crates/nvisy-server/src/handler/health.rs | 32 -- crates/nvisy-server/src/handler/mod.rs | 34 -- crates/nvisy-server/src/handler/policies.rs | 156 ------- crates/nvisy-server/src/handler/redact.rs | 225 ---------- crates/nvisy-server/src/main.rs | 30 -- crates/nvisy-server/src/middleware/mod.rs | 2 - .../nvisy-server/src/service/audit_store.rs | 72 ---- crates/nvisy-server/src/service/config.rs | 19 - crates/nvisy-server/src/service/mod.rs | 35 -- crates/nvisy-server/src/service/pipeline.rs | 332 --------------- .../nvisy-server/src/service/policy_store.rs | 78 ---- crates/nvisy-server/src/service/state.rs | 19 - 135 files changed, 2962 insertions(+), 4617 deletions(-) delete mode 100644 crates/nvisy-core/src/datatypes/blob.rs delete mode 100644 crates/nvisy-core/src/datatypes/mod.rs delete mode 100644 crates/nvisy-core/src/registry/action.rs delete mode 100644 crates/nvisy-core/src/registry/mod.rs delete mode 100644 crates/nvisy-detect/README.md delete mode 100644 crates/nvisy-detect/src/actions/apply_redaction.rs delete mode 100644 crates/nvisy-detect/src/actions/classify.rs delete mode 100644 crates/nvisy-detect/src/actions/detect_checksum.rs delete mode 100644 crates/nvisy-detect/src/actions/detect_manual.rs delete mode 100644 crates/nvisy-detect/src/actions/detect_regex.rs delete mode 100644 crates/nvisy-detect/src/actions/detect_tabular.rs delete mode 100644 
crates/nvisy-detect/src/actions/emit_audit.rs delete mode 100644 crates/nvisy-detect/src/lib.rs delete mode 100644 crates/nvisy-detect/src/prelude.rs create mode 100644 crates/nvisy-ingest/src/audio/mod.rs create mode 100644 crates/nvisy-ingest/src/audio/mp3.rs create mode 100644 crates/nvisy-ingest/src/audio/wav.rs rename crates/nvisy-ingest/src/{loaders/docx_loader.rs => binary/docx.rs} (80%) create mode 100644 crates/nvisy-ingest/src/binary/mod.rs rename crates/nvisy-ingest/src/{loaders/pdf_loader.rs => binary/pdf.rs} (78%) create mode 100644 crates/nvisy-ingest/src/document.rs rename crates/{nvisy-core/src/datatypes/document.rs => nvisy-ingest/src/element.rs} (57%) create mode 100644 crates/nvisy-ingest/src/handler.rs create mode 100644 crates/nvisy-ingest/src/image/image.rs create mode 100644 crates/nvisy-ingest/src/image/mod.rs delete mode 100644 crates/nvisy-ingest/src/loaders/audio_loader.rs delete mode 100644 crates/nvisy-ingest/src/loaders/csv_loader.rs delete mode 100644 crates/nvisy-ingest/src/loaders/image_loader.rs delete mode 100644 crates/nvisy-ingest/src/loaders/json_loader.rs delete mode 100644 crates/nvisy-ingest/src/loaders/mod.rs delete mode 100644 crates/nvisy-ingest/src/loaders/plaintext.rs create mode 100644 crates/nvisy-ingest/src/tabular/mod.rs rename crates/nvisy-ingest/src/{loaders/parquet_loader.rs => tabular/parquet.rs} (73%) rename crates/nvisy-ingest/src/{loaders/xlsx_loader.rs => tabular/xlsx.rs} (68%) create mode 100644 crates/nvisy-ingest/src/text/csv.rs rename crates/nvisy-ingest/src/{loaders/html_loader.rs => text/html.rs} (78%) create mode 100644 crates/nvisy-ingest/src/text/json.rs create mode 100644 crates/nvisy-ingest/src/text/mod.rs create mode 100644 crates/nvisy-ingest/src/text/plaintext.rs delete mode 100644 crates/nvisy-media/Cargo.toml delete mode 100644 crates/nvisy-media/README.md delete mode 100644 crates/nvisy-media/src/actions/apply_audio_redaction.rs delete mode 100644 crates/nvisy-media/src/actions/apply_image_redaction.rs delete mode 100644 crates/nvisy-media/src/actions/apply_pdf_redaction.rs delete mode 100644 crates/nvisy-media/src/actions/apply_tabular_redaction.rs delete mode 100644 crates/nvisy-media/src/actions/mod.rs delete mode 100644 crates/nvisy-media/src/lib.rs delete mode 100644 crates/nvisy-media/src/prelude.rs create mode 100644 crates/nvisy-pattern/Cargo.toml rename crates/{nvisy-detect => nvisy-pattern}/assets/dictionaries/first_names.txt (100%) rename crates/{nvisy-detect => nvisy-pattern}/assets/dictionaries/last_names.txt (100%) rename crates/{nvisy-detect => nvisy-pattern}/assets/dictionaries/medical_terms.txt (100%) rename crates/{nvisy-detect => nvisy-pattern}/assets/patterns.json (100%) rename crates/{nvisy-detect => nvisy-pattern}/src/dictionaries/mod.rs (100%) create mode 100644 crates/nvisy-pattern/src/lib.rs rename crates/{nvisy-detect => nvisy-pattern}/src/patterns/mod.rs (100%) rename crates/{nvisy-detect => nvisy-pattern}/src/patterns/validators.rs (91%) create mode 100644 crates/nvisy-pattern/src/prelude.rs rename crates/{nvisy-detect => nvisy-pipeline}/Cargo.toml (59%) create mode 100644 crates/nvisy-pipeline/src/action.rs create mode 100644 crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs create mode 100644 crates/nvisy-pipeline/src/actions/apply_image_redaction.rs create mode 100644 crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs create mode 100644 crates/nvisy-pipeline/src/actions/apply_redaction.rs create mode 100644 crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs 
create mode 100644 crates/nvisy-pipeline/src/actions/classify.rs create mode 100644 crates/nvisy-pipeline/src/actions/detect_checksum.rs rename crates/{nvisy-detect => nvisy-pipeline}/src/actions/detect_dictionary.rs (68%) create mode 100644 crates/nvisy-pipeline/src/actions/detect_manual.rs create mode 100644 crates/nvisy-pipeline/src/actions/detect_regex.rs create mode 100644 crates/nvisy-pipeline/src/actions/detect_tabular.rs create mode 100644 crates/nvisy-pipeline/src/actions/emit_audit.rs rename crates/{nvisy-detect => nvisy-pipeline}/src/actions/evaluate_policy.rs (54%) rename crates/{nvisy-detect => nvisy-pipeline}/src/actions/mod.rs (67%) create mode 100644 crates/nvisy-pipeline/src/lib.rs create mode 100644 crates/nvisy-pipeline/src/prelude.rs rename crates/{nvisy-core/src/registry => nvisy-pipeline/src}/provider.rs (82%) rename crates/{nvisy-media => nvisy-pipeline}/src/render/block.rs (100%) rename crates/{nvisy-media => nvisy-pipeline}/src/render/blur.rs (100%) rename crates/{nvisy-media => nvisy-pipeline}/src/render/mod.rs (100%) delete mode 100644 crates/nvisy-server/Cargo.toml delete mode 100644 crates/nvisy-server/README.md delete mode 100644 crates/nvisy-server/src/app/mod.rs delete mode 100644 crates/nvisy-server/src/handler/audit.rs delete mode 100644 crates/nvisy-server/src/handler/graphs.rs delete mode 100644 crates/nvisy-server/src/handler/health.rs delete mode 100644 crates/nvisy-server/src/handler/mod.rs delete mode 100644 crates/nvisy-server/src/handler/policies.rs delete mode 100644 crates/nvisy-server/src/handler/redact.rs delete mode 100644 crates/nvisy-server/src/main.rs delete mode 100644 crates/nvisy-server/src/middleware/mod.rs delete mode 100644 crates/nvisy-server/src/service/audit_store.rs delete mode 100644 crates/nvisy-server/src/service/config.rs delete mode 100644 crates/nvisy-server/src/service/mod.rs delete mode 100644 crates/nvisy-server/src/service/pipeline.rs delete mode 100644 crates/nvisy-server/src/service/policy_store.rs delete mode 100644 crates/nvisy-server/src/service/state.rs diff --git a/Cargo.lock b/Cargo.lock index f2f931a..c780d56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -18,15 +18,6 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "366ffbaa4442f4684d91e2cd7c5ea7c4ed8add41959a31447066e279e432b618" -[[package]] -name = "addr2line" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" -dependencies = [ - "gimli", -] - [[package]] name = "adler2" version = "2.0.1" @@ -158,9 +149,6 @@ name = "anyhow" version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" -dependencies = [ - "backtrace", -] [[package]] name = "approx" @@ -468,86 +456,6 @@ dependencies = [ "arrayvec", ] -[[package]] -name = "axum" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" -dependencies = [ - "axum-core", - "axum-macros", - "bytes", - "form_urlencoded", - "futures-util", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-util", - "itoa", - "matchit", - "memchr", - "mime", - "multer", - "percent-encoding", - "pin-project-lite", - "serde_core", - "serde_json", - "serde_path_to_error", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tower", - "tower-layer", - 
"tower-service", - "tracing", -] - -[[package]] -name = "axum-core" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" -dependencies = [ - "bytes", - "futures-core", - "http", - "http-body", - "http-body-util", - "mime", - "pin-project-lite", - "sync_wrapper", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "axum-macros" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.114", -] - -[[package]] -name = "backtrace" -version = "0.3.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-link", -] - [[package]] name = "base64" version = "0.22.1" @@ -703,7 +611,6 @@ dependencies = [ "iana-time-zone", "js-sys", "num-traits", - "serde", "wasm-bindgen", "windows-link", ] @@ -1397,12 +1304,6 @@ dependencies = [ "weezl", ] -[[package]] -name = "gimli" -version = "0.32.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" - [[package]] name = "h2" version = "0.4.13" @@ -1455,16 +1356,6 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" -[[package]] -name = "hdrhistogram" -version = "7.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" -dependencies = [ - "byteorder", - "num-traits", -] - [[package]] name = "heck" version = "0.5.0" @@ -1808,8 +1699,6 @@ checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" dependencies = [ "equivalent", "hashbrown 0.16.1", - "serde", - "serde_core", ] [[package]] @@ -2162,21 +2051,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "matchers" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" -dependencies = [ - "regex-automata", -] - -[[package]] -name = "matchit" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" - [[package]] name = "matrixmultiply" version = "0.3.10" @@ -2228,12 +2102,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2311,23 +2179,6 @@ dependencies = [ "pxfm", ] -[[package]] -name = "multer" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" -dependencies = [ - "bytes", - "encoding_rs", - "futures-util", - "http", - "httparse", - "memchr", - "mime", - "spin", - "version_check", -] - [[package]] name = "multimap" version = "0.10.1" @@ -2400,15 +2251,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" -[[package]] -name = "nu-ansi-term" -version = "0.50.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" -dependencies = [ - "windows-sys 0.61.2", -] - [[package]] name = "num" version = "0.4.3" @@ -2505,9 +2347,7 @@ name = "nvisy-core" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", - "chrono", "derive_more 1.0.0", "hex", "hipstr", @@ -2525,28 +2365,12 @@ dependencies = [ "uuid", ] -[[package]] -name = "nvisy-detect" -version = "0.1.0" -dependencies = [ - "aho-corasick", - "async-trait", - "nvisy-core", - "nvisy-ontology", - "regex", - "serde", - "serde_json", - "tokio", - "tracing", - "uuid", -] - [[package]] name = "nvisy-engine" version = "0.1.0" dependencies = [ "anyhow", - "chrono", + "jiff", "nvisy-core", "petgraph", "rand 0.9.2", @@ -2575,6 +2399,7 @@ dependencies = [ "parquet", "pdf-extract", "quick-xml 0.37.5", + "schemars", "scraper", "serde", "serde_json", @@ -2584,24 +2409,6 @@ dependencies = [ "zip 2.4.2", ] -[[package]] -name = "nvisy-media" -version = "0.1.0" -dependencies = [ - "async-trait", - "bytes", - "image", - "imageproc", - "lopdf", - "nvisy-core", - "nvisy-ontology", - "serde", - "serde_json", - "tokio", - "tracing", - "uuid", -] - [[package]] name = "nvisy-object" version = "0.1.0" @@ -2611,6 +2418,7 @@ dependencies = [ "futures", "minio", "nvisy-core", + "nvisy-pipeline", "serde", "serde_json", "thiserror", @@ -2623,8 +2431,8 @@ dependencies = [ name = "nvisy-ontology" version = "0.1.0" dependencies = [ - "chrono", "derive_more 1.0.0", + "jiff", "nvisy-core", "schemars", "serde", @@ -2633,60 +2441,54 @@ dependencies = [ ] [[package]] -name = "nvisy-python" +name = "nvisy-pattern" +version = "0.1.0" +dependencies = [ + "nvisy-ontology", + "serde", + "serde_json", +] + +[[package]] +name = "nvisy-pipeline" version = "0.1.0" dependencies = [ + "aho-corasick", "async-trait", + "bytes", + "image", + "imageproc", + "lopdf", "nvisy-core", + "nvisy-ingest", "nvisy-ontology", - "pyo3", + "nvisy-pattern", + "regex", "serde", "serde_json", - "thiserror", "tokio", "tracing", "uuid", ] [[package]] -name = "nvisy-server" +name = "nvisy-python" version = "0.1.0" dependencies = [ - "anyhow", - "axum", - "base64", - "bytes", - "chrono", + "async-trait", "nvisy-core", - "nvisy-detect", - "nvisy-engine", "nvisy-ingest", - "nvisy-media", "nvisy-ontology", - "nvisy-python", - "schemars", + "nvisy-pipeline", + "pyo3", "serde", "serde_json", "thiserror", "tokio", - "tower", - "tower-http", "tracing", - "tracing-subscriber", - "utoipa", - "utoipa-scalar", "uuid", ] -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", -] - [[package]] name = "once_cell" version = "1.21.3" @@ -3410,12 +3212,6 @@ version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" -[[package]] -name = "rustc-demangle" -version = "0.1.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" - [[package]] name = "rustc_version" version = "0.4.1" @@ -3484,7 +3280,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "bytes", - "chrono", "dyn-clone", "ref-cast", "schemars_derive", @@ -3644,17 +3439,6 @@ dependencies = [ "zmij", ] -[[package]] -name = "serde_path_to_error" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" -dependencies = [ - "itoa", - "serde", - "serde_core", -] - [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -3698,15 +3482,6 @@ dependencies = [ "digest", ] -[[package]] -name = "sharded-slab" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" -dependencies = [ - "lazy_static", -] - [[package]] name = "shlex" version = "1.3.0" @@ -3779,12 +3554,6 @@ dependencies = [ "windows-sys 0.60.2", ] -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - [[package]] name = "sptr" version = "0.3.2" @@ -3948,15 +3717,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "thread_local" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" -dependencies = [ - "cfg-if", -] - [[package]] name = "thrift" version = "0.17.0" @@ -4117,16 +3877,11 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "hdrhistogram", - "indexmap", "pin-project-lite", - "slab", "sync_wrapper", "tokio", - "tokio-util", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -4140,14 +3895,11 @@ dependencies = [ "futures-util", "http", "http-body", - "http-body-util", "iri-string", "pin-project-lite", "tower", "tower-layer", "tower-service", - "tracing", - "uuid", ] [[package]] @@ -4168,7 +3920,6 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ - "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -4192,49 +3943,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", - "valuable", -] - -[[package]] -name = "tracing-log" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" -dependencies = [ - "log", - "once_cell", - "tracing-core", -] - -[[package]] -name = "tracing-serde" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" -dependencies = [ - "serde", - "tracing-core", -] - -[[package]] -name = "tracing-subscriber" -version = "0.3.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" -dependencies = [ - "matchers", - "nu-ansi-term", - "once_cell", - "regex-automata", - "serde", - "serde_json", - "sharded-slab", - "smallvec", - "thread_local", - "tracing", - "tracing-core", - "tracing-log", - "tracing-serde", ] [[package]] @@ -4349,43 +4057,6 @@ version = "0.2.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "utoipa" -version = "5.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993" -dependencies = [ - "indexmap", - "serde", - "serde_json", - "utoipa-gen", -] - -[[package]] -name = "utoipa-gen" -version = "5.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d79d08d92ab8af4c5e8a6da20c47ae3f61a0f1dabc1997cdf2d082b757ca08b" -dependencies = [ - "proc-macro2", - "quote", - "regex", - "syn 2.0.114", - "uuid", -] - -[[package]] -name = "utoipa-scalar" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719" -dependencies = [ - "axum", - "serde", - "serde_json", - "utoipa", -] - [[package]] name = "uuid" version = "1.20.0" @@ -4409,12 +4080,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "valuable" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" - [[package]] name = "vcpkg" version = "0.2.15" diff --git a/Cargo.toml b/Cargo.toml index 274f78d..f6f854f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,14 +4,13 @@ resolver = "2" members = [ "./crates/nvisy-core", - "./crates/nvisy-detect", "./crates/nvisy-engine", "./crates/nvisy-ingest", - "./crates/nvisy-media", "./crates/nvisy-object", + "./crates/nvisy-pattern", + "./crates/nvisy-pipeline", "./crates/nvisy-ontology", "./crates/nvisy-python", - "./crates/nvisy-server", ] [workspace.package] @@ -34,14 +33,13 @@ documentation = "https://docs.rs/nvisy-runtime" # Internal crates nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } -nvisy-detect = { path = "./crates/nvisy-detect", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } nvisy-ingest = { path = "./crates/nvisy-ingest", version = "0.1.0" } -nvisy-media = { path = "./crates/nvisy-media", version = "0.1.0" } nvisy-object = { path = "./crates/nvisy-object", version = "0.1.0" } +nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } +nvisy-pipeline = { path = "./crates/nvisy-pipeline", version = "0.1.0" } nvisy-ontology = { path = "./crates/nvisy-ontology", version = "0.1.0" } nvisy-python = { path = "./crates/nvisy-python", version = "0.1.0" } -nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } # Async runtime tokio = { version = "1", features = [] } @@ -49,18 +47,8 @@ tokio-util = { version = "0.7", features = [] } futures = { version = "0.3", features = [] } async-trait = { version = "0.1", features = [] } -# HTTP server -axum = { version = "0.8", features = [] } -tower = { version = "0.5", features = [] } -tower-http = { version = "0.6", features = [] } - -# OpenAPI / Documentation -utoipa = { version = "5", features = ["uuid"] } -utoipa-scalar = { version = "0.3", features = [] } - # Observability tracing = { version = "0.1", features = [] } -tracing-subscriber = { version = "0.3", features = [] } # (De)serialization serde = { version = "1.0", features = ["derive"] } @@ -73,7 +61,6 @@ derive_more = { version = "1", features = ["display"] } # Primitive datatypes uuid = { version = "1", features = ["serde", "v4", "v7"] } -chrono = { version = "0.4", features = ["serde"] } bytes = { version = "1", features 
= ["serde"] } # Text processing @@ -87,7 +74,7 @@ petgraph = { version = "0.8", features = [] } infer = { version = "0.19", features = [] } # JSON Schema generation -schemars = { version = "1", features = ["uuid1", "chrono04", "bytes1"] } +schemars = { version = "1", features = ["uuid1", "bytes1"] } # Python interop pyo3 = { version = "0.23", features = [] } @@ -109,11 +96,8 @@ quick-xml = "0.37" arrow = { version = "54", default-features = false } parquet = { version = "54", default-features = false, features = ["arrow"] } -# Encoding -base64 = "0.22" - # Time -jiff = "0.2" +jiff = { version = "0.2", features = ["serde"] } # Interned strings hipstr = "0.6" diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index cf597db..7cd9a1a 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -17,16 +17,9 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[features] -schema = ["dep:schemars"] - [dependencies] # JSON Schema generation -schemars = { workspace = true, optional = true } +schemars = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -34,11 +27,9 @@ serde_json = { workspace = true, features = [] } # Async runtime tokio = { workspace = true, features = ["sync", "fs", "io-util", "rt"] } -async-trait = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["serde", "v4", "v7"] } -chrono = { workspace = true, features = ["serde"] } bytes = { workspace = true, features = ["serde"] } # File type detection diff --git a/crates/nvisy-core/src/datatypes/blob.rs b/crates/nvisy-core/src/datatypes/blob.rs deleted file mode 100644 index a5e4c21..0000000 --- a/crates/nvisy-core/src/datatypes/blob.rs +++ /dev/null @@ -1,137 +0,0 @@ -//! Binary large object type and helpers. - -use std::collections::HashMap; - -use bytes::Bytes; -use serde::de::DeserializeOwned; -use serde::{Deserialize, Serialize}; -use super::Data; - -/// Content type information for a blob. -/// -/// Tracks both the caller-supplied MIME type and the type detected -/// from the file's magic bytes so consumers can choose the most -/// reliable value. -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct BlobContentInfo { - /// MIME type provided by the caller (e.g. from HTTP Content-Type header). - #[serde(skip_serializing_if = "Option::is_none")] - pub mime: Option<String>, - /// MIME type detected from magic bytes. - #[serde(skip_serializing_if = "Option::is_none")] - pub detected_mime: Option<String>, -} - -/// A binary large object flowing through the pipeline. -/// -/// Blobs carry raw byte content along with an artifact registry -/// for derived data produced during pipeline processing. Each -/// pipeline action may attach artifacts (entities, documents, -/// redactions, etc.) to the blob as it passes through. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct Blob { - /// Common data-item fields (id, parent_id, metadata). - #[serde(flatten)] - pub data: Data, - /// Storage path or key identifying this blob's origin. - pub path: String, - /// Raw byte content of the blob. 
- #[serde(with = "bytes_serde")] - #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] - pub content: Bytes, - /// Caller-supplied and auto-detected MIME type information. - pub provided: BlobContentInfo, - /// Artifacts derived from this blob during pipeline processing. - /// - /// Keys are artifact type names (e.g. `"documents"`, `"entities"`, `"redactions"`). - /// Values are lists of JSON-serialized artifacts. Use [`add_artifact`] and - /// [`get_artifacts`] for type-safe access. - #[serde(default, skip_serializing_if = "HashMap::is_empty")] - pub artifacts: HashMap<String, Vec<serde_json::Value>>, -} - -impl Blob { - /// Create a new blob from a storage path and raw content bytes. - /// - /// The MIME type is auto-detected from magic bytes when possible. - pub fn new(path: impl Into<String>, content: impl Into<Bytes>) -> Self { - let content = content.into(); - let detected_mime = infer::get(&content).map(|t| t.mime_type().to_string()); - Self { - data: Data::new(), - path: path.into(), - content, - provided: BlobContentInfo { - mime: None, - detected_mime, - }, - artifacts: HashMap::new(), - } - } - - /// Set the caller-provided MIME type (builder pattern). - pub fn with_content_type(mut self, mime: impl Into<String>) -> Self { - self.provided.mime = Some(mime.into()); - self - } - - /// Get the best-available MIME type (provided takes precedence over detected). - pub fn content_type(&self) -> Option<&str> { - self.provided - .mime - .as_deref() - .or(self.provided.detected_mime.as_deref()) - } - - /// Get the file extension from the path. - pub fn extension(&self) -> Option<&str> { - self.path.rsplit('.').next() - } - - /// Store a serializable artifact under the given key. - pub fn add_artifact<T: Serialize>(&mut self, key: &str, value: &T) -> Result<(), serde_json::Error> { - let json = serde_json::to_value(value)?; - self.artifacts.entry(key.to_string()).or_default().push(json); - Ok(()) - } - - /// Retrieve all artifacts under the given key, deserializing into `T`. - pub fn get_artifacts<T: DeserializeOwned>(&self, key: &str) -> Result<Vec<T>, serde_json::Error> { - match self.artifacts.get(key) { - Some(values) => values.iter().map(|v| serde_json::from_value(v.clone())).collect(), - None => Ok(Vec::new()), - } - } - - /// Check if any artifacts exist under the given key. - pub fn has_artifacts(&self, key: &str) -> bool { - self.artifacts.get(key).is_some_and(|v| !v.is_empty()) - } -} - -pub(crate) mod bytes_serde { - use bytes::Bytes; - use serde::{self, Deserialize, Deserializer, Serializer}; - - pub fn serialize<S>(bytes: &Bytes, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - use serde::ser::SerializeSeq; - let mut seq = serializer.serialize_seq(Some(bytes.len()))?; - for b in bytes.iter() { - seq.serialize_element(b)?; - } - seq.end() - } - - pub fn deserialize<'de, D>(deserializer: D) -> Result<Bytes, D::Error> - where - D: Deserializer<'de>, - { - let v: Vec<u8> = Vec::deserialize(deserializer)?; - Ok(Bytes::from(v)) - } -} diff --git a/crates/nvisy-core/src/datatypes/mod.rs b/crates/nvisy-core/src/datatypes/mod.rs deleted file mode 100644 index b937467..0000000 --- a/crates/nvisy-core/src/datatypes/mod.rs +++ /dev/null @@ -1,61 +0,0 @@ -//! Domain data types for the nvisy pipeline. -//! -//! This module defines the core data structures that flow through the nvisy -//! processing pipeline: blobs and documents. - -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -pub mod blob; -pub mod document; - -/// General-purpose metadata map. 
-pub type Metadata = serde_json::Map<String, serde_json::Value>; - -/// Common fields shared by all domain data items. -/// -/// Every first-class object in the pipeline (blobs, documents, entities, etc.) -/// embeds a `Data` to carry a unique identifier, an optional parent -/// lineage link, and arbitrary metadata. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct Data { - /// Unique identifier for this item, generated as a v4 UUID on creation. - pub id: Uuid, - /// Identifier of the item this was derived from, if any. - #[serde(skip_serializing_if = "Option::is_none")] - pub parent_id: Option<Uuid>, - /// Arbitrary key-value metadata associated with this item. - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option<Metadata>, -} - -impl Data { - /// Create a new `Data` with a freshly generated UUID and no parent or metadata. - pub fn new() -> Self { - Self { - id: Uuid::new_v4(), - parent_id: None, - metadata: None, - } - } - - /// Attach metadata to this item (builder pattern). - pub fn with_metadata(mut self, metadata: Metadata) -> Self { - self.metadata = Some(metadata); - self - } - - /// Set `parent_id` to the id of `parent`, establishing lineage. - pub fn derive_from(mut self, parent: &Data) -> Self { - self.parent_id = Some(parent.id); - self - } -} - -impl Default for Data { - fn default() -> Self { - Self::new() - } -} - diff --git a/crates/nvisy-core/src/error.rs b/crates/nvisy-core/src/error.rs index a7a8452..3d669b7 100644 --- a/crates/nvisy-core/src/error.rs +++ b/crates/nvisy-core/src/error.rs @@ -19,7 +19,6 @@ pub enum ErrorKind { /// An operation exceeded its time limit. Timeout, /// The operation was explicitly cancelled. - #[display("Cancelled")] Cancellation, /// A policy rule was violated. Policy, diff --git a/crates/nvisy-core/src/fs/content_handler.rs b/crates/nvisy-core/src/fs/content_handler.rs index 69d3f8b..8e54505 100644 --- a/crates/nvisy-core/src/fs/content_handler.rs +++ b/crates/nvisy-core/src/fs/content_handler.rs @@ -2,13 +2,15 @@ use std::fmt; use std::path::{Path, PathBuf}; use std::sync::Arc; +use tokio::runtime::Handle; + use crate::path::ContentSource; /// Inner state cleaned up when the last `ContentHandler` reference is dropped. struct ContentHandlerInner { content_source: ContentSource, dir: PathBuf, - runtime_handle: tokio::runtime::Handle, + runtime_handle: Handle, } impl fmt::Debug for ContentHandlerInner { @@ -57,16 +59,12 @@ pub struct ContentHandler { impl ContentHandler { /// Creates a new content handler. - pub(crate) fn new( - content_source: ContentSource, - dir: PathBuf, - runtime_handle: tokio::runtime::Handle, - ) -> Self { + pub(crate) fn new(source: ContentSource, dir: PathBuf, handle: Handle) -> Self { Self { inner: Arc::new(ContentHandlerInner { - content_source, + content_source: source, dir, - runtime_handle, + runtime_handle: handle, }), } } diff --git a/crates/nvisy-core/src/fs/content_metadata.rs b/crates/nvisy-core/src/fs/content_metadata.rs index 23d01da..11cb976 100644 --- a/crates/nvisy-core/src/fs/content_metadata.rs +++ b/crates/nvisy-core/src/fs/content_metadata.rs @@ -13,12 +13,15 @@ use crate::path::ContentSource; /// /// This struct stores metadata about content including its source identifier /// and file path. 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct ContentMetadata { /// Unique identifier for the content source pub content_source: ContentSource, /// Optional path to the source file pub source_path: Option<PathBuf>, + /// Arbitrary key-value metadata associated with this content. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub metadata: Option<serde_json::Map<String, serde_json::Value>>, } impl ContentMetadata { @@ -37,6 +40,7 @@ impl ContentMetadata { Self { content_source, source_path: None, + metadata: None, } } @@ -56,6 +60,7 @@ impl ContentMetadata { Self { content_source, source_path: Some(path.into()), + metadata: None, } } diff --git a/crates/nvisy-core/src/io/content_data.rs b/crates/nvisy-core/src/io/content_data.rs index 6b6073a..e1adb0c 100644 --- a/crates/nvisy-core/src/io/content_data.rs +++ b/crates/nvisy-core/src/io/content_data.rs @@ -160,6 +160,12 @@ pub struct ContentData { /// Lazily computed SHA256 hash of the content. #[serde(skip)] sha256_cache: OnceLock<Bytes>, + /// Caller-supplied MIME type (e.g. from HTTP Content-Type header). + #[serde(skip_serializing_if = "Option::is_none")] + pub mime: Option<String>, + /// MIME type detected from magic bytes. + #[serde(skip_serializing_if = "Option::is_none")] + pub detected_mime: Option<String>, } impl ContentData { @@ -182,6 +188,8 @@ impl ContentData { content_source, data: ContentBytes::new(data), sha256_cache: OnceLock::new(), + mime: None, + detected_mime: None, } } @@ -202,6 +210,8 @@ impl ContentData { content_source, data: ContentBytes::from(text.into()), sha256_cache: OnceLock::new(), + mime: None, + detected_mime: None, } } @@ -211,9 +221,29 @@ impl ContentData { content_source, data, sha256_cache: OnceLock::new(), + mime: None, + detected_mime: None, } } + /// Set the caller-provided MIME type (builder pattern). + #[must_use] + pub fn with_content_type(mut self, mime: impl Into<String>) -> Self { + self.mime = Some(mime.into()); + self + } + + /// Get the best-available MIME type (provided takes precedence over detected). + #[must_use] + pub fn content_type(&self) -> Option<&str> { + self.mime.as_deref().or(self.detected_mime.as_deref()) + } + + /// Detect the MIME type from magic bytes and store it. + pub fn detect_mime(&mut self) { + self.detected_mime = infer::get(self.data.as_bytes()).map(|t| t.mime_type().to_string()); + } + /// Returns the size of the content in bytes. 
#[must_use] pub fn size(&self) -> usize { @@ -376,13 +406,18 @@ impl Clone for ContentData { content_source: self.content_source, data: self.data.clone(), sha256_cache: new_lock, + mime: self.mime.clone(), + detected_mime: self.detected_mime.clone(), } } } impl PartialEq for ContentData { fn eq(&self, other: &Self) -> bool { - self.content_source == other.content_source && self.data == other.data + self.content_source == other.content_source + && self.data == other.data + && self.mime == other.mime + && self.detected_mime == other.detected_mime } } diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index b94c8b9..ad85b54 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -2,12 +2,10 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub mod datatypes; pub mod error; pub mod fs; pub mod io; pub mod path; -pub mod registry; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-core/src/path/source.rs b/crates/nvisy-core/src/path/source.rs index 49b2811..ff8a1cd 100644 --- a/crates/nvisy-core/src/path/source.rs +++ b/crates/nvisy-core/src/path/source.rs @@ -16,10 +16,13 @@ use uuid::Uuid; /// This allows for efficient tracking and correlation of content throughout /// the processing pipeline. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, schemars::JsonSchema)] pub struct ContentSource { /// `UUIDv7` identifier id: Uuid, + /// Optional parent source for lineage tracking + #[serde(skip_serializing_if = "Option::is_none")] + parent_id: Option<Uuid>, } impl ContentSource { @@ -44,6 +47,7 @@ impl ContentSource { Self { id: Uuid::new_v7(timestamp), + parent_id: None, } } @@ -62,7 +66,7 @@ impl ContentSource { /// ``` #[must_use] pub fn from_uuid(id: Uuid) -> Self { - Self { id } + Self { id, parent_id: None } } /// Get the underlying UUID @@ -111,7 +115,33 @@ impl ContentSource { /// ``` pub fn parse(s: &str) -> Result<Self, uuid::Error> { let id = Uuid::parse_str(s)?; - Ok(Self { id }) + Ok(Self { id, parent_id: None }) + } + + /// Get the parent source identifier, if any. + #[must_use] + pub fn parent_id(&self) -> Option<Uuid> { + self.parent_id + } + + /// Set the parent source identifier. + pub fn set_parent_id(&mut self, parent_id: Option<Uuid>) { + self.parent_id = parent_id; + } + + /// Create a copy of this source with the given parent (builder pattern). + #[must_use] + pub fn with_parent(mut self, parent: &ContentSource) -> Self { + self.parent_id = Some(parent.id); + self + } + + /// Create a new content source derived from this one (new ID, self as parent). + #[must_use] + pub fn derive(&self) -> Self { + let mut child = Self::new(); + child.parent_id = Some(self.id); + child } /// Get the timestamp component from the `UUIDv7` diff --git a/crates/nvisy-core/src/prelude.rs b/crates/nvisy-core/src/prelude.rs index 7c8c950..5690858 100644 --- a/crates/nvisy-core/src/prelude.rs +++ b/crates/nvisy-core/src/prelude.rs @@ -3,12 +3,7 @@ //! Import everything from this module to get the most commonly used //! types without individual `use` statements. 
-pub use crate::datatypes::blob::Blob; -pub use crate::datatypes::document::TabularData; -pub use crate::datatypes::Data; pub use crate::error::{Error, ErrorKind, Result}; pub use crate::fs::{ContentFile, ContentHandler, ContentKind, ContentMetadata, ContentRegistry}; pub use crate::io::{AsyncContentRead, AsyncContentWrite, Content, ContentBytes, ContentData, DataReference}; pub use crate::path::ContentSource; -pub use crate::registry::action::Action; -pub use crate::registry::provider::{ConnectedInstance, ProviderFactory}; diff --git a/crates/nvisy-core/src/registry/action.rs b/crates/nvisy-core/src/registry/action.rs deleted file mode 100644 index 23d5133..0000000 --- a/crates/nvisy-core/src/registry/action.rs +++ /dev/null @@ -1,37 +0,0 @@ -//! The `Action` trait -- the fundamental processing unit in a pipeline. - -use serde::de::DeserializeOwned; -use tokio::sync::mpsc; - -use crate::datatypes::blob::Blob; -use crate::error::Error; - -/// A processing step that consumes blobs from an input channel and -/// produces blobs to an output channel. -/// -/// Actions are the primary unit of work in a pipeline. Each action -/// receives blobs via an async MPSC channel, transforms them (possibly -/// attaching artifacts), and forwards results to the next stage. -/// -/// Actions that need a provider client should hold it as a struct field -/// rather than receiving it as a parameter. -#[async_trait::async_trait] -pub trait Action: Send + Sync + 'static { - /// Strongly-typed parameters for this action. - type Params: DeserializeOwned + Send; - - /// Unique identifier for this action (e.g. "detect-regex"). - fn id(&self) -> &str; - - /// Validate action parameters. - fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; - - /// Execute the action, consuming blobs from input and sending results to output. - /// Returns the number of items processed. - async fn execute( - &self, - input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error>; -} diff --git a/crates/nvisy-core/src/registry/mod.rs b/crates/nvisy-core/src/registry/mod.rs deleted file mode 100644 index 825061c..0000000 --- a/crates/nvisy-core/src/registry/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! Core traits defining the pipeline extension points. - -pub mod action; -pub mod provider; diff --git a/crates/nvisy-detect/README.md b/crates/nvisy-detect/README.md deleted file mode 100644 index 5d620aa..0000000 --- a/crates/nvisy-detect/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# nvisy-detect - -Detection and redaction plugin for the Nvisy runtime. Provides regex-based entity detection, checksum validation, policy evaluation, classification, and audit emission. diff --git a/crates/nvisy-detect/src/actions/apply_redaction.rs b/crates/nvisy-detect/src/actions/apply_redaction.rs deleted file mode 100644 index 56cabf3..0000000 --- a/crates/nvisy-detect/src/actions/apply_redaction.rs +++ /dev/null @@ -1,149 +0,0 @@ -//! Action that applies pending redactions to document text. - -use std::collections::HashMap; -use tokio::sync::mpsc; -use uuid::Uuid; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::ontology::redaction::Redaction; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Applies pending [`Redaction`] artifacts to document content. 
-/// -/// The action correlates entities with their redactions, locates the -/// corresponding text spans inside each document, and replaces them with -/// the computed replacement values. The resulting redacted documents are -/// re-emitted as `"documents"` artifacts. -pub struct ApplyRedactionAction; - -/// A single text replacement that has been resolved but not yet applied. -struct PendingRedaction { - /// Byte offset where the redaction starts in the original text. - start_offset: usize, - /// Byte offset where the redaction ends (exclusive) in the original text. - end_offset: usize, - /// The string that will replace the original span. - replacement_value: String, -} - -#[async_trait::async_trait] -impl Action for ApplyRedactionAction { - type Params = (); - - fn id(&self) -> &str { - "apply-redaction" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) - })?; - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) - })?; - let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read redactions artifact: {e}")) - })?; - - let entity_map: HashMap<Uuid, &Entity> = - entities.iter().map(|e| (e.data.id, e)).collect(); - let redaction_map: HashMap<Uuid, &Redaction> = - redactions.iter().map(|r| (r.entity_id, r)).collect(); - - // Clear existing documents -- we will re-add the (possibly redacted) versions - blob.artifacts.remove("documents"); - - for doc in &documents { - let mut pending: Vec<PendingRedaction> = Vec::new(); - - for (entity_id, redaction) in &redaction_map { - let entity = match entity_map.get(entity_id) { - Some(e) => e, - None => continue, - }; - - // Check entity belongs to this document - let belongs = entity.data.parent_id == Some(doc.data.id) - || entity.source_id == Some(doc.data.id); - if !belongs { - continue; - } - - pending.push(PendingRedaction { - start_offset: entity.location.start_offset, - end_offset: entity.location.end_offset, - replacement_value: redaction.replacement_value.clone(), - }); - } - - if pending.is_empty() { - blob.add_artifact("documents", doc).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add document artifact: {e}")) - })?; - count += 1; - continue; - } - - let redacted_content = apply_redactions(&doc.content, &mut pending); - let mut result = Document::new(redacted_content); - result.title = doc.title.clone(); - result.elements = doc.elements.clone(); - result.source_format = doc.source_format.clone(); - result.page_count = doc.page_count; - result.data.parent_id = Some(doc.data.id); - - blob.add_artifact("documents", &result).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add document artifact: {e}")) - })?; - count += 1; - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} - -/// Applies a set of pending redactions to `text`, returning the redacted result. 
-/// -/// Replacements are applied right-to-left (descending start offset) so that -/// earlier byte offsets remain valid after each substitution. -fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { - // Sort by start offset descending (right-to-left) to preserve positions - pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); - - let mut result = text.to_string(); - for redaction in pending.iter() { - let start = redaction.start_offset.min(result.len()); - let end = redaction.end_offset.min(result.len()); - if start >= end { - continue; - } - - result = format!( - "{}{}{}", - &result[..start], - redaction.replacement_value, - &result[end..] - ); - } - result -} diff --git a/crates/nvisy-detect/src/actions/classify.rs b/crates/nvisy-detect/src/actions/classify.rs deleted file mode 100644 index 28855f6..0000000 --- a/crates/nvisy-detect/src/actions/classify.rs +++ /dev/null @@ -1,96 +0,0 @@ -//! Sensitivity classification action. - -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Assigns a sensitivity level to each blob based on its detected entities. -/// -/// The action inspects the `"entities"` artifact, computes a sensitivity level -/// (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`), and writes it -/// into the blob metadata as `"sensitivityLevel"`. It also records the -/// `"totalEntities"` count. -pub struct ClassifyAction; - -#[async_trait::async_trait] -impl Action for ClassifyAction { - type Params = (); - - fn id(&self) -> &str { - "classify" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) - })?; - - let sensitivity_level = compute_sensitivity_level(&entities); - - let mut meta = blob.data.metadata.clone().unwrap_or_default(); - meta.insert( - "sensitivityLevel".to_string(), - serde_json::Value::String(sensitivity_level), - ); - meta.insert( - "totalEntities".to_string(), - serde_json::Value::Number(entities.len().into()), - ); - blob.data.metadata = Some(meta); - - count += 1; - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} - -/// Computes a sensitivity level string from a set of detected entities. -/// -/// The heuristic is: -/// - `"none"` -- no entities. -/// - `"critical"` -- at least one high-confidence (>= 0.9) credential, SSN, or credit card. -/// - `"high"` -- any critical type present, or more than 10 entities total. -/// - `"medium"` -- more than 3 entities. -/// - `"low"` -- 1-3 non-critical entities. 
-fn compute_sensitivity_level(entities: &[Entity]) -> String { - if entities.is_empty() { - return "none".to_string(); - } - - let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); - let has_critical_types = entities.iter().any(|e| { - matches!(e.category, nvisy_ontology::ontology::entity::EntityCategory::Credentials) - || e.entity_type == "ssn" - || e.entity_type == "credit_card" - }); - - if has_critical_types && has_high_confidence { - return "critical".to_string(); - } - if has_critical_types || entities.len() > 10 { - return "high".to_string(); - } - if entities.len() > 3 { - return "medium".to_string(); - } - "low".to_string() -} diff --git a/crates/nvisy-detect/src/actions/detect_checksum.rs b/crates/nvisy-detect/src/actions/detect_checksum.rs deleted file mode 100644 index 25e2f85..0000000 --- a/crates/nvisy-detect/src/actions/detect_checksum.rs +++ /dev/null @@ -1,120 +0,0 @@ -//! Checksum-based entity validation action. - -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -use crate::patterns::validators::luhn_check; - -/// Typed parameters for [`DetectChecksumAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DetectChecksumParams { - /// Whether to discard entities that fail validation. - #[serde(default = "default_true")] - pub drop_invalid: bool, - /// Amount added to confidence on successful validation. - #[serde(default = "default_boost")] - pub confidence_boost: f64, -} - -fn default_true() -> bool { true } -fn default_boost() -> f64 { 0.05 } - -/// Validates previously detected entities using checksum algorithms. -/// -/// Entities whose type has a registered validator (e.g. Luhn for credit cards) -/// are verified. Valid matches receive a confidence boost and are re-emitted -/// with [`DetectionMethod::Checksum`]. Invalid matches can optionally be -/// dropped from the pipeline. 
-pub struct DetectChecksumAction; - -#[async_trait::async_trait] -impl Action for DetectChecksumAction { - type Params = DetectChecksumParams; - - fn id(&self) -> &str { - "detect-checksum" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let drop_invalid = params.drop_invalid; - let confidence_boost = params.confidence_boost; - - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) - })?; - - // Clear existing entities -- we will re-add validated ones - blob.artifacts.remove("entities"); - - for entity in entities { - let validator = get_validator(&entity.entity_type); - - if let Some(validate) = validator { - let is_valid = validate(&entity.value); - - if !is_valid && drop_invalid { - continue; - } - - if is_valid { - let mut boosted = Entity::new( - entity.category, - &entity.entity_type, - &entity.value, - DetectionMethod::Checksum, - (entity.confidence + confidence_boost).min(1.0), - entity.location.clone(), - ); - boosted.data.parent_id = entity.data.parent_id; - boosted.source_id = entity.source_id; - - blob.add_artifact("entities", &boosted).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) - })?; - - count += 1; - continue; - } - } - - // No validator or not valid but not dropping -- pass through - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) - })?; - count += 1; - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} - -/// Returns the checksum validator function for a given entity type, if one exists. -fn get_validator(entity_type: &str) -> Option<fn(&str) -> bool> { - match entity_type { - "credit_card" => Some(luhn_check), - _ => None, - } -} diff --git a/crates/nvisy-detect/src/actions/detect_manual.rs b/crates/nvisy-detect/src/actions/detect_manual.rs deleted file mode 100644 index 7768323..0000000 --- a/crates/nvisy-detect/src/actions/detect_manual.rs +++ /dev/null @@ -1,87 +0,0 @@ -//! Manual annotation detection action. -//! -//! Converts user-provided [`ManualAnnotation`]s from the blob's -//! `"manual_entities"` artifact into full [`Entity`] objects. - -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; -use nvisy_ontology::redaction::ManualAnnotation; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Typed parameters for [`DetectManualAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DetectManualParams {} - -/// Reads `"manual_entities"` artifacts from the blob (injected by the -/// server from `RedactionContext.manual_entities`) and converts each -/// [`ManualAnnotation`] into a full [`Entity`] with -/// `DetectionMethod::Manual` and confidence 1.0. 
-pub struct DetectManualAction; - -#[async_trait::async_trait] -impl Action for DetectManualAction { - type Params = DetectManualParams; - - fn id(&self) -> &str { - "detect-manual" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let annotations: Vec<ManualAnnotation> = - blob.get_artifacts("manual_entities").map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to read manual_entities artifact: {e}"), - ) - })?; - - for ann in &annotations { - let entity = Entity::new( - ann.category, - &ann.entity_type, - &ann.value, - DetectionMethod::Manual, - 1.0, - EntityLocation { - start_offset: ann.start_offset.unwrap_or(0), - end_offset: ann.end_offset.unwrap_or(0), - element_id: None, - page_number: ann.page_number, - bounding_box: ann.bounding_box.clone(), - row_index: ann.row_index, - column_index: ann.column_index, - image_id: None, - }, - ); - - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add entity: {e}")) - })?; - count += 1; - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} diff --git a/crates/nvisy-detect/src/actions/detect_regex.rs b/crates/nvisy-detect/src/actions/detect_regex.rs deleted file mode 100644 index 1603a39..0000000 --- a/crates/nvisy-detect/src/actions/detect_regex.rs +++ /dev/null @@ -1,143 +0,0 @@ -//! Regex-based PII/PHI entity detection action. - -use regex::Regex; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -use crate::patterns::{self, PatternDefinition}; - -/// Typed parameters for [`DetectRegexAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DetectRegexParams { - /// Minimum pattern confidence to emit. - #[serde(default)] - pub confidence_threshold: f64, - /// Subset of built-in pattern names to use. `None` means all. - #[serde(default)] - pub patterns: Option<Vec<String>>, -} - -/// Scans document text against compiled regex patterns to detect PII/PHI entities. -/// -/// For each blob the action reads the `"documents"` artifact (or falls back to -/// the raw blob content), runs every active pattern, optionally validates -/// matches, and appends resulting [`Entity`] artifacts. 
-pub struct DetectRegexAction; - -#[async_trait::async_trait] -impl Action for DetectRegexAction { - type Params = DetectRegexParams; - - fn id(&self) -> &str { - "detect-regex" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let confidence_threshold = params.confidence_threshold; - let requested_patterns = params.patterns; - - // Resolve patterns - let active_patterns = resolve_patterns(&requested_patterns); - - // Compile regexes - let compiled: Vec<(&PatternDefinition, Regex)> = active_patterns - .iter() - .filter_map(|p| Regex::new(&p.pattern_str).ok().map(|r| (*p, r))) - .collect(); - - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) - })?; - - let docs = if documents.is_empty() { - // No documents artifact -- treat blob content as plain text - let text = String::from_utf8_lossy(&blob.content).into_owned(); - vec![Document::new(text)] - } else { - documents - }; - - for doc in &docs { - for (pattern, regex) in &compiled { - for mat in regex.find_iter(&doc.content) { - let value = mat.as_str(); - - if let Some(validate) = pattern.validate { - if !validate(value) { - continue; - } - } - - if pattern.confidence < confidence_threshold { - continue; - } - - let mut entity = Entity::new( - pattern.category, - &pattern.entity_type, - value, - DetectionMethod::Regex, - pattern.confidence, - EntityLocation { - start_offset: mat.start(), - end_offset: mat.end(), - element_id: None, - page_number: None, - bounding_box: None, - row_index: None, - column_index: None, - image_id: None, - }, - ); - entity.source_id = Some(doc.data.id); - entity.data.parent_id = Some(doc.data.id); - - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add entity artifact: {e}")) - })?; - - count += 1; - } - } - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} - -/// Resolves the set of active patterns from an optional list of requested names. -/// -/// When `requested` is `None` or empty, all built-in patterns are returned. -fn resolve_patterns(requested: &Option<Vec<String>>) -> Vec<&'static PatternDefinition> { - match requested { - Some(names) if !names.is_empty() => names - .iter() - .filter_map(|n| patterns::get_pattern(n)) - .collect(), - _ => patterns::get_all_patterns(), - } -} diff --git a/crates/nvisy-detect/src/actions/detect_tabular.rs b/crates/nvisy-detect/src/actions/detect_tabular.rs deleted file mode 100644 index 8e43bd7..0000000 --- a/crates/nvisy-detect/src/actions/detect_tabular.rs +++ /dev/null @@ -1,134 +0,0 @@ -//! Column-based rule matching for tabular data. - -use regex::Regex; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::TabularData; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// A rule that matches column headers to classify entire columns. -#[derive(Debug, Clone, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ColumnRule { - /// Regex pattern to match against column names. 
- pub column_name_pattern: String, - /// Entity category for matches in the column. - pub category: EntityCategory, - /// Entity type label for matches. - pub entity_type: String, -} - -/// Typed parameters for [`DetectTabularAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DetectTabularParams { - /// Column-matching rules. - pub column_rules: Vec<ColumnRule>, -} - -/// Matches column headers against rules and marks every non-empty cell -/// in matched columns as an entity. -pub struct DetectTabularAction; - -#[async_trait::async_trait] -impl Action for DetectTabularAction { - type Params = DetectTabularParams; - - fn id(&self) -> &str { - "detect-tabular" - } - - fn validate_params(&self, params: &Self::Params) -> Result<(), Error> { - for rule in ¶ms.column_rules { - Regex::new(&rule.column_name_pattern).map_err(|e| { - Error::new( - ErrorKind::Validation, - format!("invalid column_name_pattern '{}': {e}", rule.column_name_pattern), - ) - })?; - } - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - // Compile column-name regexes - let compiled_rules: Vec<(Regex, &ColumnRule)> = params - .column_rules - .iter() - .filter_map(|r| Regex::new(&r.column_name_pattern).ok().map(|re| (re, r))) - .collect(); - - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read tabular artifact: {e}")) - })?; - - for table in &tables { - // For each column, check if any rule matches the column name - for (col_idx, col_name) in table.columns.iter().enumerate() { - for (regex, rule) in &compiled_rules { - if !regex.is_match(col_name) { - continue; - } - - // Mark every non-empty cell in this column - for (row_idx, row) in table.rows.iter().enumerate() { - if let Some(cell) = row.get(col_idx) { - if cell.is_empty() { - continue; - } - - let entity = Entity::new( - rule.category, - &rule.entity_type, - cell.as_str(), - DetectionMethod::Composite, - 0.9, - EntityLocation { - start_offset: 0, - end_offset: cell.len(), - element_id: None, - page_number: None, - bounding_box: None, - row_index: Some(row_idx), - column_index: Some(col_idx), - image_id: None, - }, - ) - .with_source_id(table.data.id); - - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to add entity: {e}"), - ) - })?; - count += 1; - } - } - - // Only apply first matching rule per column - break; - } - } - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} diff --git a/crates/nvisy-detect/src/actions/emit_audit.rs b/crates/nvisy-detect/src/actions/emit_audit.rs deleted file mode 100644 index 63edb42..0000000 --- a/crates/nvisy-detect/src/actions/emit_audit.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! Audit trail emission action. - -use serde::Deserialize; -use tokio::sync::mpsc; -use uuid::Uuid; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_ontology::ontology::audit::{Audit, AuditAction}; -use nvisy_ontology::ontology::redaction::Redaction; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Typed parameters for [`EmitAuditAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct EmitAuditParams { - /// Pipeline run identifier to attach. 
- #[serde(default)] - pub run_id: Option<Uuid>, - /// Human or service identity to record. - #[serde(default)] - pub actor: Option<String>, -} - -/// Emits an [`Audit`] record for every [`Redaction`] found in the blob. -/// -/// Each audit entry captures the redaction method, replacement value, and -/// (when available) the originating policy rule ID. -pub struct EmitAuditAction; - -#[async_trait::async_trait] -impl Action for EmitAuditAction { - type Params = EmitAuditParams; - - fn id(&self) -> &str { - "emit-audit" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let run_id = params.run_id; - let actor = params.actor; - - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read redactions artifact: {e}")) - })?; - - for redaction in &redactions { - let mut audit = Audit::new(AuditAction::Redaction) - .with_entity_id(redaction.entity_id) - .with_redaction_id(redaction.data.id); - - if let Some(run_id) = run_id { - audit = audit.with_run_id(run_id); - } - if let Some(ref actor) = actor { - audit = audit.with_actor(actor); - } - - let mut details = serde_json::Map::new(); - details.insert( - "method".to_string(), - serde_json::to_value(redaction.method).unwrap_or_default(), - ); - details.insert( - "replacementValue".to_string(), - serde_json::Value::String(redaction.replacement_value.clone()), - ); - if let Some(ref rule_id) = redaction.policy_rule_id { - details.insert( - "policyRuleId".to_string(), - serde_json::Value::String(rule_id.clone()), - ); - } - audit = audit.with_details(details); - - audit.data.parent_id = Some(redaction.data.id); - - blob.add_artifact("audits", &audit).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add audit artifact: {e}")) - })?; - - count += 1; - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} diff --git a/crates/nvisy-detect/src/lib.rs b/crates/nvisy-detect/src/lib.rs deleted file mode 100644 index 955b5bb..0000000 --- a/crates/nvisy-detect/src/lib.rs +++ /dev/null @@ -1,19 +0,0 @@ -//! PII/PHI detection actions for the nvisy pipeline. -//! -//! This crate provides the detection, classification, policy evaluation, -//! redaction, and audit-trail stages used by the nvisy runtime. It also -//! ships a built-in set of regex patterns compiled from `assets/patterns.json`. - -#![forbid(unsafe_code)] -#![cfg_attr(docsrs, feature(doc_cfg))] -#![doc = include_str!("../README.md")] - -/// Pipeline actions for detection, classification, policy, redaction, and audit. -pub mod actions; -/// Built-in dictionary data for name and term matching. -pub mod dictionaries; -/// Built-in regex pattern definitions and validation helpers. -pub mod patterns; - -#[doc(hidden)] -pub mod prelude; diff --git a/crates/nvisy-detect/src/prelude.rs b/crates/nvisy-detect/src/prelude.rs deleted file mode 100644 index e99bbe7..0000000 --- a/crates/nvisy-detect/src/prelude.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Convenience re-exports. 
-pub use crate::actions::apply_redaction::ApplyRedactionAction; -pub use crate::actions::classify::ClassifyAction; -pub use crate::actions::detect_checksum::DetectChecksumAction; -pub use crate::actions::detect_dictionary::DetectDictionaryAction; -pub use crate::actions::detect_manual::DetectManualAction; -pub use crate::actions::detect_regex::DetectRegexAction; -pub use crate::actions::detect_tabular::DetectTabularAction; -pub use crate::actions::emit_audit::EmitAuditAction; -pub use crate::actions::evaluate_policy::EvaluatePolicyAction; diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index dad052c..2206600 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -17,19 +17,12 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[features] -schema = ["dep:schemars", "nvisy-core/schema"] - [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } # JSON Schema generation -schemars = { workspace = true, optional = true } +schemars = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -41,7 +34,7 @@ tokio-util = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } -chrono = { workspace = true, features = [] } +jiff = { workspace = true } # Graph data structures petgraph = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/src/compiler/graph.rs b/crates/nvisy-engine/src/compiler/graph.rs index 7cc9c6c..ab5056c 100644 --- a/crates/nvisy-engine/src/compiler/graph.rs +++ b/crates/nvisy-engine/src/compiler/graph.rs @@ -12,7 +12,7 @@ use crate::policies::retry::RetryPolicy; /// Nodes are serialized with a `"type"` discriminator so JSON definitions /// can specify `"source"`, `"action"`, or `"target"`. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum GraphNode { /// A data source that reads from an external provider via a named stream. @@ -109,7 +109,7 @@ impl GraphNode { /// A directed edge connecting two nodes by their IDs. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct GraphEdge { /// ID of the upstream (source) node. pub from: String, @@ -121,7 +121,7 @@ pub struct GraphEdge { /// /// The graph must be a valid DAG (directed acyclic graph) with unique node IDs. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Graph { /// All nodes in the pipeline. pub nodes: Vec<GraphNode>, diff --git a/crates/nvisy-engine/src/connections/mod.rs b/crates/nvisy-engine/src/connections/mod.rs index 73793a1..cab9543 100644 --- a/crates/nvisy-engine/src/connections/mod.rs +++ b/crates/nvisy-engine/src/connections/mod.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; /// A validated connection to an external service such as S3 or a database. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Connection { /// Provider type identifier (e.g. `"s3"`, `"postgres"`). 
#[serde(rename = "type")] diff --git a/crates/nvisy-engine/src/executor/context.rs b/crates/nvisy-engine/src/executor/context.rs index 0189e88..58c5c7d 100644 --- a/crates/nvisy-engine/src/executor/context.rs +++ b/crates/nvisy-engine/src/executor/context.rs @@ -1,21 +1,21 @@ //! Channel primitives used to wire data flow between pipeline nodes. //! -//! [`EdgeChannel`] carries [`Blob`] items along a graph edge, while +//! [`EdgeChannel`] carries [`ContentData`] items along a graph edge, while //! [`NodeSignal`] broadcasts node completion. use tokio::sync::{mpsc, watch}; -use nvisy_core::datatypes::blob::Blob; +use nvisy_core::io::ContentData; /// Default buffer size for bounded inter-node MPSC channels. pub const CHANNEL_BUFFER_SIZE: usize = 256; -/// A bounded MPSC channel pair used to transfer [`Blob`] items along a +/// A bounded MPSC channel pair used to transfer [`ContentData`] items along a /// single graph edge from an upstream node to a downstream node. pub struct EdgeChannel { /// Sending half, held by the upstream node. - pub sender: mpsc::Sender<Blob>, + pub sender: mpsc::Sender<ContentData>, /// Receiving half, held by the downstream node. - pub receiver: mpsc::Receiver<Blob>, + pub receiver: mpsc::Receiver<ContentData>, } impl Default for EdgeChannel { diff --git a/crates/nvisy-engine/src/executor/runner.rs b/crates/nvisy-engine/src/executor/runner.rs index fcfdbf3..4a949ac 100644 --- a/crates/nvisy-engine/src/executor/runner.rs +++ b/crates/nvisy-engine/src/executor/runner.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; use tokio::sync::{mpsc, watch}; use tokio::task::JoinSet; use uuid::Uuid; -use nvisy_core::datatypes::blob::Blob; +use nvisy_core::io::ContentData; use nvisy_core::error::Error; use crate::compiler::plan::ExecutionPlan; use crate::connections::Connections; @@ -17,7 +17,7 @@ use crate::compiler::graph::GraphNode; /// Outcome of executing a single node in the pipeline. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct NodeResult { /// ID of the node that produced this result. pub node_id: String, @@ -29,7 +29,7 @@ pub struct NodeResult { /// Aggregate outcome of executing an entire pipeline graph. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RunResult { /// Unique identifier for this execution run. pub run_id: Uuid, @@ -49,8 +49,8 @@ pub async fn run_graph( let run_id = Uuid::new_v4(); // Create channels for each edge - let mut senders: HashMap<String, Vec<mpsc::Sender<Blob>>> = HashMap::new(); - let mut receivers: HashMap<String, Vec<mpsc::Receiver<Blob>>> = HashMap::new(); + let mut senders: HashMap<String, Vec<mpsc::Sender<ContentData>>> = HashMap::new(); + let mut receivers: HashMap<String, Vec<mpsc::Receiver<ContentData>>> = HashMap::new(); for node in &plan.nodes { let node_id = node.node.id(); @@ -142,8 +142,8 @@ pub async fn run_graph( /// Execute a single node with its channels (simplified -- does not use registry directly). async fn execute_node( _node: &GraphNode, - senders: Vec<mpsc::Sender<Blob>>, - mut receivers: Vec<mpsc::Receiver<Blob>>, + senders: Vec<mpsc::Sender<ContentData>>, + mut receivers: Vec<mpsc::Receiver<ContentData>>, ) -> Result<u64, Error> { // For now, forward items from receivers to senders (passthrough behavior). // The actual registry-based dispatch happens via the Engine wrapper. 
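To make the passthrough behavior above concrete, a node task wired with these ContentData channels boils down to something like the following sketch (illustrative only -- the real dispatch is registry-driven via the Engine wrapper):

    use nvisy_core::io::ContentData;
    use tokio::sync::mpsc;

    /// Forward every received item to all downstream edges and count the items seen.
    async fn passthrough(
        mut input: mpsc::Receiver<ContentData>,
        outputs: Vec<mpsc::Sender<ContentData>>,
    ) -> u64 {
        let mut count = 0u64;
        while let Some(item) = input.recv().await {
            for tx in &outputs {
                // A closed downstream channel is not fatal; that edge simply stops receiving.
                let _ = tx.send(item.clone()).await;
            }
            count += 1;
        }
        count
    }
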
diff --git a/crates/nvisy-engine/src/policies/retry.rs b/crates/nvisy-engine/src/policies/retry.rs index 62b2337..859d6fe 100644 --- a/crates/nvisy-engine/src/policies/retry.rs +++ b/crates/nvisy-engine/src/policies/retry.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; /// /// Defaults to 3 retries with a 1 000 ms fixed delay. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RetryPolicy { /// Maximum number of retry attempts after the initial failure. #[serde(default = "default_max_retries")] @@ -39,7 +39,7 @@ impl Default for RetryPolicy { /// Strategy for computing the delay between retry attempts. #[derive(Debug, Clone, Default, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum BackoffStrategy { /// Constant delay equal to `delay_ms` on every attempt. diff --git a/crates/nvisy-engine/src/runs/mod.rs b/crates/nvisy-engine/src/runs/mod.rs index 2423efa..da290e5 100644 --- a/crates/nvisy-engine/src/runs/mod.rs +++ b/crates/nvisy-engine/src/runs/mod.rs @@ -6,7 +6,7 @@ use std::collections::HashMap; use std::sync::Arc; -use chrono::{DateTime, Utc}; +use jiff::Timestamp; use tokio::sync::RwLock; use tokio_util::sync::CancellationToken; use uuid::Uuid; @@ -14,7 +14,7 @@ use crate::executor::runner::RunResult; /// Lifecycle status of a pipeline run. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RunStatus { /// The run has been created but not yet started. @@ -33,7 +33,7 @@ pub enum RunStatus { /// Execution progress of a single node within a run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct NodeProgress { /// ID of the node this progress belongs to. pub node_id: String, @@ -48,17 +48,19 @@ pub struct NodeProgress { /// Complete mutable state of a pipeline run. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RunState { /// Unique run identifier. pub id: Uuid, /// Current overall status. pub status: RunStatus, /// Timestamp when the run was created. - pub created_at: DateTime<Utc>, + #[schemars(with = "String")] + pub created_at: Timestamp, /// Timestamp when the run finished, if applicable. #[serde(skip_serializing_if = "Option::is_none")] - pub completed_at: Option<DateTime<Utc>>, + #[schemars(with = "Option<String>")] + pub completed_at: Option<Timestamp>, /// Per-node progress keyed by node ID. pub node_progress: HashMap<String, NodeProgress>, /// Final result after the run completes. @@ -68,17 +70,19 @@ pub struct RunState { /// Lightweight summary of a run for listing endpoints. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RunSummary { /// Unique run identifier. pub id: Uuid, /// Current overall status. pub status: RunStatus, /// Timestamp when the run was created. - pub created_at: DateTime<Utc>, + #[schemars(with = "String")] + pub created_at: Timestamp, /// Timestamp when the run finished, if applicable. 
#[serde(skip_serializing_if = "Option::is_none")] - pub completed_at: Option<DateTime<Utc>>, + #[schemars(with = "Option<String>")] + pub completed_at: Option<Timestamp>, } /// Thread-safe manager that tracks all pipeline runs. @@ -109,7 +113,7 @@ impl RunManager { let state = RunState { id, status: RunStatus::Pending, - created_at: Utc::now(), + created_at: Timestamp::now(), completed_at: None, node_progress: HashMap::new(), result: None, @@ -138,7 +142,7 @@ impl RunManager { } else { RunStatus::Failure }; - state.completed_at = Some(Utc::now()); + state.completed_at = Some(Timestamp::now()); for nr in &result.node_results { state.node_progress.insert( @@ -188,7 +192,7 @@ impl RunManager { token.cancel(); if let Some(state) = self.runs.write().await.get_mut(&id) { state.status = RunStatus::Cancelled; - state.completed_at = Some(Utc::now()); + state.completed_at = Some(Timestamp::now()); } true } else { diff --git a/crates/nvisy-ingest/Cargo.toml b/crates/nvisy-ingest/Cargo.toml index cad8474..7b04966 100644 --- a/crates/nvisy-ingest/Cargo.toml +++ b/crates/nvisy-ingest/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "nvisy-ingest" -description = "File-format loaders for the Nvisy multimodal redaction platform" +description = "File-format loaders and unified Document type for the Nvisy multimodal redaction platform" keywords = ["nvisy", "ingest", "loader", "pdf", "docx"] categories = ["parser-implementations"] @@ -23,17 +23,26 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["pdf", "docx", "html", "xlsx", "parquet", "image"] +# PDF parsing and text extraction via pdf-extract + lopdf pdf = ["dep:pdf-extract", "dep:lopdf"] +# Microsoft Word (.docx) parsing via zip + quick-xml docx = ["dep:zip", "dep:quick-xml"] +# HTML parsing and text extraction via scraper html = ["dep:scraper"] +# Excel (.xlsx) spreadsheet parsing via calamine xlsx = ["dep:calamine"] +# Apache Parquet columnar data via arrow + parquet parquet = ["dep:parquet", "dep:arrow"] +# Image decoding (PNG, JPEG, TIFF) via the image crate image = ["dep:image"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +# JSON Schema generation +schemars = { workspace = true } + # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-ingest/src/audio/mod.rs b/crates/nvisy-ingest/src/audio/mod.rs new file mode 100644 index 0000000..1849018 --- /dev/null +++ b/crates/nvisy-ingest/src/audio/mod.rs @@ -0,0 +1,4 @@ +//! Audio file loaders. + +pub mod wav; +pub mod mp3; diff --git a/crates/nvisy-ingest/src/audio/mp3.rs b/crates/nvisy-ingest/src/audio/mp3.rs new file mode 100644 index 0000000..445003c --- /dev/null +++ b/crates/nvisy-ingest/src/audio/mp3.rs @@ -0,0 +1,51 @@ +//! MP3 audio file loader. +//! +//! Returns a document with metadata only -- audio redaction is not yet implemented. + +use serde::Deserialize; + +use nvisy_core::io::ContentData; +use nvisy_core::error::Error; + +use crate::document::Document; +use crate::handler::{Mp3Handler, FormatHandler, AudioLoader}; + +/// Typed parameters for [`Mp3Loader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Mp3LoaderParams {} + +/// Placeholder loader for MP3 audio files. Returns a metadata-only document. 
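+///
+/// The emitted text is purely descriptive; the reported MIME type comes from
+/// `ContentData::content_type()` (caller-provided takes precedence over detected)
+/// and falls back to `"audio/mpeg"` when neither is available.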
+pub struct Mp3Loader; + +impl Clone for Mp3Loader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl AudioLoader for Mp3Loader { + type Params = Mp3LoaderParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let content_type = content.content_type().unwrap_or("audio/mpeg").to_string(); + let size = content.to_bytes().len(); + + let mut doc = Document::new(Mp3Handler) + .with_text(format!( + "[Audio file: type={}, size={} bytes. Audio redaction not yet implemented.]", + content_type, size + )); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for Mp3Loader { + fn id(&self) -> &str { Mp3Handler.id() } + fn extensions(&self) -> &[&str] { Mp3Handler.extensions() } + fn content_types(&self) -> &[&str] { Mp3Handler.content_types() } +} diff --git a/crates/nvisy-ingest/src/audio/wav.rs b/crates/nvisy-ingest/src/audio/wav.rs new file mode 100644 index 0000000..fa9feab --- /dev/null +++ b/crates/nvisy-ingest/src/audio/wav.rs @@ -0,0 +1,51 @@ +//! WAV audio file loader. +//! +//! Returns a document with metadata only -- audio redaction is not yet implemented. + +use serde::Deserialize; + +use nvisy_core::io::ContentData; +use nvisy_core::error::Error; + +use crate::document::Document; +use crate::handler::{WavHandler, FormatHandler, AudioLoader}; + +/// Typed parameters for [`WavLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct WavLoaderParams {} + +/// Placeholder loader for WAV audio files. Returns a metadata-only document. +pub struct WavLoader; + +impl Clone for WavLoader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl AudioLoader for WavLoader { + type Params = WavLoaderParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let content_type = content.content_type().unwrap_or("audio/wav").to_string(); + let size = content.to_bytes().len(); + + let mut doc = Document::new(WavHandler) + .with_text(format!( + "[Audio file: type={}, size={} bytes. Audio redaction not yet implemented.]", + content_type, size + )); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for WavLoader { + fn id(&self) -> &str { WavHandler.id() } + fn extensions(&self) -> &[&str] { WavHandler.extensions() } + fn content_types(&self) -> &[&str] { WavHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/loaders/docx_loader.rs b/crates/nvisy-ingest/src/binary/docx.rs similarity index 80% rename from crates/nvisy-ingest/src/loaders/docx_loader.rs rename to crates/nvisy-ingest/src/binary/docx.rs index d8d71ea..98d61b6 100644 --- a/crates/nvisy-ingest/src/loaders/docx_loader.rs +++ b/crates/nvisy-ingest/src/binary/docx.rs @@ -4,10 +4,12 @@ use bytes::Bytes; use serde::Deserialize; use std::io::Cursor; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, Element, ElementType, ImageData}; +use nvisy_core::io::ContentData; use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; + +use crate::document::Document; +use crate::element::{Element, ElementType}; +use crate::handler::{DocxHandler, ImageHandler, FormatHandler, BinaryLoader}; /// Typed parameters for [`DocxLoader`]. 
#[derive(Debug, Deserialize)] @@ -25,33 +27,25 @@ fn default_true() -> bool { /// Extracts text and optionally images from DOCX files. pub struct DocxLoader; +impl Clone for DocxLoader { + fn clone(&self) -> Self { Self } +} + #[async_trait::async_trait] -impl Loader for DocxLoader { +impl BinaryLoader for DocxLoader { type Params = DocxLoaderParams; - fn id(&self) -> &str { - "docx" - } - - fn extensions(&self) -> &[&str] { - &["docx"] - } - - fn content_types(&self) -> &[&str] { - &["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] - } - async fn load( &self, - blob: &Blob, + content: &ContentData, params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let cursor = Cursor::new(blob.content.to_vec()); + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let cursor = Cursor::new(content.to_bytes().to_vec()); let mut archive = zip::ZipArchive::new(cursor).map_err(|e| { Error::new(ErrorKind::Runtime, format!("Failed to open DOCX ZIP: {e}")) })?; - let mut outputs = Vec::new(); + let mut documents = Vec::new(); let mut elements = Vec::new(); let mut full_text = String::new(); @@ -125,11 +119,11 @@ impl Loader for DocxLoader { } } - let doc = Document::new(full_text) - .with_elements(elements) - .with_source_format("docx"); - - outputs.push(LoaderOutput::Document(doc)); + let mut doc = Document::new(DocxHandler) + .with_text(full_text) + .with_elements(elements); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(doc.into_format()); // Extract images from word/media/ if params.extract_images { @@ -153,14 +147,22 @@ impl Loader for DocxLoader { let mime = infer::get(&buf) .map(|t| t.mime_type().to_string()) .unwrap_or_else(|| "image/png".to_string()); - let img = ImageData::new(Bytes::from(buf), mime) + let mut img = Document::new(ImageHandler) + .with_data(Bytes::from(buf), mime) .with_source_path(&name); - outputs.push(LoaderOutput::Image(img)); + img.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(img.into_format()); } } } } - Ok(outputs) + Ok(documents) } } + +impl crate::handler::Handler for DocxLoader { + fn id(&self) -> &str { DocxHandler.id() } + fn extensions(&self) -> &[&str] { DocxHandler.extensions() } + fn content_types(&self) -> &[&str] { DocxHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/binary/mod.rs b/crates/nvisy-ingest/src/binary/mod.rs new file mode 100644 index 0000000..02642a4 --- /dev/null +++ b/crates/nvisy-ingest/src/binary/mod.rs @@ -0,0 +1,7 @@ +//! Binary document loaders (PDF, DOCX). + +#[cfg(feature = "pdf")] +pub mod pdf; + +#[cfg(feature = "docx")] +pub mod docx; diff --git a/crates/nvisy-ingest/src/loaders/pdf_loader.rs b/crates/nvisy-ingest/src/binary/pdf.rs similarity index 78% rename from crates/nvisy-ingest/src/loaders/pdf_loader.rs rename to crates/nvisy-ingest/src/binary/pdf.rs index 9847c87..982f70a 100644 --- a/crates/nvisy-ingest/src/loaders/pdf_loader.rs +++ b/crates/nvisy-ingest/src/binary/pdf.rs @@ -3,10 +3,11 @@ use bytes::Bytes; use serde::Deserialize; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, ImageData}; +use nvisy_core::io::ContentData; use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; + +use crate::document::Document; +use crate::handler::{PdfHandler, ImageHandler, FormatHandler, BinaryLoader}; /// Typed parameters for [`PdfLoader`]. 
#[derive(Debug, Deserialize)] @@ -27,29 +28,21 @@ fn default_true() -> bool { /// Extracts text and optionally images from PDF files. pub struct PdfLoader; +impl Clone for PdfLoader { + fn clone(&self) -> Self { Self } +} + #[async_trait::async_trait] -impl Loader for PdfLoader { +impl BinaryLoader for PdfLoader { type Params = PdfLoaderParams; - fn id(&self) -> &str { - "pdf" - } - - fn extensions(&self) -> &[&str] { - &["pdf"] - } - - fn content_types(&self) -> &[&str] { - &["application/pdf"] - } - async fn load( &self, - blob: &Blob, + content: &ContentData, params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let bytes = blob.content.to_vec(); - let mut outputs = Vec::new(); + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let bytes = content.to_bytes().to_vec(); + let mut documents = Vec::new(); // Extract text let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| { @@ -62,11 +55,11 @@ impl Loader for PdfLoader { let page_count = lop_doc.get_pages().len() as u32; - let doc = Document::new(text) - .with_source_format("pdf") + let mut doc = Document::new(PdfHandler) + .with_text(text) .with_page_count(page_count); - - outputs.push(LoaderOutput::Document(doc)); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(doc.into_format()); // Extract embedded images if params.extract_images { @@ -148,21 +141,26 @@ impl Loader for PdfLoader { } }); - let mut img = ImageData::new( - Bytes::from(image_bytes), - "image/png", - ) - .with_page_number(page_num); + let mut img = Document::new(ImageHandler) + .with_data(Bytes::from(image_bytes), "image/png") + .with_page_number(page_num); if let (Some(w), Some(h)) = (width, height) { img = img.with_dimensions(w, h); } - outputs.push(LoaderOutput::Image(img)); + img.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(img.into_format()); } } } - Ok(outputs) + Ok(documents) } } + +impl crate::handler::Handler for PdfLoader { + fn id(&self) -> &str { PdfHandler.id() } + fn extensions(&self) -> &[&str] { PdfHandler.extensions() } + fn content_types(&self) -> &[&str] { PdfHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/document.rs b/crates/nvisy-ingest/src/document.rs new file mode 100644 index 0000000..c7a0cee --- /dev/null +++ b/crates/nvisy-ingest/src/document.rs @@ -0,0 +1,299 @@ +//! Unified document representation for any handleable content. + +use bytes::Bytes; +use nvisy_core::path::ContentSource; +use serde::Serialize; + +use crate::element::Element; +use crate::handler::{FormatHandler, Handler}; + +/// A unified representation of any content that can be handled by the pipeline. +/// +/// `Document` is generic over `H`, a [`Handler`] that describes the source +/// format. For heterogeneous collections, use `Document<FormatHandler>`. +/// +/// Fields are grouped by content modality: +/// - **Text** (`content`, `title`, `elements`, `page_count`) — for PDF, DOCX, HTML, etc. +/// - **Binary/image** (`data`, `mime_type`, `width`, `height`, etc.) — for images and raw bytes. +/// - **Tabular** (`columns`, `rows`, `sheet_name`) — for CSV, XLSX, Parquet. +#[derive(Debug, Clone)] +pub struct Document<H: Handler> { + /// Content source identity and lineage. + pub source: ContentSource, + + // -- Text content (from text, PDF, DOCX, HTML, etc.) -- + + /// Full text content, if applicable. + pub content: Option<String>, + /// Document title, if extracted. + pub title: Option<String>, + /// Structural elements parsed from the document. 
+ pub elements: Option<Vec<Element>>, + /// Total page count for paginated formats. + pub page_count: Option<u32>, + + // -- Binary/image content -- + + /// Raw binary data (image bytes, audio bytes, etc.). + pub data: Option<Bytes>, + /// MIME type of the data (e.g. `"image/png"`, `"audio/wav"`). + pub mime_type: Option<String>, + /// Width in pixels (images). + pub width: Option<u32>, + /// Height in pixels (images). + pub height: Option<u32>, + /// File path or URL the content was loaded from. + pub source_path: Option<String>, + /// 1-based page number this was extracted from. + pub page_number: Option<u32>, + + // -- Tabular content -- + + /// Column header names. + pub columns: Option<Vec<String>>, + /// Row data (each inner Vec same length as columns). + pub rows: Option<Vec<Vec<String>>>, + /// Sheet or tab name within a multi-sheet workbook. + pub sheet_name: Option<String>, + + /// Format handler (not serialized). + handler: H, +} + +impl<H: Handler> Document<H> { + /// Create a new empty document with the given handler. + pub fn new(handler: H) -> Self { + Self { + source: ContentSource::new(), + content: None, + title: None, + elements: None, + page_count: None, + data: None, + mime_type: None, + width: None, + height: None, + source_path: None, + page_number: None, + columns: None, + rows: None, + sheet_name: None, + handler, + } + } + + /// Get a reference to the format handler. + pub fn handler(&self) -> &H { + &self.handler + } + + /// Original file format identifier (delegates to `handler.id()`). + pub fn source_format(&self) -> &str { + self.handler.id() + } + + // -- Builder methods -- + + /// Set text content. + pub fn with_text(mut self, content: impl Into<String>) -> Self { + self.content = Some(content.into()); + self + } + + /// Set binary data and MIME type. + pub fn with_data(mut self, data: impl Into<Bytes>, mime: impl Into<String>) -> Self { + self.data = Some(data.into()); + self.mime_type = Some(mime.into()); + self + } + + /// Set tabular content (columns + rows). + pub fn with_tabular(mut self, columns: Vec<String>, rows: Vec<Vec<String>>) -> Self { + self.columns = Some(columns); + self.rows = Some(rows); + self + } + + /// Set the document title. + pub fn with_title(mut self, title: impl Into<String>) -> Self { + self.title = Some(title.into()); + self + } + + /// Attach parsed structural elements. + pub fn with_elements(mut self, elements: Vec<Element>) -> Self { + self.elements = Some(elements); + self + } + + /// Set the total page count. + pub fn with_page_count(mut self, count: u32) -> Self { + self.page_count = Some(count); + self + } + + /// Set pixel dimensions (images). + pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { + self.width = Some(width); + self.height = Some(height); + self + } + + /// Set the source file path or URL. + pub fn with_source_path(mut self, path: impl Into<String>) -> Self { + self.source_path = Some(path.into()); + self + } + + /// Set the 1-based page number this was extracted from. + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } + + /// Set the sheet/tab name for tabular data. + pub fn with_sheet_name(mut self, name: impl Into<String>) -> Self { + self.sheet_name = Some(name.into()); + self + } + + /// Convert into a `Document<FormatHandler>` by wrapping the handler. 
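+    ///
+    /// Illustrative sketch (assumes a concrete handler such as `PdfHandler`, which
+    /// this crate's loaders convert via `Into<FormatHandler>`):
+    ///
+    /// ```ignore
+    /// let doc = Document::new(PdfHandler)
+    ///     .with_text("extracted text")
+    ///     .with_page_count(3)
+    ///     .into_format();
+    /// assert!(doc.content.is_some());
+    /// ```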
+ pub fn into_format(self) -> Document<FormatHandler> + where + H: Into<FormatHandler>, + { + Document { + source: self.source, + content: self.content, + title: self.title, + elements: self.elements, + page_count: self.page_count, + data: self.data, + mime_type: self.mime_type, + width: self.width, + height: self.height, + source_path: self.source_path, + page_number: self.page_number, + columns: self.columns, + rows: self.rows, + sheet_name: self.sheet_name, + handler: self.handler.into(), + } + } + + /// Unique BCP-47 language tags collected from all elements. + pub fn languages(&self) -> Vec<String> { + let mut langs = Vec::new(); + if let Some(elements) = &self.elements { + for el in elements { + if let Some(ref element_langs) = el.languages { + for lang in element_langs { + if !langs.contains(lang) { + langs.push(lang.clone()); + } + } + } + } + } + langs + } + + /// Group elements by their 1-based page number. + /// Elements without a page_number are collected under key 0. + pub fn get_elements_by_page(&self) -> std::collections::HashMap<u32, Vec<&Element>> { + let mut map = std::collections::HashMap::new(); + if let Some(elements) = &self.elements { + for el in elements { + let page = el.page_number.unwrap_or(0); + map.entry(page).or_insert_with(Vec::new).push(el); + } + } + map + } + + /// Create a Document by deriving content from element texts joined with "\n\n". + pub fn from_elements(elements: Vec<Element>, handler: H) -> Self { + let content = elements + .iter() + .map(|e| e.text.as_str()) + .collect::<Vec<_>>() + .join("\n\n"); + let mut doc = Self::new(handler); + doc.content = Some(content); + doc.elements = Some(elements); + doc + } +} + +// --------------------------------------------------------------------------- +// Serialization +// --------------------------------------------------------------------------- + +impl<H: Handler> Serialize for Document<H> { + fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { + use serde::ser::SerializeStruct; + + // Count always-present fields + let mut count = 3; // id + parent_id + source_format + if self.content.is_some() { count += 1; } + if self.title.is_some() { count += 1; } + if self.elements.is_some() { count += 1; } + if self.page_count.is_some() { count += 1; } + if self.data.is_some() { count += 1; } + if self.mime_type.is_some() { count += 1; } + if self.width.is_some() { count += 1; } + if self.height.is_some() { count += 1; } + if self.source_path.is_some() { count += 1; } + if self.page_number.is_some() { count += 1; } + if self.columns.is_some() { count += 1; } + if self.rows.is_some() { count += 1; } + if self.sheet_name.is_some() { count += 1; } + + let mut state = serializer.serialize_struct("Document", count)?; + state.serialize_field("id", &self.source.as_uuid())?; + state.serialize_field("parent_id", &self.source.parent_id())?; + state.serialize_field("source_format", self.handler.id())?; + + if let Some(ref content) = self.content { + state.serialize_field("content", content)?; + } + if let Some(ref title) = self.title { + state.serialize_field("title", title)?; + } + if let Some(ref elements) = self.elements { + state.serialize_field("elements", elements)?; + } + if let Some(page_count) = self.page_count { + state.serialize_field("page_count", &page_count)?; + } + if let Some(ref data) = self.data { + state.serialize_field("data", data.as_ref())?; + } + if let Some(ref mime_type) = self.mime_type { + state.serialize_field("mime_type", mime_type)?; + } + if let Some(width) = 
self.width { + state.serialize_field("width", &width)?; + } + if let Some(height) = self.height { + state.serialize_field("height", &height)?; + } + if let Some(ref source_path) = self.source_path { + state.serialize_field("source_path", source_path)?; + } + if let Some(page_number) = self.page_number { + state.serialize_field("page_number", &page_number)?; + } + if let Some(ref columns) = self.columns { + state.serialize_field("columns", columns)?; + } + if let Some(ref rows) = self.rows { + state.serialize_field("rows", rows)?; + } + if let Some(ref sheet_name) = self.sheet_name { + state.serialize_field("sheet_name", sheet_name)?; + } + + state.end() + } +} diff --git a/crates/nvisy-core/src/datatypes/document.rs b/crates/nvisy-ingest/src/element.rs similarity index 57% rename from crates/nvisy-core/src/datatypes/document.rs rename to crates/nvisy-ingest/src/element.rs index 5d7101e..b942597 100644 --- a/crates/nvisy-core/src/datatypes/document.rs +++ b/crates/nvisy-ingest/src/element.rs @@ -1,10 +1,10 @@ -//! Parsed document representation, structural elements, and element ontology. +//! Structural elements extracted from documents and their ontology. -use bytes::Bytes; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use super::Data; -use super::Metadata; + +/// General-purpose metadata map. +pub type Metadata = serde_json::Map<String, serde_json::Value>; // --------------------------------------------------------------------------- // Element ontology @@ -16,7 +16,7 @@ use super::Metadata; /// a coarse filter for pipeline actions that only operate on certain /// kinds of content. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum ElementCategory { /// Narrative text, headings, list items, captions, and addresses. @@ -42,7 +42,7 @@ pub enum ElementCategory { /// Each variant maps to a single [`ElementCategory`] via /// [`ElementType::category`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "kebab-case")] pub enum ElementType { // -- Text -- @@ -142,7 +142,7 @@ pub fn category_of(type_str: &str) -> Option<ElementCategory> { /// An inline hyperlink within element text. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Link { /// Display text of the hyperlink. pub text: String, @@ -154,7 +154,7 @@ pub struct Link { /// An inline formatting span within element text. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct EmphasizedText { /// The emphasized text content. pub text: String, @@ -164,7 +164,7 @@ pub struct EmphasizedText { /// A single cell within a table structure. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct TableCellData { /// Zero-based row index. pub row: usize, @@ -182,7 +182,7 @@ pub struct TableCellData { /// Records how an element was detected and any extraction /// confidence metadata. 
#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct ElementProvenance { /// Confidence score of the extraction (0.0 to 1.0). #[serde(skip_serializing_if = "Option::is_none")] @@ -200,7 +200,7 @@ pub struct ElementProvenance { /// Structured key-value pair extracted from a form. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct FormKeyValuePair { /// Form field label or key. pub key: String, @@ -217,7 +217,7 @@ pub struct FormKeyValuePair { /// Combines base element fields with optional type-specific fields /// (image, table, form, email) in a flat struct rather than inheritance. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Element { /// Unique identifier for this element. pub id: Uuid, @@ -373,235 +373,3 @@ impl Element { self } } - -// --------------------------------------------------------------------------- -// Document -// --------------------------------------------------------------------------- - -/// A parsed human-readable text representation of a document. -/// -/// Documents are produced by loaders from raw blobs and contain the -/// extracted text along with optional structural elements, title, and -/// source format metadata. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct Document { - /// Common data-item fields (id, parent_id, metadata). - #[serde(flatten)] - pub data: Data, - /// Full text content of the document. - pub content: String, - /// Document title, if one was extracted. - #[serde(skip_serializing_if = "Option::is_none")] - pub title: Option<String>, - /// Structural elements (paragraphs, tables, images, etc.) parsed from the document. - #[serde(skip_serializing_if = "Option::is_none")] - pub elements: Option<Vec<Element>>, - /// Original file format (e.g. `"pdf"`, `"docx"`, `"html"`). - #[serde(skip_serializing_if = "Option::is_none")] - pub source_format: Option<String>, - /// Total number of pages, if the source format is paginated. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_count: Option<u32>, -} - -impl Document { - /// Create a new document from raw text content. - pub fn new(content: impl Into<String>) -> Self { - Self { - data: Data::new(), - content: content.into(), - title: None, - elements: None, - source_format: None, - page_count: None, - } - } - - /// Set the document title (builder pattern). - pub fn with_title(mut self, title: impl Into<String>) -> Self { - self.title = Some(title.into()); - self - } - - /// Attach parsed structural elements to this document. - pub fn with_elements(mut self, elements: Vec<Element>) -> Self { - self.elements = Some(elements); - self - } - - /// Record the original file format (e.g. `"pdf"`, `"docx"`). - pub fn with_source_format(mut self, format: impl Into<String>) -> Self { - self.source_format = Some(format.into()); - self - } - - /// Set the total page count for paginated source formats. - pub fn with_page_count(mut self, count: u32) -> Self { - self.page_count = Some(count); - self - } - - /// Create a Document by deriving content from element texts joined with "\n\n". 
- pub fn from_elements(elements: Vec<Element>) -> Self { - let content = elements.iter().map(|e| e.text.as_str()).collect::<Vec<_>>().join("\n\n"); - Self { - data: Data::new(), - content, - title: None, - elements: Some(elements), - source_format: None, - page_count: None, - } - } - - /// Unique BCP-47 language tags collected from all elements. - pub fn languages(&self) -> Vec<String> { - let mut langs = Vec::new(); - if let Some(elements) = &self.elements { - for el in elements { - if let Some(ref element_langs) = el.languages { - for lang in element_langs { - if !langs.contains(lang) { - langs.push(lang.clone()); - } - } - } - } - } - langs - } - - /// Group elements by their 1-based page number. - /// Elements without a page_number are collected under key 0. - pub fn get_elements_by_page(&self) -> std::collections::HashMap<u32, Vec<&Element>> { - let mut map = std::collections::HashMap::new(); - if let Some(elements) = &self.elements { - for el in elements { - let page = el.page_number.unwrap_or(0); - map.entry(page).or_insert_with(Vec::new).push(el); - } - } - map - } -} - -// --------------------------------------------------------------------------- -// TabularData -// --------------------------------------------------------------------------- - -/// Tabular data extracted from spreadsheets, CSV files, or database exports. -/// -/// Represents a two-dimensional table with named columns and string cell -/// values. Carries optional metadata about the original file format and -/// sheet name for multi-sheet workbooks. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct TabularData { - /// Common data-item fields (id, parent_id, metadata). - #[serde(flatten)] - pub data: Data, - /// Column header names. - pub columns: Vec<String>, - /// Row data — each inner Vec has the same length as `columns`. - pub rows: Vec<Vec<String>>, - /// Original file format (e.g. `"csv"`, `"parquet"`, `"xlsx"`). - #[serde(skip_serializing_if = "Option::is_none")] - pub source_format: Option<String>, - /// Sheet or tab name within a multi-sheet workbook. - #[serde(skip_serializing_if = "Option::is_none")] - pub sheet_name: Option<String>, -} - -impl TabularData { - /// Create new tabular data with the given columns and rows. - pub fn new(columns: Vec<String>, rows: Vec<Vec<String>>) -> Self { - Self { - data: Data::new(), - columns, - rows, - source_format: None, - sheet_name: None, - } - } - - /// Record the original file format (e.g. `"csv"`, `"xlsx"`). - pub fn with_source_format(mut self, format: impl Into<String>) -> Self { - self.source_format = Some(format.into()); - self - } - - /// Set the sheet name for multi-sheet workbooks. - pub fn with_sheet_name(mut self, name: impl Into<String>) -> Self { - self.sheet_name = Some(name.into()); - self - } -} - -// --------------------------------------------------------------------------- -// ImageData -// --------------------------------------------------------------------------- - -/// An image extracted from a document or provided directly. -/// -/// Carries the raw pixel data, MIME type, optional dimensions, and -/// provenance information linking back to its source. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] -pub struct ImageData { - /// Common data-item fields (id, parent_id, metadata). - #[serde(flatten)] - pub data: Data, - /// Raw image bytes (PNG, JPEG, etc.). 
- #[serde(with = "crate::datatypes::blob::bytes_serde")] - #[cfg_attr(feature = "schema", schemars(with = "Vec<u8>"))] - pub image_data: Bytes, - /// MIME type of the image (e.g. `"image/png"`). - pub mime_type: String, - /// Width of the image in pixels, if known. - #[serde(skip_serializing_if = "Option::is_none")] - pub width: Option<u32>, - /// Height of the image in pixels, if known. - #[serde(skip_serializing_if = "Option::is_none")] - pub height: Option<u32>, - /// File path or URL the image was loaded from, if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub source_path: Option<String>, - /// 1-based page number the image was extracted from, if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, -} - -impl ImageData { - /// Create a new image from raw bytes and a MIME type. - pub fn new(image_data: impl Into<Bytes>, mime_type: impl Into<String>) -> Self { - Self { - data: Data::new(), - image_data: image_data.into(), - mime_type: mime_type.into(), - width: None, - height: None, - source_path: None, - page_number: None, - } - } - - /// Set the pixel dimensions of the image. - pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { - self.width = Some(width); - self.height = Some(height); - self - } - - /// Record the file path or URL the image originated from. - pub fn with_source_path(mut self, path: impl Into<String>) -> Self { - self.source_path = Some(path.into()); - self - } - - /// Set the page number this image was extracted from. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } -} diff --git a/crates/nvisy-ingest/src/handler.rs b/crates/nvisy-ingest/src/handler.rs new file mode 100644 index 0000000..eaf1648 --- /dev/null +++ b/crates/nvisy-ingest/src/handler.rs @@ -0,0 +1,372 @@ +//! Handler trait, format handler enum, and loader traits. +//! +//! The [`Handler`] supertrait defines metadata shared by all format handlers. +//! The closed [`FormatHandler`] enum provides type erasure so that +//! `Document<FormatHandler>` can represent any supported format in +//! heterogeneous collections. +//! +//! Loader traits ([`TextLoader`], [`BinaryLoader`], [`ImageLoader`], +//! [`SpreadsheetLoader`], [`AudioLoader`]) extend `Handler` with a typed +//! `load()` method that returns `Vec<Document<Self>>`. + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; + +// --------------------------------------------------------------------------- +// Handler supertrait +// --------------------------------------------------------------------------- + +/// Base trait for all format handlers. +/// +/// Every concrete handler (e.g. `CsvHandler`, `PdfHandler`) implements this +/// trait, providing an identifier, supported file extensions, and MIME types. +pub trait Handler: Send + Sync + Clone + 'static { + /// Unique identifier (e.g. `"csv"`, `"pdf"`, `"wav"`). + fn id(&self) -> &str; + /// File extensions this handler supports (e.g. `&["csv"]`). + fn extensions(&self) -> &[&str]; + /// MIME content types this handler supports (e.g. `&["text/csv"]`). + fn content_types(&self) -> &[&str]; +} + +// --------------------------------------------------------------------------- +// Concrete handler structs +// --------------------------------------------------------------------------- + +/// Handles plain-text files (`.txt`, `.text`). 
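
// A minimal sketch of consuming the `Handler` metadata above: a hypothetical
// helper (the name `claims_extension` is invented for illustration and is not
// part of the crate) that asks any handler whether it covers a file extension.
fn claims_extension<H: Handler>(handler: &H, ext: &str) -> bool {
    handler
        .extensions()
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(ext))
}
// For instance, `claims_extension(&CsvHandler, "CSV")` returns true, since the
// `CsvHandler` defined just below reports `&["csv"]` from `extensions()`.
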
+#[derive(Debug, Clone)] +pub struct PlaintextHandler; + +impl Handler for PlaintextHandler { + fn id(&self) -> &str { "plaintext" } + fn extensions(&self) -> &[&str] { &["txt", "text"] } + fn content_types(&self) -> &[&str] { &["text/plain"] } +} + +/// Handles CSV files (`.csv`). +#[derive(Debug, Clone)] +pub struct CsvHandler; + +impl Handler for CsvHandler { + fn id(&self) -> &str { "csv" } + fn extensions(&self) -> &[&str] { &["csv"] } + fn content_types(&self) -> &[&str] { &["text/csv"] } +} + +/// Handles JSON files (`.json`). +#[derive(Debug, Clone)] +pub struct JsonHandler; + +impl Handler for JsonHandler { + fn id(&self) -> &str { "json" } + fn extensions(&self) -> &[&str] { &["json"] } + fn content_types(&self) -> &[&str] { &["application/json"] } +} + +/// Handles HTML files (`.html`, `.htm`). +#[cfg(feature = "html")] +#[derive(Debug, Clone)] +pub struct HtmlHandler; + +#[cfg(feature = "html")] +impl Handler for HtmlHandler { + fn id(&self) -> &str { "html" } + fn extensions(&self) -> &[&str] { &["html", "htm"] } + fn content_types(&self) -> &[&str] { &["text/html"] } +} + +/// Handles PDF files (`.pdf`). +#[cfg(feature = "pdf")] +#[derive(Debug, Clone)] +pub struct PdfHandler; + +#[cfg(feature = "pdf")] +impl Handler for PdfHandler { + fn id(&self) -> &str { "pdf" } + fn extensions(&self) -> &[&str] { &["pdf"] } + fn content_types(&self) -> &[&str] { &["application/pdf"] } +} + +/// Handles DOCX files (`.docx`). +#[cfg(feature = "docx")] +#[derive(Debug, Clone)] +pub struct DocxHandler; + +#[cfg(feature = "docx")] +impl Handler for DocxHandler { + fn id(&self) -> &str { "docx" } + fn extensions(&self) -> &[&str] { &["docx"] } + fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] } +} + +/// Handles image files (PNG, JPEG, TIFF, etc.). +#[cfg(feature = "image")] +#[derive(Debug, Clone)] +pub struct ImageHandler; + +#[cfg(feature = "image")] +impl Handler for ImageHandler { + fn id(&self) -> &str { "image" } + fn extensions(&self) -> &[&str] { &["jpg", "jpeg", "png", "tiff", "bmp", "webp"] } + fn content_types(&self) -> &[&str] { &["image/jpeg", "image/png", "image/tiff", "image/bmp", "image/webp"] } +} + +/// Handles XLSX/XLS spreadsheet files. +#[cfg(feature = "xlsx")] +#[derive(Debug, Clone)] +pub struct XlsxHandler; + +#[cfg(feature = "xlsx")] +impl Handler for XlsxHandler { + fn id(&self) -> &str { "xlsx" } + fn extensions(&self) -> &[&str] { &["xlsx", "xls"] } + fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"] } +} + +/// Handles Apache Parquet files. +#[cfg(feature = "parquet")] +#[derive(Debug, Clone)] +pub struct ParquetHandler; + +#[cfg(feature = "parquet")] +impl Handler for ParquetHandler { + fn id(&self) -> &str { "parquet" } + fn extensions(&self) -> &[&str] { &["parquet"] } + fn content_types(&self) -> &[&str] { &["application/x-parquet"] } +} + +/// Handles WAV audio files. +#[derive(Debug, Clone)] +pub struct WavHandler; + +impl Handler for WavHandler { + fn id(&self) -> &str { "wav" } + fn extensions(&self) -> &[&str] { &["wav"] } + fn content_types(&self) -> &[&str] { &["audio/wav", "audio/x-wav"] } +} + +/// Handles MP3 audio files. 
+#[derive(Debug, Clone)] +pub struct Mp3Handler; + +impl Handler for Mp3Handler { + fn id(&self) -> &str { "mp3" } + fn extensions(&self) -> &[&str] { &["mp3"] } + fn content_types(&self) -> &[&str] { &["audio/mpeg"] } +} + +// --------------------------------------------------------------------------- +// FormatHandler enum — closed type erasure +// --------------------------------------------------------------------------- + +/// Closed enum of all supported format handlers. +/// +/// Provides type erasure: `Document<FormatHandler>` can represent +/// content from any supported format in heterogeneous collections. +#[derive(Debug, Clone)] +pub enum FormatHandler { + Plaintext(PlaintextHandler), + Csv(CsvHandler), + Json(JsonHandler), + #[cfg(feature = "html")] + Html(HtmlHandler), + #[cfg(feature = "pdf")] + Pdf(PdfHandler), + #[cfg(feature = "docx")] + Docx(DocxHandler), + #[cfg(feature = "image")] + Image(ImageHandler), + #[cfg(feature = "xlsx")] + Xlsx(XlsxHandler), + #[cfg(feature = "parquet")] + Parquet(ParquetHandler), + Wav(WavHandler), + Mp3(Mp3Handler), +} + +impl Handler for FormatHandler { + fn id(&self) -> &str { + match self { + Self::Plaintext(h) => h.id(), + Self::Csv(h) => h.id(), + Self::Json(h) => h.id(), + #[cfg(feature = "html")] + Self::Html(h) => h.id(), + #[cfg(feature = "pdf")] + Self::Pdf(h) => h.id(), + #[cfg(feature = "docx")] + Self::Docx(h) => h.id(), + #[cfg(feature = "image")] + Self::Image(h) => h.id(), + #[cfg(feature = "xlsx")] + Self::Xlsx(h) => h.id(), + #[cfg(feature = "parquet")] + Self::Parquet(h) => h.id(), + Self::Wav(h) => h.id(), + Self::Mp3(h) => h.id(), + } + } + + fn extensions(&self) -> &[&str] { + match self { + Self::Plaintext(h) => h.extensions(), + Self::Csv(h) => h.extensions(), + Self::Json(h) => h.extensions(), + #[cfg(feature = "html")] + Self::Html(h) => h.extensions(), + #[cfg(feature = "pdf")] + Self::Pdf(h) => h.extensions(), + #[cfg(feature = "docx")] + Self::Docx(h) => h.extensions(), + #[cfg(feature = "image")] + Self::Image(h) => h.extensions(), + #[cfg(feature = "xlsx")] + Self::Xlsx(h) => h.extensions(), + #[cfg(feature = "parquet")] + Self::Parquet(h) => h.extensions(), + Self::Wav(h) => h.extensions(), + Self::Mp3(h) => h.extensions(), + } + } + + fn content_types(&self) -> &[&str] { + match self { + Self::Plaintext(h) => h.content_types(), + Self::Csv(h) => h.content_types(), + Self::Json(h) => h.content_types(), + #[cfg(feature = "html")] + Self::Html(h) => h.content_types(), + #[cfg(feature = "pdf")] + Self::Pdf(h) => h.content_types(), + #[cfg(feature = "docx")] + Self::Docx(h) => h.content_types(), + #[cfg(feature = "image")] + Self::Image(h) => h.content_types(), + #[cfg(feature = "xlsx")] + Self::Xlsx(h) => h.content_types(), + #[cfg(feature = "parquet")] + Self::Parquet(h) => h.content_types(), + Self::Wav(h) => h.content_types(), + Self::Mp3(h) => h.content_types(), + } + } +} + +// -- From impls for each concrete handler -> FormatHandler -- + +impl From<PlaintextHandler> for FormatHandler { + fn from(h: PlaintextHandler) -> Self { Self::Plaintext(h) } +} +impl From<CsvHandler> for FormatHandler { + fn from(h: CsvHandler) -> Self { Self::Csv(h) } +} +impl From<JsonHandler> for FormatHandler { + fn from(h: JsonHandler) -> Self { Self::Json(h) } +} +#[cfg(feature = "html")] +impl From<HtmlHandler> for FormatHandler { + fn from(h: HtmlHandler) -> Self { Self::Html(h) } +} +#[cfg(feature = "pdf")] +impl From<PdfHandler> for FormatHandler { + fn from(h: PdfHandler) -> Self { Self::Pdf(h) } +} +#[cfg(feature = "docx")] 
+impl From<DocxHandler> for FormatHandler { + fn from(h: DocxHandler) -> Self { Self::Docx(h) } +} +#[cfg(feature = "image")] +impl From<ImageHandler> for FormatHandler { + fn from(h: ImageHandler) -> Self { Self::Image(h) } +} +#[cfg(feature = "xlsx")] +impl From<XlsxHandler> for FormatHandler { + fn from(h: XlsxHandler) -> Self { Self::Xlsx(h) } +} +#[cfg(feature = "parquet")] +impl From<ParquetHandler> for FormatHandler { + fn from(h: ParquetHandler) -> Self { Self::Parquet(h) } +} +impl From<WavHandler> for FormatHandler { + fn from(h: WavHandler) -> Self { Self::Wav(h) } +} +impl From<Mp3Handler> for FormatHandler { + fn from(h: Mp3Handler) -> Self { Self::Mp3(h) } +} + +// --------------------------------------------------------------------------- +// Loader traits +// --------------------------------------------------------------------------- + +/// Loader for text-based formats (plain text, CSV, JSON, HTML). +#[async_trait::async_trait] +pub trait TextLoader: Handler { + /// Strongly-typed parameters for this loader. + type Params: Send; + + /// Parse the content into documents. + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error>; +} + +/// Loader for binary document formats (PDF, DOCX) that produce both +/// text documents and extracted images. +#[async_trait::async_trait] +pub trait BinaryLoader: Handler { + /// Strongly-typed parameters for this loader. + type Params: Send; + + /// Parse the content into documents (text pages and extracted images). + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error>; +} + +/// Loader for image formats (PNG, JPEG, TIFF, etc.). +#[async_trait::async_trait] +pub trait ImageLoader: Handler { + /// Strongly-typed parameters for this loader. + type Params: Send; + + /// Decode the content into image documents. + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error>; +} + +/// Loader for spreadsheet/tabular formats (XLSX, Parquet). +#[async_trait::async_trait] +pub trait SpreadsheetLoader: Handler { + /// Strongly-typed parameters for this loader. + type Params: Send; + + /// Parse the content into tabular documents. + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error>; +} + +/// Loader for audio formats (WAV, MP3). +#[async_trait::async_trait] +pub trait AudioLoader: Handler { + /// Strongly-typed parameters for this loader. + type Params: Send; + + /// Process the audio content. + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error>; +} diff --git a/crates/nvisy-ingest/src/image/image.rs b/crates/nvisy-ingest/src/image/image.rs new file mode 100644 index 0000000..0fac9fc --- /dev/null +++ b/crates/nvisy-ingest/src/image/image.rs @@ -0,0 +1,58 @@ +//! Image file loader using the `image` crate. + +use bytes::Bytes; +use serde::Deserialize; + +use nvisy_core::io::ContentData; +use nvisy_core::error::{Error, ErrorKind}; + +use crate::document::Document; +use crate::handler::{ImageHandler as ImageHandlerType, FormatHandler, ImageLoader}; + +/// Typed parameters for [`ImageFileLoader`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ImageLoaderParams {} + +/// Decodes image files and returns a [`Document`] with binary data and dimensions. 
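
// A minimal sketch of driving one of the loader traits defined above.
// `CsvLoader` (added further down in this diff) implements `TextLoader` with
// `Params = ()`. How a `ContentData` value is produced is outside this diff,
// so the hypothetical helper `load_csv` simply borrows one from its caller.
use nvisy_core::error::Error;
use nvisy_core::io::ContentData;
use nvisy_ingest::prelude::{CsvLoader, Document, FormatHandler, TextLoader};

async fn load_csv(content: &ContentData) -> Result<Vec<Document<FormatHandler>>, Error> {
    // Dispatch through the trait; the unit params type needs no configuration.
    CsvLoader.load(content, &()).await
}
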
+pub struct ImageFileLoader; + +impl Clone for ImageFileLoader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl ImageLoader for ImageFileLoader { + type Params = ImageLoaderParams; + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let raw = content.to_bytes(); + let img = image::load_from_memory(&raw).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("Image decode failed: {e}")) + })?; + + let width = img.width(); + let height = img.height(); + + let mime_type = content + .content_type() + .unwrap_or("image/png") + .to_string(); + + let mut doc = Document::new(ImageHandlerType) + .with_data(Bytes::copy_from_slice(&raw), mime_type) + .with_dimensions(width, height); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for ImageFileLoader { + fn id(&self) -> &str { ImageHandlerType.id() } + fn extensions(&self) -> &[&str] { ImageHandlerType.extensions() } + fn content_types(&self) -> &[&str] { ImageHandlerType.content_types() } +} diff --git a/crates/nvisy-ingest/src/image/mod.rs b/crates/nvisy-ingest/src/image/mod.rs new file mode 100644 index 0000000..ebcc729 --- /dev/null +++ b/crates/nvisy-ingest/src/image/mod.rs @@ -0,0 +1,4 @@ +//! Image file loaders. + +#[cfg(feature = "image")] +pub mod image; diff --git a/crates/nvisy-ingest/src/lib.rs b/crates/nvisy-ingest/src/lib.rs index cd4b052..cde5e1a 100644 --- a/crates/nvisy-ingest/src/lib.rs +++ b/crates/nvisy-ingest/src/lib.rs @@ -2,8 +2,14 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -/// File-format loaders. -pub mod loaders; +pub mod handler; +pub mod document; +pub mod element; +pub mod text; +pub mod binary; +pub mod image; +pub mod tabular; +pub mod audio; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ingest/src/loaders/audio_loader.rs b/crates/nvisy-ingest/src/loaders/audio_loader.rs deleted file mode 100644 index 535e706..0000000 --- a/crates/nvisy-ingest/src/loaders/audio_loader.rs +++ /dev/null @@ -1,58 +0,0 @@ -//! Placeholder audio file loader. -//! -//! Returns a document with metadata only — audio redaction is not yet implemented. - -use serde::Deserialize; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::error::Error; -use super::{Loader, LoaderOutput}; - -/// Typed parameters for [`AudioLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct AudioLoaderParams {} - -/// Placeholder loader for audio files. Returns a metadata-only document. -pub struct AudioLoader; - -#[async_trait::async_trait] -impl Loader for AudioLoader { - type Params = AudioLoaderParams; - - fn id(&self) -> &str { - "audio" - } - - fn extensions(&self) -> &[&str] { - &["mp3", "wav", "flac", "ogg", "m4a"] - } - - fn content_types(&self) -> &[&str] { - &[ - "audio/mpeg", - "audio/wav", - "audio/flac", - "audio/ogg", - "audio/mp4", - ] - } - - async fn load( - &self, - blob: &Blob, - _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let content_type = blob.content_type().unwrap_or("audio/unknown").to_string(); - let size = blob.content.len(); - - let doc = Document::new(format!( - "[Audio file: type={}, size={} bytes. 
Audio redaction not yet implemented.]", - content_type, size - )) - .with_source_format("audio"); - - Ok(vec![LoaderOutput::Document(doc)]) - } -} diff --git a/crates/nvisy-ingest/src/loaders/csv_loader.rs b/crates/nvisy-ingest/src/loaders/csv_loader.rs deleted file mode 100644 index f9c4a3c..0000000 --- a/crates/nvisy-ingest/src/loaders/csv_loader.rs +++ /dev/null @@ -1,44 +0,0 @@ -//! CSV file loader. - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::error::Error; -use super::{Loader, LoaderOutput}; - -/// Loads CSV blobs into a single [`Document`] containing the raw CSV text. -/// -/// The loader validates that the blob content is valid UTF-8 and tags the -/// resulting document with `source_format = "csv"`. It handles the `text/csv` -/// content type and `.csv` file extension. -pub struct CsvLoader; - -#[async_trait::async_trait] -impl Loader for CsvLoader { - type Params = (); - - fn id(&self) -> &str { - "csv" - } - - fn extensions(&self) -> &[&str] { - &["csv"] - } - - fn content_types(&self) -> &[&str] { - &["text/csv"] - } - - async fn load( - &self, - blob: &Blob, - _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - Error::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") - })?; - let mut doc = Document::new(content); - doc.source_format = Some("csv".to_string()); - doc.data.parent_id = Some(blob.data.id); - Ok(vec![LoaderOutput::Document(doc)]) - } -} diff --git a/crates/nvisy-ingest/src/loaders/image_loader.rs b/crates/nvisy-ingest/src/loaders/image_loader.rs deleted file mode 100644 index 468aa87..0000000 --- a/crates/nvisy-ingest/src/loaders/image_loader.rs +++ /dev/null @@ -1,67 +0,0 @@ -//! Image file loader using the `image` crate. - -use bytes::Bytes; -use serde::Deserialize; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::ImageData; -use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; - -/// Typed parameters for [`ImageLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ImageLoaderParams {} - -/// Decodes image files and returns an [`ImageData`] with dimensions. -pub struct ImageLoader; - -#[async_trait::async_trait] -impl Loader for ImageLoader { - type Params = ImageLoaderParams; - - fn id(&self) -> &str { - "image" - } - - fn extensions(&self) -> &[&str] { - &["jpg", "jpeg", "png", "tiff", "bmp", "webp"] - } - - fn content_types(&self) -> &[&str] { - &[ - "image/jpeg", - "image/png", - "image/tiff", - "image/bmp", - "image/webp", - ] - } - - async fn load( - &self, - blob: &Blob, - _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let img = image::load_from_memory(&blob.content).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Image decode failed: {e}")) - })?; - - let width = img.width(); - let height = img.height(); - - // Detect MIME type from blob or infer - let mime_type = blob - .content_type() - .unwrap_or("image/png") - .to_string(); - - let image_data = ImageData::new( - Bytes::copy_from_slice(&blob.content), - mime_type, - ) - .with_dimensions(width, height); - - Ok(vec![LoaderOutput::Image(image_data)]) - } -} diff --git a/crates/nvisy-ingest/src/loaders/json_loader.rs b/crates/nvisy-ingest/src/loaders/json_loader.rs deleted file mode 100644 index 68c5490..0000000 --- a/crates/nvisy-ingest/src/loaders/json_loader.rs +++ /dev/null @@ -1,48 +0,0 @@ -//! JSON file loader. 
- -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::error::Error; -use super::{Loader, LoaderOutput}; - -/// Loads JSON blobs into a single [`Document`] containing the raw JSON text. -/// -/// The loader validates that the blob content is valid UTF-8 **and** valid JSON -/// before producing the document. It handles the `application/json` content type -/// and `.json` file extension. -pub struct JsonLoader; - -#[async_trait::async_trait] -impl Loader for JsonLoader { - type Params = (); - - fn id(&self) -> &str { - "json" - } - - fn extensions(&self) -> &[&str] { - &["json"] - } - - fn content_types(&self) -> &[&str] { - &["application/json"] - } - - async fn load( - &self, - blob: &Blob, - _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - Error::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") - })?; - // Validate it's valid JSON - let _: serde_json::Value = serde_json::from_str(&content).map_err(|e| { - Error::validation(format!("Invalid JSON: {}", e), "json-loader") - })?; - let mut doc = Document::new(content); - doc.source_format = Some("json".to_string()); - doc.data.parent_id = Some(blob.data.id); - Ok(vec![LoaderOutput::Document(doc)]) - } -} diff --git a/crates/nvisy-ingest/src/loaders/mod.rs b/crates/nvisy-ingest/src/loaders/mod.rs deleted file mode 100644 index 1ee9d45..0000000 --- a/crates/nvisy-ingest/src/loaders/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! File-format loaders for multimodal document ingestion. - -use serde::de::DeserializeOwned; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::datatypes::document::ImageData; -use nvisy_core::datatypes::document::TabularData; -use nvisy_core::error::Error; - -/// Output of a loader -- either a parsed document, an extracted image, or tabular data. -pub enum LoaderOutput { - /// A successfully parsed text document. - Document(Document), - /// An extracted or decoded image. - Image(ImageData), - /// Tabular data extracted from a spreadsheet or data file. - Tabular(TabularData), -} - -/// Converts raw [`Blob`] content into structured [`Document`]s or [`ImageData`]. -/// -/// Loaders declare which file extensions and MIME types they support. -/// The engine selects the appropriate loader based on the blob's -/// content type and extension. -#[async_trait::async_trait] -pub trait Loader: Send + Sync + 'static { - /// Strongly-typed parameters for this loader. - type Params: DeserializeOwned + Send; - - /// Unique identifier for this loader (e.g. `"csv"`, `"pdf"`). - fn id(&self) -> &str; - /// File extensions this loader handles (e.g. `["csv", "tsv"]`). - fn extensions(&self) -> &[&str]; - /// MIME types this loader handles (e.g. `["text/csv"]`). - fn content_types(&self) -> &[&str]; - - /// Parse the blob and return one or more documents or images. - async fn load( - &self, - blob: &Blob, - params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error>; -} - -/// Loader for CSV files. -pub mod csv_loader; -/// Loader for JSON files. -pub mod json_loader; -/// Loader for plain-text files. -pub mod plaintext; - -/// Loader for PDF files. -#[cfg(feature = "pdf")] -pub mod pdf_loader; -/// Loader for DOCX (Office Open XML) files. -#[cfg(feature = "docx")] -pub mod docx_loader; -/// Loader for HTML files. -#[cfg(feature = "html")] -pub mod html_loader; -/// Loader for image files (PNG, JPEG, TIFF, etc.). 
-#[cfg(feature = "image")] -pub mod image_loader; -/// Loader for Apache Parquet files. -#[cfg(feature = "parquet")] -pub mod parquet_loader; -/// Loader for Excel XLSX/XLS files. -#[cfg(feature = "xlsx")] -pub mod xlsx_loader; -/// Placeholder loader for audio files. -pub mod audio_loader; diff --git a/crates/nvisy-ingest/src/loaders/plaintext.rs b/crates/nvisy-ingest/src/loaders/plaintext.rs deleted file mode 100644 index 9222d4e..0000000 --- a/crates/nvisy-ingest/src/loaders/plaintext.rs +++ /dev/null @@ -1,47 +0,0 @@ -//! Plain-text file loader. - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::error::Error; -use super::{Loader, LoaderOutput}; - -/// Loads plain-text blobs into a single [`Document`]. -/// -/// The loader validates that the blob content is valid UTF-8 and tags the -/// resulting document with `source_format = "txt"`. It handles the `text/plain` -/// content type and `.txt` / `.text` file extensions. -pub struct PlaintextLoader; - -#[async_trait::async_trait] -impl Loader for PlaintextLoader { - type Params = (); - - fn id(&self) -> &str { - "plaintext" - } - - fn extensions(&self) -> &[&str] { - &["txt", "text"] - } - - fn content_types(&self) -> &[&str] { - &["text/plain"] - } - - async fn load( - &self, - blob: &Blob, - _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let content = String::from_utf8(blob.content.to_vec()).map_err(|e| { - Error::validation( - format!("Invalid UTF-8 in plaintext: {}", e), - "plaintext-loader", - ) - })?; - let mut doc = Document::new(content); - doc.source_format = Some("txt".to_string()); - doc.data.parent_id = Some(blob.data.id); - Ok(vec![LoaderOutput::Document(doc)]) - } -} diff --git a/crates/nvisy-ingest/src/prelude.rs b/crates/nvisy-ingest/src/prelude.rs index caf6edb..0ad5b84 100644 --- a/crates/nvisy-ingest/src/prelude.rs +++ b/crates/nvisy-ingest/src/prelude.rs @@ -1,19 +1,43 @@ //! Convenience re-exports. 
-pub use crate::loaders::csv_loader::CsvLoader; -pub use crate::loaders::json_loader::JsonLoader; -pub use crate::loaders::plaintext::PlaintextLoader; +pub use crate::handler::{ + Handler, FormatHandler, + PlaintextHandler, CsvHandler, JsonHandler, + WavHandler, Mp3Handler, + TextLoader, BinaryLoader, ImageLoader, SpreadsheetLoader, AudioLoader, +}; +#[cfg(feature = "html")] +pub use crate::handler::HtmlHandler; #[cfg(feature = "pdf")] -pub use crate::loaders::pdf_loader::PdfLoader; +pub use crate::handler::PdfHandler; #[cfg(feature = "docx")] -pub use crate::loaders::docx_loader::DocxLoader; -#[cfg(feature = "html")] -pub use crate::loaders::html_loader::HtmlLoader; +pub use crate::handler::DocxHandler; #[cfg(feature = "image")] -pub use crate::loaders::image_loader::ImageLoader; +pub use crate::handler::ImageHandler; +#[cfg(feature = "xlsx")] +pub use crate::handler::XlsxHandler; #[cfg(feature = "parquet")] -pub use crate::loaders::parquet_loader::ParquetLoader; +pub use crate::handler::ParquetHandler; + +pub use crate::document::Document; +pub use crate::element::{Element, ElementCategory, ElementType}; + +pub use crate::text::csv::CsvLoader; +pub use crate::text::json::JsonLoader; +pub use crate::text::plaintext::PlaintextLoader; + +#[cfg(feature = "html")] +pub use crate::text::html::HtmlLoader; +#[cfg(feature = "pdf")] +pub use crate::binary::pdf::PdfLoader; +#[cfg(feature = "docx")] +pub use crate::binary::docx::DocxLoader; +#[cfg(feature = "image")] +pub use crate::image::image::ImageFileLoader; #[cfg(feature = "xlsx")] -pub use crate::loaders::xlsx_loader::XlsxLoader; -pub use crate::loaders::audio_loader::AudioLoader; +pub use crate::tabular::xlsx::XlsxLoader; +#[cfg(feature = "parquet")] +pub use crate::tabular::parquet::ParquetLoader; +pub use crate::audio::wav::WavLoader; +pub use crate::audio::mp3::Mp3Loader; diff --git a/crates/nvisy-ingest/src/tabular/mod.rs b/crates/nvisy-ingest/src/tabular/mod.rs new file mode 100644 index 0000000..d66c51b --- /dev/null +++ b/crates/nvisy-ingest/src/tabular/mod.rs @@ -0,0 +1,7 @@ +//! Tabular/spreadsheet file loaders (XLSX, Parquet). + +#[cfg(feature = "xlsx")] +pub mod xlsx; + +#[cfg(feature = "parquet")] +pub mod parquet; diff --git a/crates/nvisy-ingest/src/loaders/parquet_loader.rs b/crates/nvisy-ingest/src/tabular/parquet.rs similarity index 73% rename from crates/nvisy-ingest/src/loaders/parquet_loader.rs rename to crates/nvisy-ingest/src/tabular/parquet.rs index 3117eb4..d977241 100644 --- a/crates/nvisy-ingest/src/loaders/parquet_loader.rs +++ b/crates/nvisy-ingest/src/tabular/parquet.rs @@ -1,13 +1,13 @@ //! Apache Parquet file loader. -use bytes::Bytes; use serde::Deserialize; use std::sync::Arc; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_core::io::ContentData; use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; + +use crate::document::Document; +use crate::handler::{ParquetHandler, PlaintextHandler, FormatHandler, SpreadsheetLoader}; use arrow::array::{Array, RecordBatchReader}; use arrow::record_batch::RecordBatch; @@ -26,28 +26,20 @@ pub struct ParquetLoaderParams { /// for regex/dictionary scanning. 
pub struct ParquetLoader; +impl Clone for ParquetLoader { + fn clone(&self) -> Self { Self } +} + #[async_trait::async_trait] -impl Loader for ParquetLoader { +impl SpreadsheetLoader for ParquetLoader { type Params = ParquetLoaderParams; - fn id(&self) -> &str { - "parquet" - } - - fn extensions(&self) -> &[&str] { - &["parquet"] - } - - fn content_types(&self) -> &[&str] { - &["application/x-parquet"] - } - async fn load( &self, - blob: &Blob, + content: &ContentData, params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let data = Bytes::copy_from_slice(&blob.content); + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let data = bytes::Bytes::copy_from_slice(&content.to_bytes()); let builder = ParquetRecordBatchReaderBuilder::try_new(data) .map_err(|e| { Error::new(ErrorKind::Runtime, format!("Parquet open failed: {e}")) @@ -94,8 +86,9 @@ impl Loader for ParquetLoader { } } - let tabular = TabularData::new(columns.clone(), all_rows.clone()) - .with_source_format("parquet"); + let mut tabular_doc = Document::new(ParquetHandler) + .with_tabular(columns, all_rows.clone()); + tabular_doc.source.set_parent_id(Some(content.content_source.as_uuid())); // Flatten to text for regex/dictionary scanning let mut text_parts = Vec::new(); @@ -103,12 +96,11 @@ impl Loader for ParquetLoader { text_parts.push(row.join("\t")); } let flat_text = text_parts.join("\n"); - let doc = Document::new(flat_text).with_source_format("parquet"); + let mut text_doc = Document::new(PlaintextHandler) + .with_text(flat_text); + text_doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![ - LoaderOutput::Tabular(tabular), - LoaderOutput::Document(doc), - ]) + Ok(vec![tabular_doc.into_format(), text_doc.into_format()]) } } @@ -117,7 +109,6 @@ fn array_value_to_string(array: &dyn Array, index: usize) -> String { return String::new(); } - // Use Arrow's display formatting use std::fmt::Write; let mut buf = String::new(); let formatter = arrow::util::display::ArrayFormatter::try_new(array, &Default::default()); @@ -129,3 +120,9 @@ fn array_value_to_string(array: &dyn Array, index: usize) -> String { Err(_) => String::new(), } } + +impl crate::handler::Handler for ParquetLoader { + fn id(&self) -> &str { ParquetHandler.id() } + fn extensions(&self) -> &[&str] { ParquetHandler.extensions() } + fn content_types(&self) -> &[&str] { ParquetHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/loaders/xlsx_loader.rs b/crates/nvisy-ingest/src/tabular/xlsx.rs similarity index 68% rename from crates/nvisy-ingest/src/loaders/xlsx_loader.rs rename to crates/nvisy-ingest/src/tabular/xlsx.rs index 5fa6cf6..f446d4d 100644 --- a/crates/nvisy-ingest/src/loaders/xlsx_loader.rs +++ b/crates/nvisy-ingest/src/tabular/xlsx.rs @@ -3,10 +3,11 @@ use serde::Deserialize; use std::io::Cursor; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_core::io::ContentData; use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; + +use crate::document::Document; +use crate::handler::{XlsxHandler, PlaintextHandler, FormatHandler, SpreadsheetLoader}; use calamine::{Reader, open_workbook_auto_from_rs}; @@ -26,37 +27,26 @@ pub struct XlsxLoaderParams { /// text document for regex/dictionary scanning. 
pub struct XlsxLoader; +impl Clone for XlsxLoader { + fn clone(&self) -> Self { Self } +} + #[async_trait::async_trait] -impl Loader for XlsxLoader { +impl SpreadsheetLoader for XlsxLoader { type Params = XlsxLoaderParams; - fn id(&self) -> &str { - "xlsx" - } - - fn extensions(&self) -> &[&str] { - &["xlsx", "xls"] - } - - fn content_types(&self) -> &[&str] { - &[ - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "application/vnd.ms-excel", - ] - } - async fn load( &self, - blob: &Blob, + content: &ContentData, params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let cursor = Cursor::new(blob.content.to_vec()); + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let cursor = Cursor::new(content.to_bytes().to_vec()); let mut workbook = open_workbook_auto_from_rs(cursor).map_err(|e| { Error::new(ErrorKind::Runtime, format!("XLSX open failed: {e}")) })?; let sheet_names: Vec<String> = workbook.sheet_names().to_vec(); - let mut outputs = Vec::new(); + let mut documents = Vec::new(); let mut all_text_parts = Vec::new(); for sheet_name in &sheet_names { @@ -97,20 +87,27 @@ impl Loader for XlsxLoader { rows.push(row_data); } - let tabular = TabularData::new(columns, rows) - .with_source_format("xlsx") + let mut tabular_doc = Document::new(XlsxHandler) + .with_tabular(columns, rows) .with_sheet_name(sheet_name); - - outputs.push(LoaderOutput::Tabular(tabular)); + tabular_doc.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(tabular_doc.into_format()); } // Create a flattened document for regex/dictionary scanning if !all_text_parts.is_empty() { - let doc = Document::new(all_text_parts.join("\n")) - .with_source_format("xlsx"); - outputs.push(LoaderOutput::Document(doc)); + let mut doc = Document::new(PlaintextHandler) + .with_text(all_text_parts.join("\n")); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + documents.push(doc.into_format()); } - Ok(outputs) + Ok(documents) } } + +impl crate::handler::Handler for XlsxLoader { + fn id(&self) -> &str { XlsxHandler.id() } + fn extensions(&self) -> &[&str] { XlsxHandler.extensions() } + fn content_types(&self) -> &[&str] { XlsxHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/text/csv.rs b/crates/nvisy-ingest/src/text/csv.rs new file mode 100644 index 0000000..17d9b01 --- /dev/null +++ b/crates/nvisy-ingest/src/text/csv.rs @@ -0,0 +1,38 @@ +//! CSV file loader. + +use nvisy_core::io::ContentData; +use nvisy_core::error::Error; + +use crate::document::Document; +use crate::handler::{CsvHandler, FormatHandler, TextLoader}; + +/// Loads CSV content into a single [`Document`] containing the raw CSV text. 
+pub struct CsvLoader; + +impl Clone for CsvLoader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl TextLoader for CsvLoader { + type Params = (); + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { + Error::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") + })?; + let mut doc = Document::new(CsvHandler).with_text(text); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for CsvLoader { + fn id(&self) -> &str { CsvHandler.id() } + fn extensions(&self) -> &[&str] { CsvHandler.extensions() } + fn content_types(&self) -> &[&str] { CsvHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/loaders/html_loader.rs b/crates/nvisy-ingest/src/text/html.rs similarity index 78% rename from crates/nvisy-ingest/src/loaders/html_loader.rs rename to crates/nvisy-ingest/src/text/html.rs index 1c933f7..574203b 100644 --- a/crates/nvisy-ingest/src/loaders/html_loader.rs +++ b/crates/nvisy-ingest/src/text/html.rs @@ -2,10 +2,12 @@ use serde::Deserialize; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, Element, ElementType}; +use nvisy_core::io::ContentData; use nvisy_core::error::{Error, ErrorKind}; -use super::{Loader, LoaderOutput}; + +use crate::document::Document; +use crate::element::{Element, ElementType}; +use crate::handler::{HtmlHandler, FormatHandler, TextLoader}; /// Typed parameters for [`HtmlLoader`]. #[derive(Debug, Deserialize)] @@ -15,28 +17,20 @@ pub struct HtmlLoaderParams {} /// Extracts text and structural elements from HTML documents. pub struct HtmlLoader; +impl Clone for HtmlLoader { + fn clone(&self) -> Self { Self } +} + #[async_trait::async_trait] -impl Loader for HtmlLoader { +impl TextLoader for HtmlLoader { type Params = HtmlLoaderParams; - fn id(&self) -> &str { - "html" - } - - fn extensions(&self) -> &[&str] { - &["html", "htm"] - } - - fn content_types(&self) -> &[&str] { - &["text/html"] - } - async fn load( &self, - blob: &Blob, + content: &ContentData, _params: &Self::Params, - ) -> Result<Vec<LoaderOutput>, Error> { - let html_str = String::from_utf8(blob.content.to_vec()).map_err(|e| { + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let html_str = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { Error::new(ErrorKind::Runtime, format!("HTML is not valid UTF-8: {e}")) })?; @@ -96,10 +90,17 @@ impl Loader for HtmlLoader { } } - let doc = Document::new(full_text) + let doc = Document::new(HtmlHandler) + .with_text(full_text) .with_elements(elements) - .with_source_format("html"); + .into_format(); - Ok(vec![LoaderOutput::Document(doc)]) + Ok(vec![doc]) } } + +impl crate::handler::Handler for HtmlLoader { + fn id(&self) -> &str { HtmlHandler.id() } + fn extensions(&self) -> &[&str] { HtmlHandler.extensions() } + fn content_types(&self) -> &[&str] { HtmlHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/text/json.rs b/crates/nvisy-ingest/src/text/json.rs new file mode 100644 index 0000000..2576347 --- /dev/null +++ b/crates/nvisy-ingest/src/text/json.rs @@ -0,0 +1,42 @@ +//! JSON file loader. 
+ +use nvisy_core::io::ContentData; +use nvisy_core::error::Error; + +use crate::document::Document; +use crate::handler::{JsonHandler, FormatHandler, TextLoader}; + +/// Loads JSON content into a single [`Document`] containing the raw JSON text. +pub struct JsonLoader; + +impl Clone for JsonLoader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl TextLoader for JsonLoader { + type Params = (); + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { + Error::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") + })?; + // Validate it's valid JSON + let _: serde_json::Value = serde_json::from_str(&text).map_err(|e| { + Error::validation(format!("Invalid JSON: {}", e), "json-loader") + })?; + let mut doc = Document::new(JsonHandler).with_text(text); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for JsonLoader { + fn id(&self) -> &str { JsonHandler.id() } + fn extensions(&self) -> &[&str] { JsonHandler.extensions() } + fn content_types(&self) -> &[&str] { JsonHandler.content_types() } +} diff --git a/crates/nvisy-ingest/src/text/mod.rs b/crates/nvisy-ingest/src/text/mod.rs new file mode 100644 index 0000000..fdb9ae1 --- /dev/null +++ b/crates/nvisy-ingest/src/text/mod.rs @@ -0,0 +1,8 @@ +//! Text-based file loaders (CSV, JSON, plaintext, HTML). + +pub mod csv; +pub mod json; +pub mod plaintext; + +#[cfg(feature = "html")] +pub mod html; diff --git a/crates/nvisy-ingest/src/text/plaintext.rs b/crates/nvisy-ingest/src/text/plaintext.rs new file mode 100644 index 0000000..c55b429 --- /dev/null +++ b/crates/nvisy-ingest/src/text/plaintext.rs @@ -0,0 +1,41 @@ +//! Plain-text file loader. + +use nvisy_core::io::ContentData; +use nvisy_core::error::Error; + +use crate::document::Document; +use crate::handler::{PlaintextHandler, FormatHandler, TextLoader}; + +/// Loads plain-text content into a single [`Document`]. 
+pub struct PlaintextLoader; + +impl Clone for PlaintextLoader { + fn clone(&self) -> Self { Self } +} + +#[async_trait::async_trait] +impl TextLoader for PlaintextLoader { + type Params = (); + + async fn load( + &self, + content: &ContentData, + _params: &Self::Params, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { + Error::validation( + format!("Invalid UTF-8 in plaintext: {}", e), + "plaintext-loader", + ) + })?; + let mut doc = Document::new(PlaintextHandler).with_text(text); + doc.source.set_parent_id(Some(content.content_source.as_uuid())); + Ok(vec![doc.into_format()]) + } +} + +impl crate::handler::Handler for PlaintextLoader { + fn id(&self) -> &str { PlaintextHandler.id() } + fn extensions(&self) -> &[&str] { PlaintextHandler.extensions() } + fn content_types(&self) -> &[&str] { PlaintextHandler.content_types() } +} diff --git a/crates/nvisy-media/Cargo.toml b/crates/nvisy-media/Cargo.toml deleted file mode 100644 index b93c7fd..0000000 --- a/crates/nvisy-media/Cargo.toml +++ /dev/null @@ -1,49 +0,0 @@ -# https://doc.rust-lang.org/cargo/reference/manifest.html - -[package] -name = "nvisy-media" -description = "Pixel-level image redaction, tabular redaction, and PDF reassembly for Nvisy" -keywords = ["nvisy", "media", "redaction", "image", "pdf"] -categories = ["multimedia::images"] - -version = { workspace = true } -rust-version = { workspace = true } -edition = { workspace = true } -license = { workspace = true } -publish = { workspace = true } - -authors = { workspace = true } -repository = { workspace = true } -homepage = { workspace = true } -documentation = { workspace = true } - -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[dependencies] -# Internal crates -nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true } - -# (De)serialization -serde = { workspace = true, features = ["derive"] } -serde_json = { workspace = true, features = [] } - -# Async runtime -tokio = { workspace = true, features = ["sync"] } -async-trait = { workspace = true, features = [] } - -# Primitive datatypes -uuid = { workspace = true, features = ["v4"] } -bytes = { workspace = true } - -# Image processing -image = { workspace = true } -imageproc = { workspace = true } - -# PDF manipulation -lopdf = { workspace = true } - -# Observability -tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-media/README.md b/crates/nvisy-media/README.md deleted file mode 100644 index df45d19..0000000 --- a/crates/nvisy-media/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# nvisy-media - -Pixel-level image redaction, tabular redaction, and PDF reassembly. - -This crate provides media processing actions for the Nvisy redaction -pipeline, including image blur/block overlays, tabular cell redaction, -and PDF content stream replacement. diff --git a/crates/nvisy-media/src/actions/apply_audio_redaction.rs b/crates/nvisy-media/src/actions/apply_audio_redaction.rs deleted file mode 100644 index 7c185ae..0000000 --- a/crates/nvisy-media/src/actions/apply_audio_redaction.rs +++ /dev/null @@ -1,54 +0,0 @@ -//! Placeholder audio redaction action. - -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Typed parameters for [`ApplyAudioRedactionAction`]. 
-#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyAudioRedactionParams { - /// Time segments to mute, as `(start_seconds, end_seconds)` pairs. - #[serde(default)] - pub mute_segments: Vec<(f64, f64)>, -} - -/// Placeholder action for audio redaction. -/// -/// Returns a runtime error indicating audio redaction is not yet implemented. -pub struct ApplyAudioRedactionAction; - -#[async_trait::async_trait] -impl Action for ApplyAudioRedactionAction { - type Params = ApplyAudioRedactionParams; - - fn id(&self) -> &str { - "apply-audio-redaction" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - // Pass through blobs unchanged — audio redaction is not implemented - while let Some(blob) = input.recv().await { - tracing::warn!("Audio redaction not yet implemented, passing through unchanged"); - if output.send(blob).await.is_err() { - return Err(Error::new( - ErrorKind::Runtime, - "Audio redaction not yet implemented", - )); - } - } - Ok(0) - } -} diff --git a/crates/nvisy-media/src/actions/apply_image_redaction.rs b/crates/nvisy-media/src/actions/apply_image_redaction.rs deleted file mode 100644 index d7e4e93..0000000 --- a/crates/nvisy-media/src/actions/apply_image_redaction.rs +++ /dev/null @@ -1,159 +0,0 @@ -//! Image redaction action — applies blur or block overlay to image regions. - -use bytes::Bytes; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::ImageData; -use nvisy_ontology::ontology::entity::{BoundingBox, Entity}; -use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -use crate::render::{blur, block}; - -/// Typed parameters for [`ApplyImageRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyImageRedactionParams { - /// Sigma value for gaussian blur. - #[serde(default = "default_sigma")] - pub blur_sigma: f32, - /// RGBA color for block overlays. - #[serde(default = "default_color")] - pub block_color: [u8; 4], -} - -fn default_sigma() -> f32 { - 15.0 -} -fn default_color() -> [u8; 4] { - [0, 0, 0, 255] -} - -/// Applies blur or block redaction to image regions identified by entities -/// with bounding boxes. 
-pub struct ApplyImageRedactionAction; - -#[async_trait::async_trait] -impl Action for ApplyImageRedactionAction { - type Params = ApplyImageRedactionParams; - - fn id(&self) -> &str { - "apply-image-redaction" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read images: {e}")) - })?; - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities: {e}")) - })?; - let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read redactions: {e}")) - })?; - - // Build entity->redaction map - let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions - .iter() - .filter(|r| !r.applied) - .map(|r| (r.entity_id, r)) - .collect(); - - // Collect entities with bounding boxes, grouped by redaction method - let mut blur_regions: Vec<BoundingBox> = Vec::new(); - let mut block_regions: Vec<BoundingBox> = Vec::new(); - - for entity in &entities { - if let Some(bbox) = &entity.location.bounding_box { - if let Some(redaction) = redaction_map.get(&entity.data.id) { - match redaction.method { - RedactionMethod::Blur => blur_regions.push(bbox.clone()), - RedactionMethod::Block => block_regions.push(bbox.clone()), - // Default non-image methods to block for images - _ => block_regions.push(bbox.clone()), - } - } - } - } - - if !blur_regions.is_empty() || !block_regions.is_empty() { - // Process each image - let mut new_images = Vec::new(); - for img in &images { - let dyn_img = image::load_from_memory(&img.image_data).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) - })?; - - let mut result = dyn_img; - if !blur_regions.is_empty() { - result = blur::apply_gaussian_blur(&result, &blur_regions, params.blur_sigma); - } - if !block_regions.is_empty() { - let color = image::Rgba(params.block_color); - result = block::apply_block_overlay(&result, &block_regions, color); - } - - // Encode back to PNG - let mut buf = std::io::Cursor::new(Vec::new()); - result - .write_to(&mut buf, image::ImageFormat::Png) - .map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) - })?; - - let new_img = ImageData::new( - Bytes::from(buf.into_inner()), - "image/png", - ) - .with_dimensions(result.width(), result.height()); - - new_images.push(new_img); - count += 1; - } - - // Replace images artifact - blob.artifacts.remove("images"); - for img in &new_images { - blob.add_artifact("images", img).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add image: {e}")) - })?; - } - - // Mark redactions as applied - let mut updated_redactions: Vec<Redaction> = redactions.clone(); - for r in &mut updated_redactions { - if redaction_map.contains_key(&r.entity_id) { - r.applied = true; - } - } - blob.artifacts.remove("redactions"); - for r in &updated_redactions { - blob.add_artifact("redactions", r).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add redaction: {e}")) - })?; - } - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} diff --git 
a/crates/nvisy-media/src/actions/apply_pdf_redaction.rs b/crates/nvisy-media/src/actions/apply_pdf_redaction.rs deleted file mode 100644 index 7593382..0000000 --- a/crates/nvisy-media/src/actions/apply_pdf_redaction.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! PDF reassembly action — writes redacted content back to PDF bytes. - -use bytes::Bytes; -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, ImageData}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Typed parameters for [`ApplyPdfRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyPdfRedactionParams {} - -/// Reassembles redacted text and images back into the original PDF. -/// -/// Uses `lopdf` to: -/// 1. Replace PDF content streams with redacted text. -/// 2. Replace embedded image XObjects with redacted image data. -/// 3. Write the modified PDF back to `blob.content`. -pub struct ApplyPdfRedactionAction; - -#[async_trait::async_trait] -impl Action for ApplyPdfRedactionAction { - type Params = ApplyPdfRedactionParams; - - fn id(&self) -> &str { - "apply-pdf-redaction" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let _documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read documents: {e}")) - })?; - let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read images: {e}")) - })?; - - // Only process if the blob is actually a PDF - let is_pdf = blob - .content_type() - .map(|ct| ct == "application/pdf") - .unwrap_or(false); - - if !is_pdf { - if output.send(blob).await.is_err() { - return Ok(count); - } - continue; - } - - let mut pdf_doc = lopdf::Document::load_mem(&blob.content).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF load failed: {e}")) - })?; - - // Replace embedded image XObjects with redacted versions - if !images.is_empty() { - let pages: Vec<(u32, lopdf::ObjectId)> = - pdf_doc.get_pages().into_iter().collect(); - let mut image_idx = 0; - - for (_page_num, page_id) in &pages { - let (resources_opt, _) = match pdf_doc.get_page_resources(*page_id) { - Ok(r) => r, - Err(_) => continue, - }; - - let resources = match resources_opt { - Some(res) => res.clone(), - None => continue, - }; - - let xobject_obj = match resources.get(b"XObject") { - Ok(obj) => obj.clone(), - Err(_) => continue, - }; - - let xobjects = match pdf_doc.dereference(&xobject_obj) { - Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(), - _ => continue, - }; - - for (_name, obj_ref) in xobjects.iter() { - let stream_id = match obj_ref { - lopdf::Object::Reference(id) => Some(*id), - _ => None, - }; - - let is_image = match pdf_doc.dereference(obj_ref) { - Ok((_, lopdf::Object::Stream(s))) => s - .dict - .get(b"Subtype") - .ok() - .and_then(|st| { - if let lopdf::Object::Name(n) = st { - Some(n.as_slice() == b"Image") - } else { - None - } - }) - .unwrap_or(false), - _ => false, - }; - - if is_image { - if let (Some(sid), Some(redacted_img)) = - (stream_id, images.get(image_idx)) - { - let new_stream = lopdf::Stream::new( - 
lopdf::Dictionary::new(), - redacted_img.image_data.to_vec(), - ); - pdf_doc - .objects - .insert(sid, lopdf::Object::Stream(new_stream)); - image_idx += 1; - } - } - } - } - } - - // Write the modified PDF to a buffer - let mut output_buf = Vec::new(); - pdf_doc.save_to(&mut output_buf).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF save failed: {e}")) - })?; - - blob.content = Bytes::from(output_buf); - count += 1; - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} diff --git a/crates/nvisy-media/src/actions/apply_tabular_redaction.rs b/crates/nvisy-media/src/actions/apply_tabular_redaction.rs deleted file mode 100644 index ead39f9..0000000 --- a/crates/nvisy-media/src/actions/apply_tabular_redaction.rs +++ /dev/null @@ -1,150 +0,0 @@ -//! Tabular data redaction action — applies redaction to specific cells. - -use serde::Deserialize; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::TabularData; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; - -/// Typed parameters for [`ApplyTabularRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyTabularRedactionParams {} - -/// Applies pending redactions to tabular data cells. -/// -/// For entities with `row_index` and `column_index`, the corresponding cell -/// value is redacted according to the redaction method (mask, replace, -/// remove, hash). -pub struct ApplyTabularRedactionAction; - -#[async_trait::async_trait] -impl Action for ApplyTabularRedactionAction { - type Params = ApplyTabularRedactionParams; - - fn id(&self) -> &str { - "apply-tabular-redaction" - } - - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - - async fn execute( - &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - _params: Self::Params, - ) -> Result<u64, Error> { - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let mut tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read tabular: {e}")) - })?; - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities: {e}")) - })?; - let redactions: Vec<Redaction> = blob.get_artifacts("redactions").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read redactions: {e}")) - })?; - - // Build entity->redaction map - let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions - .iter() - .filter(|r| !r.applied) - .map(|r| (r.entity_id, r)) - .collect(); - - let mut modified = false; - - for entity in &entities { - if let (Some(row_idx), Some(col_idx)) = - (entity.location.row_index, entity.location.column_index) - { - if let Some(redaction) = redaction_map.get(&entity.data.id) { - // Apply to all matching tables - for table in &mut tables { - if let Some(row) = table.rows.get_mut(row_idx) { - if let Some(cell) = row.get_mut(col_idx) { - *cell = apply_cell_redaction( - cell, - redaction.method, - &redaction.replacement_value, - ); - modified = true; - count += 1; - } - } - } - } - } - } - - if modified { - // Replace tabular artifact - blob.artifacts.remove("tabular"); - for table in &tables { - blob.add_artifact("tabular", table).map_err(|e| { - 
Error::new(ErrorKind::Runtime, format!("failed to add tabular: {e}")) - })?; - } - - // Mark redactions as applied - let mut updated_redactions: Vec<Redaction> = redactions.clone(); - for r in &mut updated_redactions { - if redaction_map.contains_key(&r.entity_id) { - r.applied = true; - } - } - blob.artifacts.remove("redactions"); - for r in &updated_redactions { - blob.add_artifact("redactions", r).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add redaction: {e}")) - })?; - } - } - - if output.send(blob).await.is_err() { - return Ok(count); - } - } - - Ok(count) - } -} - -fn apply_cell_redaction( - cell: &str, - method: RedactionMethod, - replacement: &str, -) -> String { - match method { - RedactionMethod::Mask => { - // Mask all but last 4 characters - if cell.len() > 4 { - format!("{}{}", "*".repeat(cell.len() - 4), &cell[cell.len() - 4..]) - } else { - "*".repeat(cell.len()) - } - } - RedactionMethod::Replace => replacement.to_string(), - RedactionMethod::Remove => String::new(), - RedactionMethod::Hash => { - // Simple hash representation - format!("[HASH:{:x}]", hash_string(cell)) - } - _ => replacement.to_string(), - } -} - -fn hash_string(s: &str) -> u64 { - use std::hash::{Hash, Hasher}; - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - s.hash(&mut hasher); - hasher.finish() -} diff --git a/crates/nvisy-media/src/actions/mod.rs b/crates/nvisy-media/src/actions/mod.rs deleted file mode 100644 index 5e37f74..0000000 --- a/crates/nvisy-media/src/actions/mod.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Pipeline actions for applying redactions to media (images, tabular data, PDFs). - -/// Applies image redactions (blur, block) to image artifacts. -pub mod apply_image_redaction; -/// Applies redactions to tabular data cells. -pub mod apply_tabular_redaction; -/// Reassembles redacted content into PDF files. -pub mod apply_pdf_redaction; -/// Placeholder for audio redaction. -pub mod apply_audio_redaction; diff --git a/crates/nvisy-media/src/lib.rs b/crates/nvisy-media/src/lib.rs deleted file mode 100644 index cf43651..0000000 --- a/crates/nvisy-media/src/lib.rs +++ /dev/null @@ -1,11 +0,0 @@ -#![forbid(unsafe_code)] -#![cfg_attr(docsrs, feature(doc_cfg))] -#![doc = include_str!("../README.md")] - -/// Image rendering primitives (blur, block overlay). -pub mod render; -/// Pipeline actions for applying redactions to media. -pub mod actions; - -#[doc(hidden)] -pub mod prelude; diff --git a/crates/nvisy-media/src/prelude.rs b/crates/nvisy-media/src/prelude.rs deleted file mode 100644 index b450238..0000000 --- a/crates/nvisy-media/src/prelude.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! Convenience re-exports. 
-pub use crate::actions::apply_image_redaction::ApplyImageRedactionAction; -pub use crate::actions::apply_tabular_redaction::ApplyTabularRedactionAction; -pub use crate::actions::apply_pdf_redaction::ApplyPdfRedactionAction; -pub use crate::actions::apply_audio_redaction::ApplyAudioRedactionAction; diff --git a/crates/nvisy-object/Cargo.toml b/crates/nvisy-object/Cargo.toml index dd265af..b8682c1 100644 --- a/crates/nvisy-object/Cargo.toml +++ b/crates/nvisy-object/Cargo.toml @@ -24,6 +24,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +nvisy-pipeline = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-object/src/prelude.rs b/crates/nvisy-object/src/prelude.rs index 3fc3355..0217bb4 100644 --- a/crates/nvisy-object/src/prelude.rs +++ b/crates/nvisy-object/src/prelude.rs @@ -1,5 +1,5 @@ //! Convenience re-exports. -pub use crate::providers::s3::S3ProviderFactory; +pub use crate::providers::s3::S3Provider; pub use crate::streams::read::ObjectReadStream; pub use crate::streams::write::ObjectWriteStream; pub use crate::streams::{StreamSource, StreamTarget}; diff --git a/crates/nvisy-object/src/providers/s3.rs b/crates/nvisy-object/src/providers/s3.rs index d264112..c1cc8e7 100644 --- a/crates/nvisy-object/src/providers/s3.rs +++ b/crates/nvisy-object/src/providers/s3.rs @@ -1,7 +1,7 @@ //! S3-compatible provider implementation using the MinIO Rust SDK. //! //! Provides [`S3ObjectStoreClient`] which implements [`ObjectStoreClient`] and -//! [`S3ProviderFactory`] which plugs into the engine's provider system. +//! [`S3Provider`] which plugs into the engine's provider system. //! //! Works with MinIO, AWS S3, and any S3-compatible service. @@ -14,7 +14,7 @@ use minio::s3::types::{S3Api, ToStream}; use minio::s3::{Client as MinioClient, ClientBuilder as MinioClientBuilder}; use nvisy_core::error::Error; -use nvisy_core::registry::provider::{ConnectedInstance, ProviderFactory}; +use nvisy_pipeline::provider::{ConnectedInstance, Provider}; use crate::client::{GetResult, ListResult, ObjectStoreBox, ObjectStoreClient}; /// S3-compatible object store client. @@ -131,10 +131,10 @@ pub struct S3Credentials { fn default_region() -> String { "us-east-1".to_string() } /// Factory that creates [`S3ObjectStoreClient`] instances from typed credentials. -pub struct S3ProviderFactory; +pub struct S3Provider; #[async_trait::async_trait] -impl ProviderFactory for S3ProviderFactory { +impl Provider for S3Provider { type Credentials = S3Credentials; type Client = ObjectStoreBox; diff --git a/crates/nvisy-object/src/streams/mod.rs b/crates/nvisy-object/src/streams/mod.rs index 6542776..3befadf 100644 --- a/crates/nvisy-object/src/streams/mod.rs +++ b/crates/nvisy-object/src/streams/mod.rs @@ -3,13 +3,13 @@ use serde::de::DeserializeOwned; use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; +use nvisy_core::io::ContentData; use nvisy_core::error::Error; -/// A source stream that reads blobs from an external system into the pipeline. +/// A source stream that reads content from an external system into the pipeline. /// /// Implementations connect to a storage backend (e.g. S3, local filesystem) -/// and emit blobs into the pipeline's input channel. +/// and emit content data into the pipeline's input channel. #[async_trait::async_trait] pub trait StreamSource: Send + Sync + 'static { /// Strongly-typed parameters for this stream source. 
@@ -19,24 +19,22 @@ pub trait StreamSource: Send + Sync + 'static { /// Unique identifier for this stream source (e.g. `"s3-read"`). fn id(&self) -> &str; - /// Validate source parameters before execution. - fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; - /// Read blobs from the external system and send them to `output`. + /// Read content from the external system and send it to `output`. /// - /// Returns the number of blobs read. + /// Returns the number of items read. async fn read( &self, - output: mpsc::Sender<Blob>, + output: mpsc::Sender<ContentData>, params: Self::Params, client: Self::Client, ) -> Result<u64, Error>; } -/// A target stream that writes blobs from the pipeline to an external system. +/// A target stream that writes content from the pipeline to an external system. /// -/// Implementations receive processed blobs from the pipeline and persist -/// them to a storage backend. +/// Implementations receive processed content data from the pipeline and persist +/// it to a storage backend. #[async_trait::async_trait] pub trait StreamTarget: Send + Sync + 'static { /// Strongly-typed parameters for this stream target. @@ -46,15 +44,13 @@ pub trait StreamTarget: Send + Sync + 'static { /// Unique identifier for this stream target (e.g. `"s3-write"`). fn id(&self) -> &str; - /// Validate target parameters before execution. - fn validate_params(&self, params: &Self::Params) -> Result<(), Error>; - /// Receive blobs from `input` and write them to the external system. + /// Receive content from `input` and write it to the external system. /// - /// Returns the number of blobs written. + /// Returns the number of items written. async fn write( &self, - input: mpsc::Receiver<Blob>, + input: mpsc::Receiver<ContentData>, params: Self::Params, client: Self::Client, ) -> Result<u64, Error>; diff --git a/crates/nvisy-object/src/streams/read.rs b/crates/nvisy-object/src/streams/read.rs index e4d5ee1..90fb8a4 100644 --- a/crates/nvisy-object/src/streams/read.rs +++ b/crates/nvisy-object/src/streams/read.rs @@ -3,7 +3,8 @@ use serde::Deserialize; use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; +use nvisy_core::io::ContentData; +use nvisy_core::path::ContentSource; use nvisy_core::error::Error; use super::StreamSource; use crate::client::ObjectStoreBox; @@ -23,7 +24,7 @@ pub struct ObjectReadParams { fn default_batch_size() -> usize { 100 } /// A [`StreamSource`] that lists and fetches objects from an S3-compatible store, -/// emitting each object as a [`Blob`] onto the output channel. +/// emitting each object as a [`ContentData`] onto the output channel. 
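+///
+/// A minimal usage sketch, not a doctest: the channel size, `params`, and the
+/// connected `client` are assumptions about the surrounding runtime wiring,
+/// not part of this module's API.
+///
+/// ```ignore
+/// let (tx, mut rx) = tokio::sync::mpsc::channel(16);
+/// let reader = ObjectReadStream;
+/// // `params` is a deserialized ObjectReadParams, `client` a connected ObjectStoreBox.
+/// let read = reader.read(tx, params, client);
+/// let drain = async { while let Some(content) = rx.recv().await { /* hand off to pipeline */ } };
+/// let (read_count, _) = tokio::join!(read, drain);
+/// ```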
pub struct ObjectReadStream; #[async_trait::async_trait] @@ -33,13 +34,9 @@ impl StreamSource for ObjectReadStream { fn id(&self) -> &str { "read" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - async fn read( &self, - output: mpsc::Sender<Blob>, + output: mpsc::Sender<ContentData>, params: Self::Params, client: Self::Client, ) -> Result<u64, Error> { @@ -65,13 +62,13 @@ impl StreamSource for ObjectReadStream { .await .map_err(|e| Error::runtime(format!("Get failed for {}: {}", key, e), "object/read", true))?; - let mut blob = Blob::new(key.clone(), get_result.data); + let mut content = ContentData::new(ContentSource::new(), get_result.data); if let Some(ct) = get_result.content_type { - blob = blob.with_content_type(ct); + content = content.with_content_type(ct); } total += 1; - if output.send(blob).await.is_err() { + if output.send(content).await.is_err() { return Ok(total); } } diff --git a/crates/nvisy-object/src/streams/write.rs b/crates/nvisy-object/src/streams/write.rs index 51b9964..75902a9 100644 --- a/crates/nvisy-object/src/streams/write.rs +++ b/crates/nvisy-object/src/streams/write.rs @@ -1,9 +1,9 @@ -//! Streaming writer that uploads blobs to an S3-compatible store. +//! Streaming writer that uploads content to an S3-compatible store. use serde::Deserialize; use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; +use nvisy_core::io::ContentData; use nvisy_core::error::Error; use super::StreamTarget; use crate::client::ObjectStoreBox; @@ -12,12 +12,12 @@ use crate::client::ObjectStoreBox; #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ObjectWriteParams { - /// Key prefix prepended to each blob path. + /// Key prefix prepended to each content source UUID. #[serde(default)] pub prefix: String, } -/// A [`StreamTarget`] that receives [`Blob`]s from the input channel and +/// A [`StreamTarget`] that receives [`ContentData`] from the input channel and /// uploads each one to an S3-compatible object store. 
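+///
+/// Object keys are derived from each item's `content_source` UUID, with the
+/// configured prefix prepended. A sketch of the key construction this stream
+/// performs internally (field names as used by this module):
+///
+/// ```ignore
+/// // With prefix = "redacted/", an object is written under a key of the form:
+/// let key = format!("{}{}", params.prefix, content.content_source.to_string());
+/// // e.g. "redacted/<content-source-uuid>"
+/// ```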
pub struct ObjectWriteStream; @@ -28,13 +28,9 @@ impl StreamTarget for ObjectWriteStream { fn id(&self) -> &str { "write" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) - } - async fn write( &self, - mut input: mpsc::Receiver<Blob>, + mut input: mpsc::Receiver<ContentData>, params: Self::Params, client: Self::Client, ) -> Result<u64, Error> { @@ -43,15 +39,16 @@ impl StreamTarget for ObjectWriteStream { let prefix = ¶ms.prefix; let mut total = 0u64; - while let Some(blob) = input.recv().await { + while let Some(content) = input.recv().await { + let source_id = content.content_source.to_string(); let key = if prefix.is_empty() { - blob.path.clone() + source_id } else { - format!("{}{}", prefix, blob.path) + format!("{}{}", prefix, source_id) }; store_client - .put(&key, blob.content.clone(), blob.content_type()) + .put(&key, content.to_bytes(), content.content_type()) .await .map_err(|e| Error::runtime(format!("Put failed for {}: {}", key, e), "object/write", true))?; diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml index ddbba21..0dd29c1 100644 --- a/crates/nvisy-ontology/Cargo.toml +++ b/crates/nvisy-ontology/Cargo.toml @@ -17,19 +17,12 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[features] -schema = ["dep:schemars"] - [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } # JSON Schema generation -schemars = { workspace = true, optional = true } +schemars = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -37,7 +30,7 @@ serde_json = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["serde", "v4"] } -chrono = { workspace = true, features = ["serde"] } +jiff = { workspace = true } # Error handling derive_more = { workspace = true, features = ["display"] } diff --git a/crates/nvisy-ontology/src/ontology/audit.rs b/crates/nvisy-ontology/src/ontology/audit.rs index bb55ab7..9e2e247 100644 --- a/crates/nvisy-ontology/src/ontology/audit.rs +++ b/crates/nvisy-ontology/src/ontology/audit.rs @@ -1,14 +1,13 @@ //! Audit trail records for data protection events. -use chrono::{DateTime, Utc}; +use jiff::Timestamp; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use nvisy_core::datatypes::Data; -use nvisy_core::datatypes::Metadata; +use nvisy_core::path::ContentSource; /// Kind of auditable action recorded in an [`Audit`] entry. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum AuditAction { /// A sensitive entity was detected. @@ -28,15 +27,16 @@ pub enum AuditAction { /// Audit entries are emitted by pipeline actions and form a tamper-evident /// log of all detection, redaction, and policy decisions. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Audit { - /// Common data-item fields (id, parent_id, metadata). + /// Content source identity and lineage. #[serde(flatten)] - pub data: Data, + pub source: ContentSource, /// The kind of event this audit entry records. pub action: AuditAction, /// UTC timestamp when the event occurred. 
- pub timestamp: DateTime<Utc>, + #[schemars(with = "String")] + pub timestamp: Timestamp, /// Identifier of the related entity, if applicable. #[serde(skip_serializing_if = "Option::is_none")] pub entity_id: Option<Uuid>, @@ -57,16 +57,16 @@ pub struct Audit { pub actor: Option<String>, /// Additional unstructured details about the event. #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option<Metadata>, + pub details: Option<serde_json::Map<String, serde_json::Value>>, } impl Audit { /// Create a new audit record for the given action, timestamped to now. pub fn new(action: AuditAction) -> Self { Self { - data: Data::new(), + source: ContentSource::new(), action, - timestamp: Utc::now(), + timestamp: Timestamp::now(), entity_id: None, redaction_id: None, policy_id: None, @@ -102,7 +102,7 @@ impl Audit { } /// Attach additional unstructured details to this audit entry. - pub fn with_details(mut self, details: Metadata) -> Self { + pub fn with_details(mut self, details: serde_json::Map<String, serde_json::Value>) -> Self { self.details = Some(details); self } diff --git a/crates/nvisy-ontology/src/ontology/entity.rs b/crates/nvisy-ontology/src/ontology/entity.rs index 14e69a5..9186a8a 100644 --- a/crates/nvisy-ontology/src/ontology/entity.rs +++ b/crates/nvisy-ontology/src/ontology/entity.rs @@ -2,11 +2,11 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use nvisy_core::datatypes::Data; +use nvisy_core::path::ContentSource; /// Category of sensitive data an entity belongs to. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum EntityCategory { /// Personally Identifiable Information (names, SSNs, addresses, etc.). @@ -23,7 +23,7 @@ pub enum EntityCategory { /// Method used to detect a sensitive entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum DetectionMethod { /// Regular expression pattern matching. @@ -44,7 +44,7 @@ pub enum DetectionMethod { /// Axis-aligned bounding box for image-based entity locations. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct BoundingBox { /// Horizontal offset of the top-left corner (pixels or normalized). pub x: f64, @@ -58,7 +58,7 @@ pub struct BoundingBox { /// Location of an entity within its source document or image. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct EntityLocation { /// Byte or character offset where the entity starts in the text. pub start_offset: usize, @@ -81,7 +81,7 @@ pub struct EntityLocation { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub column_index: Option<usize>, - /// Links this entity to a specific [`ImageData`](nvisy_core::datatypes::document::ImageData). + /// Links this entity to a specific image document. #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub image_id: Option<Uuid>, @@ -92,11 +92,11 @@ pub struct EntityLocation { /// Entities are produced by detection actions (regex, NER, checksum, etc.) /// and later consumed by redaction and audit actions. 
#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Entity { - /// Common data-item fields (id, parent_id, metadata). + /// Content source identity and lineage. #[serde(flatten)] - pub data: Data, + pub source: ContentSource, /// Broad classification of the sensitive data. pub category: EntityCategory, /// Specific type label (e.g. `"ssn"`, `"email"`, `"credit_card"`). @@ -109,9 +109,6 @@ pub struct Entity { pub confidence: f64, /// Where this entity was found in the source document. pub location: EntityLocation, - /// Identifier of the source blob or document this entity came from. - #[serde(skip_serializing_if = "Option::is_none")] - pub source_id: Option<Uuid>, } impl Entity { @@ -125,20 +122,19 @@ impl Entity { location: EntityLocation, ) -> Self { Self { - data: Data::new(), + source: ContentSource::new(), category, entity_type: entity_type.into(), value: value.into(), detection_method, confidence, location, - source_id: None, } } - /// Link this entity to the blob or document it was extracted from. - pub fn with_source_id(mut self, source_id: Uuid) -> Self { - self.source_id = Some(source_id); + /// Set the parent source for lineage tracking. + pub fn with_parent(mut self, parent: &ContentSource) -> Self { + self.source = self.source.with_parent(parent); self } } diff --git a/crates/nvisy-ontology/src/ontology/redaction.rs b/crates/nvisy-ontology/src/ontology/redaction.rs index e7c9fe3..ea75cc1 100644 --- a/crates/nvisy-ontology/src/ontology/redaction.rs +++ b/crates/nvisy-ontology/src/ontology/redaction.rs @@ -2,11 +2,11 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; -use nvisy_core::datatypes::Data; +use nvisy_core::path::ContentSource; /// Strategy used to redact or obfuscate a detected entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] #[serde(rename_all = "snake_case")] pub enum RedactionMethod { /// Replace characters with a mask character (e.g. `***-**-1234`). @@ -32,11 +32,11 @@ pub enum RedactionMethod { /// Each `Redaction` is linked to exactly one [`Entity`](super::entity::Entity) /// via `entity_id`. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Redaction { - /// Common data-item fields (id, parent_id, metadata). + /// Content source identity and lineage. #[serde(flatten)] - pub data: Data, + pub source: ContentSource, /// Identifier of the entity being redacted. pub entity_id: Uuid, /// Redaction strategy applied to the entity. @@ -61,7 +61,7 @@ impl Redaction { replacement_value: impl Into<String>, ) -> Self { Self { - data: Data::new(), + source: ContentSource::new(), entity_id, method, replacement_value: replacement_value.into(), diff --git a/crates/nvisy-ontology/src/redaction/context.rs b/crates/nvisy-ontology/src/redaction/context.rs index 05404fe..8a2b089 100644 --- a/crates/nvisy-ontology/src/redaction/context.rs +++ b/crates/nvisy-ontology/src/redaction/context.rs @@ -9,7 +9,7 @@ use crate::ontology::redaction::RedactionMethod; /// When included in a [`RedactionContext`], this rule overrides the /// default redaction method for a specific entity type. 
#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct EntityRedactionRule { /// The entity type this override applies to (e.g. `"ssn"`, `"email"`). pub entity_type: String, @@ -26,7 +26,7 @@ pub struct EntityRedactionRule { /// directly into an [`Entity`](crate::ontology::entity::Entity) with /// `DetectionMethod::Manual` and confidence 1.0. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct ManualAnnotation { /// Broad classification of the annotated data. pub category: EntityCategory, @@ -61,7 +61,7 @@ pub struct ManualAnnotation { /// specifying categories, entity types, confidence thresholds, and /// redaction methods for a single redaction invocation. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct RedactionContext { /// Entity categories to scan for. Empty = all. #[serde(default)] diff --git a/crates/nvisy-ontology/src/redaction/policy.rs b/crates/nvisy-ontology/src/redaction/policy.rs index 19391ba..5e37676 100644 --- a/crates/nvisy-ontology/src/redaction/policy.rs +++ b/crates/nvisy-ontology/src/redaction/policy.rs @@ -1,7 +1,7 @@ //! Redaction policies and rules. use serde::{Deserialize, Serialize}; -use nvisy_core::datatypes::Data; +use nvisy_core::path::ContentSource; use crate::ontology::entity::EntityCategory; use crate::ontology::redaction::RedactionMethod; @@ -11,7 +11,7 @@ use crate::ontology::redaction::RedactionMethod; /// confidence threshold, and the redaction method to apply. Rules are /// evaluated in ascending priority order. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct PolicyRule { /// Unique identifier for this rule within its policy. pub id: String, @@ -38,11 +38,11 @@ pub struct PolicyRule { /// Policies are evaluated by [`find_matching_rule`](Policy::find_matching_rule) /// which returns the first matching enabled rule sorted by priority. #[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))] +#[derive(schemars::JsonSchema)] pub struct Policy { - /// Common data-item fields (id, parent_id, metadata). + /// Content source identity and lineage. #[serde(flatten)] - pub data: Data, + pub source: ContentSource, /// Human-readable policy name. pub name: String, /// Ordered list of redaction rules. @@ -58,7 +58,7 @@ impl Policy { /// fallback method ([`Mask`](RedactionMethod::Mask)) and threshold (0.5). 
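+    ///
+    /// A minimal construction sketch (the empty rule list means every entity
+    /// falls through to the defaults described above):
+    ///
+    /// ```ignore
+    /// let policy = Policy::new("baseline", Vec::new());
+    /// // defaults: Mask redaction method, 0.5 confidence threshold
+    /// ```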
pub fn new(name: impl Into<String>, rules: Vec<PolicyRule>) -> Self { Self { - data: Data::new(), + source: ContentSource::new(), name: name.into(), rules, default_method: RedactionMethod::Mask, diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml new file mode 100644 index 0000000..6e08fa8 --- /dev/null +++ b/crates/nvisy-pattern/Cargo.toml @@ -0,0 +1,30 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-pattern" +description = "Built-in regex patterns and dictionaries for PII/PHI detection" +keywords = ["nvisy", "pattern", "pii", "dictionary"] +categories = ["text-processing"] + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-ontology = { workspace = true } + +# (De)serialization +serde = { workspace = true, features = ["derive"] } +serde_json = { workspace = true, features = [] } diff --git a/crates/nvisy-detect/assets/dictionaries/first_names.txt b/crates/nvisy-pattern/assets/dictionaries/first_names.txt similarity index 100% rename from crates/nvisy-detect/assets/dictionaries/first_names.txt rename to crates/nvisy-pattern/assets/dictionaries/first_names.txt diff --git a/crates/nvisy-detect/assets/dictionaries/last_names.txt b/crates/nvisy-pattern/assets/dictionaries/last_names.txt similarity index 100% rename from crates/nvisy-detect/assets/dictionaries/last_names.txt rename to crates/nvisy-pattern/assets/dictionaries/last_names.txt diff --git a/crates/nvisy-detect/assets/dictionaries/medical_terms.txt b/crates/nvisy-pattern/assets/dictionaries/medical_terms.txt similarity index 100% rename from crates/nvisy-detect/assets/dictionaries/medical_terms.txt rename to crates/nvisy-pattern/assets/dictionaries/medical_terms.txt diff --git a/crates/nvisy-detect/assets/patterns.json b/crates/nvisy-pattern/assets/patterns.json similarity index 100% rename from crates/nvisy-detect/assets/patterns.json rename to crates/nvisy-pattern/assets/patterns.json diff --git a/crates/nvisy-detect/src/dictionaries/mod.rs b/crates/nvisy-pattern/src/dictionaries/mod.rs similarity index 100% rename from crates/nvisy-detect/src/dictionaries/mod.rs rename to crates/nvisy-pattern/src/dictionaries/mod.rs diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs new file mode 100644 index 0000000..e927e45 --- /dev/null +++ b/crates/nvisy-pattern/src/lib.rs @@ -0,0 +1,16 @@ +//! Built-in regex patterns and dictionaries for PII/PHI detection. +//! +//! This crate provides the embedded pattern definitions (compiled from +//! `assets/patterns.json`) and dictionary data (first names, last names, +//! medical terms) used by the nvisy pipeline's detection actions. + +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] + +/// Built-in regex pattern definitions and validation helpers. +pub mod patterns; +/// Built-in dictionary data for name and term matching. 
+pub mod dictionaries; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-detect/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs similarity index 100% rename from crates/nvisy-detect/src/patterns/mod.rs rename to crates/nvisy-pattern/src/patterns/mod.rs diff --git a/crates/nvisy-detect/src/patterns/validators.rs b/crates/nvisy-pattern/src/patterns/validators.rs similarity index 91% rename from crates/nvisy-detect/src/patterns/validators.rs rename to crates/nvisy-pattern/src/patterns/validators.rs index 842c3ca..b18baad 100644 --- a/crates/nvisy-detect/src/patterns/validators.rs +++ b/crates/nvisy-pattern/src/patterns/validators.rs @@ -1,7 +1,7 @@ //! Checksum and format validators for detected entity values. //! //! These functions are referenced by pattern definitions in `patterns.json` -//! and are also used directly by [`DetectChecksumAction`](crate::actions::detect_checksum::DetectChecksumAction). +//! and are also used directly by the checksum detection action. /// Validate a US Social Security Number. pub fn validate_ssn(value: &str) -> bool { diff --git a/crates/nvisy-pattern/src/prelude.rs b/crates/nvisy-pattern/src/prelude.rs new file mode 100644 index 0000000..2a352cd --- /dev/null +++ b/crates/nvisy-pattern/src/prelude.rs @@ -0,0 +1,4 @@ +//! Convenience re-exports for common nvisy-pattern types. + +pub use crate::patterns::{PatternDefinition, get_all_pattern_names, get_all_patterns, get_pattern}; +pub use crate::dictionaries::get_builtin; diff --git a/crates/nvisy-detect/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml similarity index 59% rename from crates/nvisy-detect/Cargo.toml rename to crates/nvisy-pipeline/Cargo.toml index c169374..11fb7d8 100644 --- a/crates/nvisy-detect/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -1,9 +1,9 @@ # https://doc.rust-lang.org/cargo/reference/manifest.html [package] -name = "nvisy-detect" -description = "Regex patterns, policy evaluation, and redaction actions for Nvisy" -keywords = ["nvisy", "detection", "regex", "redaction"] +name = "nvisy-pipeline" +description = "Pipeline action/provider traits with detection and redaction actions for Nvisy" +keywords = ["nvisy", "pipeline", "detection", "redaction"] categories = ["text-processing"] version = { workspace = true } @@ -21,10 +21,19 @@ documentation = { workspace = true } all-features = true rustdoc-args = ["--cfg", "docsrs"] +[features] +default = ["image-redaction", "pdf-redaction"] +# Image blur/block redaction via image + imageproc +image-redaction = ["dep:image", "dep:imageproc"] +# PDF reassembly with redacted content via lopdf +pdf-redaction = ["dep:lopdf"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true } +nvisy-ingest = { workspace = true } +nvisy-pattern = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -36,6 +45,7 @@ async-trait = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } +bytes = { workspace = true } # Text processing regex = { workspace = true, features = [] } @@ -43,3 +53,10 @@ aho-corasick = { workspace = true } # Observability tracing = { workspace = true, features = [] } + +# Image processing (feature-gated) +image = { workspace = true, optional = true } +imageproc = { workspace = true, optional = true } + +# PDF manipulation (feature-gated) +lopdf = { workspace = true, optional = true } diff --git a/crates/nvisy-pipeline/src/action.rs 
b/crates/nvisy-pipeline/src/action.rs new file mode 100644 index 0000000..584c5d9 --- /dev/null +++ b/crates/nvisy-pipeline/src/action.rs @@ -0,0 +1,35 @@ +//! The `Action` trait -- the fundamental processing unit in a pipeline. + +use serde::de::DeserializeOwned; + +use nvisy_core::error::Error; + +/// A processing step with typed input and output. +/// +/// Actions are the primary unit of work in a pipeline. Each action is +/// constructed via [`connect`](Action::connect), which validates and +/// stores parameters, then executed via [`execute`](Action::execute). +/// +/// Actions that need a provider client should hold it as a struct field +/// rather than receiving it as a parameter. +#[async_trait::async_trait] +pub trait Action: Sized + Send + Sync + 'static { + /// Strongly-typed parameters for this action. + type Params: DeserializeOwned + Send; + /// Typed input for this action. + type Input: Send; + /// Typed output for this action. + type Output: Send; + + /// Unique identifier for this action (e.g. "detect-regex"). + fn id(&self) -> &str; + + /// Validate parameters and construct a configured action instance. + /// + /// This is where parameter validation, regex compilation, automata + /// building, and other setup work happens. + async fn connect(params: Self::Params) -> Result<Self, Error>; + + /// Execute the action with typed input, returning typed output. + async fn execute(&self, input: Self::Input) -> Result<Self::Output, Error>; +} diff --git a/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs new file mode 100644 index 0000000..77b7593 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs @@ -0,0 +1,47 @@ +//! Placeholder audio redaction action. + +use serde::Deserialize; + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::action::Action; + +/// Typed parameters for [`ApplyAudioRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyAudioRedactionParams { + /// Time segments to mute, as `(start_seconds, end_seconds)` pairs. + #[serde(default)] + pub mute_segments: Vec<(f64, f64)>, +} + +/// Placeholder action for audio redaction. +/// +/// Passes through content unchanged -- audio redaction is not yet implemented. +pub struct ApplyAudioRedactionAction { + params: ApplyAudioRedactionParams, +} + +#[async_trait::async_trait] +impl Action for ApplyAudioRedactionAction { + type Params = ApplyAudioRedactionParams; + type Input = ContentData; + type Output = ContentData; + + fn id(&self) -> &str { + "apply-audio-redaction" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Self::Output, Error> { + tracing::warn!("Audio redaction not yet implemented, passing through unchanged"); + Ok(input) + } +} diff --git a/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs new file mode 100644 index 0000000..3b48527 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs @@ -0,0 +1,129 @@ +//! Image redaction action -- applies blur or block overlay to image regions. 
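+//!
+//! The action implements the [`Action`](crate::action::Action) trait: parameters
+//! are validated once in `connect`, and each `execute` call consumes a batch of
+//! documents together with the entities and redactions that target them.
+//!
+//! A minimal wiring sketch (the `documents`, `entities`, and `redactions` inputs
+//! are assumed to come from earlier detection and policy steps, not built here):
+//!
+//! ```ignore
+//! // Requires the Action trait in scope.
+//! let action = ApplyImageRedactionAction::connect(ApplyImageRedactionParams {
+//!     blur_sigma: 15.0,
+//!     block_color: [0, 0, 0, 255],
+//! })
+//! .await?;
+//! let redacted = action.execute((documents, entities, redactions)).await?;
+//! ```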
+ +use bytes::Bytes; +use serde::Deserialize; + +use nvisy_ingest::handler::{FormatHandler, ImageHandler}; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::{BoundingBox, Entity}; +use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_core::error::{Error, ErrorKind}; + +use crate::action::Action; +use crate::render::{blur, block}; + +/// Typed parameters for [`ApplyImageRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyImageRedactionParams { + /// Sigma value for gaussian blur. + #[serde(default = "default_sigma")] + pub blur_sigma: f32, + /// RGBA color for block overlays. + #[serde(default = "default_color")] + pub block_color: [u8; 4], +} + +fn default_sigma() -> f32 { + 15.0 +} +fn default_color() -> [u8; 4] { + [0, 0, 0, 255] +} + +/// Applies blur or block redaction to image regions identified by entities +/// with bounding boxes. +pub struct ApplyImageRedactionAction { + params: ApplyImageRedactionParams, +} + +#[async_trait::async_trait] +impl Action for ApplyImageRedactionAction { + type Params = ApplyImageRedactionParams; + type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); + type Output = Vec<Document<FormatHandler>>; + + fn id(&self) -> &str { + "apply-image-redaction" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (documents, entities, redactions) = input; + + // Build entity->redaction map + let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions + .iter() + .filter(|r| !r.applied) + .map(|r| (r.entity_id, r)) + .collect(); + + // Collect entities with bounding boxes, grouped by redaction method + let mut blur_regions: Vec<BoundingBox> = Vec::new(); + let mut block_regions: Vec<BoundingBox> = Vec::new(); + + for entity in &entities { + if let Some(bbox) = &entity.location.bounding_box { + if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { + match redaction.method { + RedactionMethod::Blur => blur_regions.push(bbox.clone()), + RedactionMethod::Block => block_regions.push(bbox.clone()), + // Default non-image methods to block for images + _ => block_regions.push(bbox.clone()), + } + } + } + } + + if blur_regions.is_empty() && block_regions.is_empty() { + return Ok(documents); + } + + // Filter for image documents only + let mut new_docs = Vec::new(); + for doc in &documents { + let image_data = match &doc.data { + Some(d) => d, + None => { + new_docs.push(doc.clone()); + continue; + } + }; + + let dyn_img = image::load_from_memory(image_data).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) + })?; + + let mut result = dyn_img; + if !blur_regions.is_empty() { + result = blur::apply_gaussian_blur(&result, &blur_regions, self.params.blur_sigma); + } + if !block_regions.is_empty() { + let color = image::Rgba(self.params.block_color); + result = block::apply_block_overlay(&result, &block_regions, color); + } + + // Encode back to PNG + let mut buf = std::io::Cursor::new(Vec::new()); + result + .write_to(&mut buf, image::ImageFormat::Png) + .map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) + })?; + + let new_doc = Document::new(FormatHandler::Image(ImageHandler)) + .with_data(Bytes::from(buf.into_inner()), "image/png") + .with_dimensions(result.width(), result.height()); + + 
new_docs.push(new_doc); + } + + Ok(new_docs) + } +} diff --git a/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs new file mode 100644 index 0000000..7d5b6c3 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs @@ -0,0 +1,148 @@ +//! PDF reassembly action -- writes redacted content back to PDF bytes. + +use bytes::Bytes; +use serde::Deserialize; + +use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::document::Document; +use nvisy_core::error::{Error, ErrorKind}; +use nvisy_core::io::ContentData; +use nvisy_core::path::ContentSource; + +use crate::action::Action; + +/// Typed parameters for [`ApplyPdfRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyPdfRedactionParams {} + +/// Reassembles redacted text and images back into the original PDF. +/// +/// Uses `lopdf` to: +/// 1. Replace PDF content streams with redacted text. +/// 2. Replace embedded image XObjects with redacted image data. +/// 3. Write the modified PDF back to a new `ContentData`. +pub struct ApplyPdfRedactionAction { + params: ApplyPdfRedactionParams, +} + +#[async_trait::async_trait] +impl Action for ApplyPdfRedactionAction { + type Params = ApplyPdfRedactionParams; + type Input = (ContentData, Vec<Document<FormatHandler>>); + type Output = ContentData; + + fn id(&self) -> &str { + "apply-pdf-redaction" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (content, documents) = input; + + // Only process if the content is actually a PDF + let is_pdf = content + .content_type() + .map(|ct| ct == "application/pdf") + .unwrap_or(false); + + if !is_pdf { + return Ok(content); + } + + let mut pdf_doc = lopdf::Document::load_mem(content.as_bytes()).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PDF load failed: {e}")) + })?; + + // Collect image documents for XObject replacement + let images: Vec<&Document<FormatHandler>> = documents + .iter() + .filter(|d| d.data.is_some()) + .collect(); + + if !images.is_empty() { + let pages: Vec<(u32, lopdf::ObjectId)> = + pdf_doc.get_pages().into_iter().collect(); + let mut image_idx = 0; + + for (_page_num, page_id) in &pages { + let (resources_opt, _) = match pdf_doc.get_page_resources(*page_id) { + Ok(r) => r, + Err(_) => continue, + }; + + let resources = match resources_opt { + Some(res) => res.clone(), + None => continue, + }; + + let xobject_obj = match resources.get(b"XObject") { + Ok(obj) => obj.clone(), + Err(_) => continue, + }; + + let xobjects = match pdf_doc.dereference(&xobject_obj) { + Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(), + _ => continue, + }; + + for (_name, obj_ref) in xobjects.iter() { + let stream_id = match obj_ref { + lopdf::Object::Reference(id) => Some(*id), + _ => None, + }; + + let is_image = match pdf_doc.dereference(obj_ref) { + Ok((_, lopdf::Object::Stream(s))) => s + .dict + .get(b"Subtype") + .ok() + .and_then(|st| { + if let lopdf::Object::Name(n) = st { + Some(n.as_slice() == b"Image") + } else { + None + } + }) + .unwrap_or(false), + _ => false, + }; + + if is_image { + if let (Some(sid), Some(redacted_doc)) = + (stream_id, images.get(image_idx)) + { + if let Some(ref data) = redacted_doc.data { + let new_stream = lopdf::Stream::new( + lopdf::Dictionary::new(), + data.to_vec(), + ); + pdf_doc + .objects + .insert(sid, 
lopdf::Object::Stream(new_stream)); + } + image_idx += 1; + } + } + } + } + } + + // Write the modified PDF to a buffer + let mut output_buf = Vec::new(); + pdf_doc.save_to(&mut output_buf).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("PDF save failed: {e}")) + })?; + + let result = ContentData::new(ContentSource::new(), Bytes::from(output_buf)) + .with_content_type("application/pdf"); + + Ok(result) + } +} diff --git a/crates/nvisy-pipeline/src/actions/apply_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_redaction.rs new file mode 100644 index 0000000..cdce89a --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/apply_redaction.rs @@ -0,0 +1,133 @@ +//! Action that applies pending redactions to document text. + +use std::collections::HashMap; +use uuid::Uuid; + +use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::ontology::redaction::Redaction; +use nvisy_core::error::Error; + +use crate::action::Action; + +/// Applies pending [`Redaction`] instructions to document content. +/// +/// The action correlates entities with their redactions, locates the +/// corresponding text spans inside each document, and replaces them with +/// the computed replacement values. The resulting redacted documents are +/// returned. +pub struct ApplyRedactionAction; + +/// A single text replacement that has been resolved but not yet applied. +struct PendingRedaction { + /// Byte offset where the redaction starts in the original text. + start_offset: usize, + /// Byte offset where the redaction ends (exclusive) in the original text. + end_offset: usize, + /// The string that will replace the original span. + replacement_value: String, +} + +#[async_trait::async_trait] +impl Action for ApplyRedactionAction { + type Params = (); + type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); + type Output = Vec<Document<FormatHandler>>; + + fn id(&self) -> &str { + "apply-redaction" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Vec<Document<FormatHandler>>, Error> { + let (documents, entities, redactions) = input; + + let entity_map: HashMap<Uuid, &Entity> = + entities.iter().map(|e| (e.source.as_uuid(), e)).collect(); + let redaction_map: HashMap<Uuid, &Redaction> = + redactions.iter().map(|r| (r.entity_id, r)).collect(); + + let mut result_docs = Vec::new(); + + for doc in &documents { + let content = match &doc.content { + Some(c) => c, + None => { + result_docs.push(doc.clone()); + continue; + } + }; + + let mut pending: Vec<PendingRedaction> = Vec::new(); + + for (entity_id, redaction) in &redaction_map { + let entity = match entity_map.get(entity_id) { + Some(e) => e, + None => continue, + }; + + // Check entity belongs to this document + let belongs = entity.source.parent_id() == Some(doc.source.as_uuid()); + if !belongs { + continue; + } + + pending.push(PendingRedaction { + start_offset: entity.location.start_offset, + end_offset: entity.location.end_offset, + replacement_value: redaction.replacement_value.clone(), + }); + } + + if pending.is_empty() { + result_docs.push(doc.clone()); + continue; + } + + let redacted_content = apply_redactions(content, &mut pending); + let mut result = Document::new(FormatHandler::Plaintext(PlaintextHandler)) + .with_text(redacted_content); + result.title = doc.title.clone(); + result.elements = doc.elements.clone(); + 
result.page_count = doc.page_count; + result.source.set_parent_id(Some(doc.source.as_uuid())); + + result_docs.push(result); + } + + Ok(result_docs) + } +} + +/// Applies a set of pending redactions to `text`, returning the redacted result. +/// +/// Replacements are applied right-to-left (descending start offset) so that +/// earlier byte offsets remain valid after each substitution. +fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { + // Sort by start offset descending (right-to-left) to preserve positions + pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); + + let mut result = text.to_string(); + for redaction in pending.iter() { + let start = redaction.start_offset.min(result.len()); + let end = redaction.end_offset.min(result.len()); + if start >= end { + continue; + } + + result = format!( + "{}{}{}", + &result[..start], + redaction.replacement_value, + &result[end..] + ); + } + result +} diff --git a/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs new file mode 100644 index 0000000..686c5cc --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs @@ -0,0 +1,108 @@ +//! Tabular data redaction action -- applies redaction to specific cells. + +use serde::Deserialize; + +use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_core::error::Error; + +use crate::action::Action; + +/// Typed parameters for [`ApplyTabularRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyTabularRedactionParams {} + +/// Applies pending redactions to tabular data cells. +/// +/// For entities with `row_index` and `column_index`, the corresponding cell +/// value is redacted according to the redaction method (mask, replace, +/// remove, hash). 
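+///
+/// The masking rule keeps only the last four characters of a cell; shorter
+/// values are masked entirely. A sketch of the per-cell behaviour
+/// (illustrative values only):
+///
+/// ```ignore
+/// // RedactionMethod::Mask
+/// // "123-45-6789" -> "*******6789"
+/// // "abc"         -> "***"
+/// // RedactionMethod::Remove
+/// // "anything"    -> ""
+/// ```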
+pub struct ApplyTabularRedactionAction { + params: ApplyTabularRedactionParams, +} + +#[async_trait::async_trait] +impl Action for ApplyTabularRedactionAction { + type Params = ApplyTabularRedactionParams; + type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); + type Output = Vec<Document<FormatHandler>>; + + fn id(&self) -> &str { + "apply-tabular-redaction" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (mut documents, entities, redactions) = input; + + // Build entity->redaction map + let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions + .iter() + .filter(|r| !r.applied) + .map(|r| (r.entity_id, r)) + .collect(); + + for entity in &entities { + if let (Some(row_idx), Some(col_idx)) = + (entity.location.row_index, entity.location.column_index) + { + if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { + for doc in &mut documents { + if let Some(rows) = &mut doc.rows { + if let Some(row) = rows.get_mut(row_idx) { + if let Some(cell) = row.get_mut(col_idx) { + *cell = apply_cell_redaction( + cell, + redaction.method, + &redaction.replacement_value, + ); + } + } + } + } + } + } + } + + Ok(documents) + } +} + +fn apply_cell_redaction(cell: &str, method: RedactionMethod, replacement: &str) -> String { + match method { + RedactionMethod::Mask => { + // Mask all but last 4 characters + if cell.len() > 4 { + format!( + "{}{}", + "*".repeat(cell.len() - 4), + &cell[cell.len() - 4..] + ) + } else { + "*".repeat(cell.len()) + } + } + RedactionMethod::Replace => replacement.to_string(), + RedactionMethod::Remove => String::new(), + RedactionMethod::Hash => { + format!("[HASH:{:x}]", hash_string(cell)) + } + _ => replacement.to_string(), + } +} + +fn hash_string(s: &str) -> u64 { + use std::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + s.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/nvisy-pipeline/src/actions/classify.rs b/crates/nvisy-pipeline/src/actions/classify.rs new file mode 100644 index 0000000..a6387c0 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/classify.rs @@ -0,0 +1,83 @@ +//! Sensitivity classification action. + +use nvisy_ontology::ontology::entity::Entity; +use nvisy_core::error::Error; + +use crate::action::Action; + +/// Result of sensitivity classification. +pub struct ClassificationResult { + /// The computed sensitivity level (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`). + pub sensitivity_level: String, + /// Total number of entities considered. + pub total_entities: usize, +} + +/// Assigns a sensitivity level based on detected entities. +/// +/// The action inspects the entities, computes a sensitivity level +/// (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`), and returns +/// a [`ClassificationResult`]. 
+pub struct ClassifyAction; + +#[async_trait::async_trait] +impl Action for ClassifyAction { + type Params = (); + type Input = Vec<Entity>; + type Output = ClassificationResult; + + fn id(&self) -> &str { + "classify" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + entities: Self::Input, + ) -> Result<ClassificationResult, Error> { + let total_entities = entities.len(); + let sensitivity_level = compute_sensitivity_level(&entities); + + Ok(ClassificationResult { + sensitivity_level, + total_entities, + }) + } +} + +/// Computes a sensitivity level string from a set of detected entities. +/// +/// The heuristic is: +/// - `"none"` -- no entities. +/// - `"critical"` -- at least one high-confidence (>= 0.9) credential, SSN, or credit card. +/// - `"high"` -- any critical type present, or more than 10 entities total. +/// - `"medium"` -- more than 3 entities. +/// - `"low"` -- 1-3 non-critical entities. +fn compute_sensitivity_level(entities: &[Entity]) -> String { + if entities.is_empty() { + return "none".to_string(); + } + + let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); + let has_critical_types = entities.iter().any(|e| { + matches!( + e.category, + nvisy_ontology::ontology::entity::EntityCategory::Credentials + ) || e.entity_type == "ssn" + || e.entity_type == "credit_card" + }); + + if has_critical_types && has_high_confidence { + return "critical".to_string(); + } + if has_critical_types || entities.len() > 10 { + return "high".to_string(); + } + if entities.len() > 3 { + return "medium".to_string(); + } + "low".to_string() +} diff --git a/crates/nvisy-pipeline/src/actions/detect_checksum.rs b/crates/nvisy-pipeline/src/actions/detect_checksum.rs new file mode 100644 index 0000000..aafb39d --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/detect_checksum.rs @@ -0,0 +1,103 @@ +//! Checksum-based entity validation action. + +use serde::Deserialize; + +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity}; +use nvisy_core::error::Error; +use nvisy_pattern::patterns::validators::luhn_check; + +use crate::action::Action; + +/// Typed parameters for [`DetectChecksumAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectChecksumParams { + /// Whether to discard entities that fail validation. + #[serde(default = "default_true")] + pub drop_invalid: bool, + /// Amount added to confidence on successful validation. + #[serde(default = "default_boost")] + pub confidence_boost: f64, +} + +fn default_true() -> bool { + true +} +fn default_boost() -> f64 { + 0.05 +} + +/// Validates previously detected entities using checksum algorithms. +/// +/// Entities whose type has a registered validator (e.g. Luhn for credit cards) +/// are verified. Valid matches receive a confidence boost and are re-emitted +/// with [`DetectionMethod::Checksum`]. Invalid matches can optionally be +/// dropped from the pipeline. 
+pub struct DetectChecksumAction { + params: DetectChecksumParams, +} + +#[async_trait::async_trait] +impl Action for DetectChecksumAction { + type Params = DetectChecksumParams; + type Input = Vec<Entity>; + type Output = Vec<Entity>; + + fn id(&self) -> &str { + "detect-checksum" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + entities: Self::Input, + ) -> Result<Vec<Entity>, Error> { + let drop_invalid = self.params.drop_invalid; + let confidence_boost = self.params.confidence_boost; + + let mut result = Vec::new(); + + for entity in entities { + let validator = get_validator(&entity.entity_type); + + if let Some(validate) = validator { + let is_valid = validate(&entity.value); + + if !is_valid && drop_invalid { + continue; + } + + if is_valid { + let mut boosted = Entity::new( + entity.category, + &entity.entity_type, + &entity.value, + DetectionMethod::Checksum, + (entity.confidence + confidence_boost).min(1.0), + entity.location.clone(), + ); + boosted.source.set_parent_id(entity.source.parent_id()); + + result.push(boosted); + continue; + } + } + + // No validator or not valid but not dropping -- pass through + result.push(entity); + } + + Ok(result) + } +} + +/// Returns the checksum validator function for a given entity type, if one exists. +fn get_validator(entity_type: &str) -> Option<fn(&str) -> bool> { + match entity_type { + "credit_card" => Some(luhn_check), + _ => None, + } +} diff --git a/crates/nvisy-detect/src/actions/detect_dictionary.rs b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs similarity index 68% rename from crates/nvisy-detect/src/actions/detect_dictionary.rs rename to crates/nvisy-pipeline/src/actions/detect_dictionary.rs index 76ae21d..c5a75b9 100644 --- a/crates/nvisy-detect/src/actions/detect_dictionary.rs +++ b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs @@ -2,15 +2,14 @@ use aho_corasick::AhoCorasick; use serde::Deserialize; -use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, TabularData}; +use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::document::Document; use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; +use nvisy_pattern::dictionaries; -use crate::dictionaries; +use crate::action::Action; /// Definition of a single dictionary for matching. #[derive(Debug, Clone, Deserialize)] @@ -47,46 +46,44 @@ fn default_confidence() -> f64 { /// Scans document text and tabular cells against Aho-Corasick automata /// built from user-provided word lists and/or built-in gazetteers. 
-pub struct DetectDictionaryAction; +pub struct DetectDictionaryAction { + params: DetectDictionaryParams, + automata: Vec<(DictionaryDef, AhoCorasick, Vec<String>)>, +} #[async_trait::async_trait] impl Action for DetectDictionaryAction { type Params = DetectDictionaryParams; + type Input = Vec<Document<FormatHandler>>; + type Output = Vec<Entity>; fn id(&self) -> &str { "detect-dictionary" } - fn validate_params(&self, params: &Self::Params) -> Result<(), Error> { + async fn connect(params: Self::Params) -> Result<Self, Error> { if params.dictionaries.is_empty() { return Err(Error::new( ErrorKind::Validation, "at least one dictionary definition is required", )); } - Ok(()) + let automata = build_automata(¶ms.dictionaries)?; + Ok(Self { params, automata }) } async fn execute( &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - // Build automata for each dictionary - let automata = build_automata(¶ms.dictionaries)?; - let confidence = params.confidence; - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - // Scan documents - let documents: Vec<Document> = blob.get_artifacts("documents").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read documents artifact: {e}")) - })?; - - for doc in &documents { - for (def, ac, values) in &automata { - for mat in ac.find_iter(&doc.content) { + documents: Self::Input, + ) -> Result<Vec<Entity>, Error> { + let confidence = self.params.confidence; + let mut entities = Vec::new(); + + for doc in &documents { + // Text content matching + if let Some(content) = &doc.content { + for (def, ac, values) in &self.automata { + for mat in ac.find_iter(content) { let value = &values[mat.pattern().as_usize()]; let entity = Entity::new( def.category, @@ -105,27 +102,20 @@ impl Action for DetectDictionaryAction { image_id: None, }, ) - .with_source_id(doc.data.id); - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add entity: {e}")) - })?; - count += 1; + .with_parent(&doc.source); + entities.push(entity); } } } - // Scan tabular data - let tables: Vec<TabularData> = blob.get_artifacts("tabular").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read tabular artifact: {e}")) - })?; - - for table in &tables { - for (row_idx, row) in table.rows.iter().enumerate() { + // Tabular content matching + if let Some(rows) = &doc.rows { + for (row_idx, row) in rows.iter().enumerate() { for (col_idx, cell) in row.iter().enumerate() { if cell.is_empty() { continue; } - for (def, ac, values) in &automata { + for (def, ac, values) in &self.automata { for mat in ac.find_iter(cell) { let value = &values[mat.pattern().as_usize()]; let entity = Entity::new( @@ -145,33 +135,23 @@ impl Action for DetectDictionaryAction { image_id: None, }, ) - .with_source_id(table.data.id); - blob.add_artifact("entities", &entity).map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to add entity: {e}"), - ) - })?; - count += 1; + .with_parent(&doc.source); + entities.push(entity); } } } } } - - if output.send(blob).await.is_err() { - return Ok(count); - } } - Ok(count) + Ok(entities) } } /// Resolve dictionary values (builtin or custom) and build Aho-Corasick automata. 
fn build_automata( defs: &[DictionaryDef], -) -> Result<Vec<(&DictionaryDef, AhoCorasick, Vec<String>)>, Error> { +) -> Result<Vec<(DictionaryDef, AhoCorasick, Vec<String>)>, Error> { let mut result = Vec::with_capacity(defs.len()); for def in defs { @@ -198,7 +178,7 @@ fn build_automata( Error::new(ErrorKind::Runtime, format!("failed to build automaton: {e}")) })?; - result.push((def, ac, values)); + result.push((def.clone(), ac, values)); } Ok(result) diff --git a/crates/nvisy-pipeline/src/actions/detect_manual.rs b/crates/nvisy-pipeline/src/actions/detect_manual.rs new file mode 100644 index 0000000..121b9ec --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/detect_manual.rs @@ -0,0 +1,68 @@ +//! Manual annotation detection action. +//! +//! Converts user-provided [`ManualAnnotation`]s into full [`Entity`] objects. + +use serde::Deserialize; + +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_ontology::redaction::ManualAnnotation; +use nvisy_core::error::Error; + +use crate::action::Action; + +/// Typed parameters for [`DetectManualAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectManualParams {} + +/// Converts each [`ManualAnnotation`] into a full [`Entity`] with +/// `DetectionMethod::Manual` and confidence 1.0. +pub struct DetectManualAction { + params: DetectManualParams, +} + +#[async_trait::async_trait] +impl Action for DetectManualAction { + type Params = DetectManualParams; + type Input = Vec<ManualAnnotation>; + type Output = Vec<Entity>; + + fn id(&self) -> &str { + "detect-manual" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + annotations: Self::Input, + ) -> Result<Vec<Entity>, Error> { + let mut entities = Vec::new(); + + for ann in &annotations { + let entity = Entity::new( + ann.category, + &ann.entity_type, + &ann.value, + DetectionMethod::Manual, + 1.0, + EntityLocation { + start_offset: ann.start_offset.unwrap_or(0), + end_offset: ann.end_offset.unwrap_or(0), + element_id: None, + page_number: ann.page_number, + bounding_box: ann.bounding_box.clone(), + row_index: ann.row_index, + column_index: ann.column_index, + image_id: None, + }, + ); + + entities.push(entity); + } + + Ok(entities) + } +} diff --git a/crates/nvisy-pipeline/src/actions/detect_regex.rs b/crates/nvisy-pipeline/src/actions/detect_regex.rs new file mode 100644 index 0000000..e3c59f7 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/detect_regex.rs @@ -0,0 +1,113 @@ +//! Regex-based PII/PHI entity detection action. 
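// Illustrative sketch, outside the diff itself: the camelCase parameter shape
// that DetectRegexParams below is expected to deserialize, per its
// #[serde(rename_all = "camelCase")] attribute. The pattern names used here
// ("ssn", "credit_card") are hypothetical examples, not a confirmed list.
//
//   { "confidenceThreshold": 0.8, "patterns": ["ssn", "credit_card"] }
//
// When `patterns` is omitted or empty, resolve_patterns falls back to
// patterns::get_all_patterns(), as shown further down in this file.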
+ +use regex::Regex; +use serde::Deserialize; + +use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_core::error::Error; +use nvisy_pattern::patterns::{self, PatternDefinition}; + +use crate::action::Action; + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectRegexParams { + #[serde(default)] + pub confidence_threshold: f64, + #[serde(default)] + pub patterns: Option<Vec<String>>, +} + +pub struct DetectRegexAction { + params: DetectRegexParams, +} + +#[async_trait::async_trait] +impl Action for DetectRegexAction { + type Params = DetectRegexParams; + type Input = Vec<Document<FormatHandler>>; + type Output = Vec<Entity>; + + fn id(&self) -> &str { + "detect-regex" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + documents: Self::Input, + ) -> Result<Vec<Entity>, Error> { + let confidence_threshold = self.params.confidence_threshold; + let requested_patterns = &self.params.patterns; + + let active_patterns = resolve_patterns(requested_patterns); + + let compiled: Vec<(&PatternDefinition, Regex)> = active_patterns + .iter() + .filter_map(|p| Regex::new(&p.pattern_str).ok().map(|r| (*p, r))) + .collect(); + + let mut entities = Vec::new(); + + for doc in &documents { + let content = match &doc.content { + Some(c) => c, + None => continue, + }; + + for (pattern, regex) in &compiled { + for mat in regex.find_iter(content) { + let value = mat.as_str(); + + if let Some(validate) = pattern.validate { + if !validate(value) { + continue; + } + } + + if pattern.confidence < confidence_threshold { + continue; + } + + let entity = Entity::new( + pattern.category, + &pattern.entity_type, + value, + DetectionMethod::Regex, + pattern.confidence, + EntityLocation { + start_offset: mat.start(), + end_offset: mat.end(), + element_id: None, + page_number: None, + bounding_box: None, + row_index: None, + column_index: None, + image_id: None, + }, + ) + .with_parent(&doc.source); + + entities.push(entity); + } + } + } + + Ok(entities) + } +} + +fn resolve_patterns(requested: &Option<Vec<String>>) -> Vec<&'static PatternDefinition> { + match requested { + Some(names) if !names.is_empty() => names + .iter() + .filter_map(|n| patterns::get_pattern(n)) + .collect(), + _ => patterns::get_all_patterns(), + } +} diff --git a/crates/nvisy-pipeline/src/actions/detect_tabular.rs b/crates/nvisy-pipeline/src/actions/detect_tabular.rs new file mode 100644 index 0000000..c6f1e91 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/detect_tabular.rs @@ -0,0 +1,132 @@ +//! Column-based rule matching for tabular data. + +use regex::Regex; +use serde::Deserialize; + +use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; +use nvisy_core::error::{Error, ErrorKind}; + +use crate::action::Action; + +/// A rule that matches column headers to classify entire columns. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ColumnRule { + /// Regex pattern to match against column names. + pub column_name_pattern: String, + /// Entity category for matches in the column. + pub category: EntityCategory, + /// Entity type label for matches. + pub entity_type: String, +} + +/// Typed parameters for [`DetectTabularAction`]. 
+#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectTabularParams { + /// Column-matching rules. + pub column_rules: Vec<ColumnRule>, +} + +/// Matches column headers against rules and marks every non-empty cell +/// in matched columns as an entity. +pub struct DetectTabularAction { + params: DetectTabularParams, + compiled_rules: Vec<(Regex, ColumnRule)>, +} + +#[async_trait::async_trait] +impl Action for DetectTabularAction { + type Params = DetectTabularParams; + type Input = Vec<Document<FormatHandler>>; + type Output = Vec<Entity>; + + fn id(&self) -> &str { + "detect-tabular" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + let compiled_rules = params + .column_rules + .iter() + .map(|r| { + let re = Regex::new(&r.column_name_pattern).map_err(|e| { + Error::new( + ErrorKind::Validation, + format!( + "invalid column_name_pattern '{}': {e}", + r.column_name_pattern + ), + ) + })?; + Ok((re, r.clone())) + }) + .collect::<Result<Vec<_>, Error>>()?; + Ok(Self { + params, + compiled_rules, + }) + } + + async fn execute( + &self, + documents: Self::Input, + ) -> Result<Vec<Entity>, Error> { + let mut entities = Vec::new(); + + for doc in &documents { + let columns = match &doc.columns { + Some(c) => c, + None => continue, + }; + let rows = match &doc.rows { + Some(r) => r, + None => continue, + }; + + for (col_idx, col_name) in columns.iter().enumerate() { + for (regex, rule) in &self.compiled_rules { + if !regex.is_match(col_name) { + continue; + } + + for (row_idx, row) in rows.iter().enumerate() { + if let Some(cell) = row.get(col_idx) { + if cell.is_empty() { + continue; + } + + let entity = Entity::new( + rule.category, + &rule.entity_type, + cell.as_str(), + DetectionMethod::Composite, + 0.9, + EntityLocation { + start_offset: 0, + end_offset: cell.len(), + element_id: None, + page_number: None, + bounding_box: None, + row_index: Some(row_idx), + column_index: Some(col_idx), + image_id: None, + }, + ) + .with_parent(&doc.source); + + entities.push(entity); + } + } + + // Only apply first matching rule per column + break; + } + } + } + + Ok(entities) + } +} diff --git a/crates/nvisy-pipeline/src/actions/emit_audit.rs b/crates/nvisy-pipeline/src/actions/emit_audit.rs new file mode 100644 index 0000000..924ad72 --- /dev/null +++ b/crates/nvisy-pipeline/src/actions/emit_audit.rs @@ -0,0 +1,91 @@ +//! Audit trail emission action. + +use serde::Deserialize; +use uuid::Uuid; + +use nvisy_ontology::ontology::audit::{Audit, AuditAction}; +use nvisy_ontology::ontology::redaction::Redaction; +use nvisy_core::error::Error; + +use crate::action::Action; + +/// Typed parameters for [`EmitAuditAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct EmitAuditParams { + /// Pipeline run identifier to attach. + #[serde(default)] + pub run_id: Option<Uuid>, + /// Human or service identity to record. + #[serde(default)] + pub actor: Option<String>, +} + +/// Emits an [`Audit`] record for every [`Redaction`] provided. +/// +/// Each audit entry captures the redaction method, replacement value, and +/// (when available) the originating policy rule ID. 
+pub struct EmitAuditAction { + params: EmitAuditParams, +} + +#[async_trait::async_trait] +impl Action for EmitAuditAction { + type Params = EmitAuditParams; + type Input = Vec<Redaction>; + type Output = Vec<Audit>; + + fn id(&self) -> &str { + "emit-audit" + } + + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) + } + + async fn execute( + &self, + redactions: Self::Input, + ) -> Result<Vec<Audit>, Error> { + let run_id = self.params.run_id; + let actor = &self.params.actor; + + let mut audits = Vec::new(); + + for redaction in &redactions { + let mut audit = Audit::new(AuditAction::Redaction) + .with_entity_id(redaction.entity_id) + .with_redaction_id(redaction.source.as_uuid()); + + if let Some(run_id) = run_id { + audit = audit.with_run_id(run_id); + } + if let Some(actor) = actor { + audit = audit.with_actor(actor); + } + + let mut details = serde_json::Map::new(); + details.insert( + "method".to_string(), + serde_json::to_value(redaction.method).unwrap_or_default(), + ); + details.insert( + "replacementValue".to_string(), + serde_json::Value::String(redaction.replacement_value.clone()), + ); + if let Some(ref rule_id) = redaction.policy_rule_id { + details.insert( + "policyRuleId".to_string(), + serde_json::Value::String(rule_id.clone()), + ); + } + audit = audit.with_details(details); + + audit.source.set_parent_id(Some(redaction.source.as_uuid())); + + audits.push(audit); + } + + Ok(audits) + } +} diff --git a/crates/nvisy-detect/src/actions/evaluate_policy.rs b/crates/nvisy-pipeline/src/actions/evaluate_policy.rs similarity index 54% rename from crates/nvisy-detect/src/actions/evaluate_policy.rs rename to crates/nvisy-pipeline/src/actions/evaluate_policy.rs index 0ad81d1..bbba900 100644 --- a/crates/nvisy-detect/src/actions/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/actions/evaluate_policy.rs @@ -1,14 +1,13 @@ //! Policy evaluation action that maps detected entities to redaction instructions. use serde::Deserialize; -use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; use nvisy_ontology::ontology::entity::Entity; use nvisy_ontology::redaction::policy::PolicyRule; use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; +use nvisy_core::error::Error; + +use crate::action::Action; /// Typed parameters for [`EvaluatePolicyAction`]. #[derive(Debug, Deserialize)] @@ -25,86 +24,77 @@ pub struct EvaluatePolicyParams { pub default_confidence_threshold: f64, } -fn default_method() -> RedactionMethod { RedactionMethod::Mask } -fn default_threshold() -> f64 { 0.5 } +fn default_method() -> RedactionMethod { + RedactionMethod::Mask +} +fn default_threshold() -> f64 { + 0.5 +} -/// Evaluates policy rules against detected entities and emits [`Redaction`] artifacts. +/// Evaluates policy rules against detected entities and produces [`Redaction`] instructions. /// /// For each entity the action finds the first matching rule (sorted by priority), -/// applies its redaction method and replacement template, and writes a -/// `"redactions"` artifact to the blob. Entities that fall below the confidence -/// threshold are skipped. -pub struct EvaluatePolicyAction; +/// applies its redaction method and replacement template, and creates a +/// [`Redaction`]. Entities that fall below the confidence threshold are skipped. 
+pub struct EvaluatePolicyAction { + params: EvaluatePolicyParams, +} #[async_trait::async_trait] impl Action for EvaluatePolicyAction { type Params = EvaluatePolicyParams; + type Input = Vec<Entity>; + type Output = Vec<Redaction>; fn id(&self) -> &str { "evaluate-policy" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) } async fn execute( &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let default_method = params.default_method; - let default_threshold = params.default_confidence_threshold; - - let mut sorted_rules = params.rules; + entities: Self::Input, + ) -> Result<Vec<Redaction>, Error> { + let default_method = self.params.default_method; + let default_threshold = self.params.default_confidence_threshold; + + let mut sorted_rules = self.params.rules.clone(); sorted_rules.sort_by_key(|r| r.priority); - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let entities: Vec<Entity> = blob.get_artifacts("entities").map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to read entities artifact: {e}")) - })?; - - for entity in &entities { - let rule = find_matching_rule(entity, &sorted_rules); - let method = rule.map(|r| r.method).unwrap_or(default_method); - let threshold = rule - .map(|r| r.confidence_threshold) - .unwrap_or(default_threshold); - - if entity.confidence < threshold { - continue; - } - - let replacement_value = if let Some(r) = rule { - apply_template(&r.replacement_template, entity) - } else { - apply_default_mask(entity, default_method) - }; - - let mut redaction = - Redaction::new(entity.data.id, method, replacement_value); - redaction = redaction.with_original_value(&entity.value); - if let Some(r) = rule { - redaction = redaction.with_policy_rule_id(&r.id); - } - redaction.data.parent_id = Some(entity.data.id); - - blob.add_artifact("redactions", &redaction).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add redaction artifact: {e}")) - })?; - - count += 1; + let mut redactions = Vec::new(); + + for entity in &entities { + let rule = find_matching_rule(entity, &sorted_rules); + let method = rule.map(|r| r.method).unwrap_or(default_method); + let threshold = rule + .map(|r| r.confidence_threshold) + .unwrap_or(default_threshold); + + if entity.confidence < threshold { + continue; } - if output.send(blob).await.is_err() { - return Ok(count); + let replacement_value = if let Some(r) = rule { + apply_template(&r.replacement_template, entity) + } else { + apply_default_mask(entity, default_method) + }; + + let mut redaction = + Redaction::new(entity.source.as_uuid(), method, replacement_value); + redaction = redaction.with_original_value(&entity.value); + if let Some(r) = rule { + redaction = redaction.with_policy_rule_id(&r.id); } + redaction.source.set_parent_id(Some(entity.source.as_uuid())); + + redactions.push(redaction); } - Ok(count) + Ok(redactions) } } diff --git a/crates/nvisy-detect/src/actions/mod.rs b/crates/nvisy-pipeline/src/actions/mod.rs similarity index 67% rename from crates/nvisy-detect/src/actions/mod.rs rename to crates/nvisy-pipeline/src/actions/mod.rs index af6d5f2..444e46f 100644 --- a/crates/nvisy-detect/src/actions/mod.rs +++ b/crates/nvisy-pipeline/src/actions/mod.rs @@ -1,6 +1,6 @@ //! Pipeline actions for the detection and redaction workflow. //! -//! 
Each sub-module exposes a single [`Action`](nvisy_core::registry::action::Action) +//! Each sub-module exposes a single [`Action`](crate::action::Action) //! implementation that can be wired into an nvisy execution plan. /// Applies pending redactions to document content. @@ -21,3 +21,13 @@ pub mod detect_tabular; pub mod emit_audit; /// Evaluates policy rules against detected entities and produces redaction instructions. pub mod evaluate_policy; +/// Applies image redactions (blur, block) to image artifacts. +#[cfg(feature = "image-redaction")] +pub mod apply_image_redaction; +/// Applies redactions to tabular data cells. +pub mod apply_tabular_redaction; +/// Reassembles redacted content into PDF files. +#[cfg(feature = "pdf-redaction")] +pub mod apply_pdf_redaction; +/// Placeholder for audio redaction. +pub mod apply_audio_redaction; diff --git a/crates/nvisy-pipeline/src/lib.rs b/crates/nvisy-pipeline/src/lib.rs new file mode 100644 index 0000000..efe66b8 --- /dev/null +++ b/crates/nvisy-pipeline/src/lib.rs @@ -0,0 +1,22 @@ +//! Pipeline action/provider traits with detection and redaction actions. +//! +//! This crate consolidates the processing pipeline: the [`Action`] and +//! [`Provider`] traits, all detection actions (regex, dictionary, checksum, +//! tabular, manual), policy evaluation, text/image/tabular/PDF/audio +//! redaction, and audit-trail emission. + +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] + +/// The `Action` trait — the fundamental processing unit in a pipeline. +pub mod action; +/// The `Provider` trait — factory for authenticated client connections. +pub mod provider; +/// Pipeline actions for detection, redaction, policy, and audit. +pub mod actions; +/// Image rendering primitives for redaction overlays. +#[cfg(feature = "image-redaction")] +pub mod render; + +#[doc(hidden)] +pub mod prelude; diff --git a/crates/nvisy-pipeline/src/prelude.rs b/crates/nvisy-pipeline/src/prelude.rs new file mode 100644 index 0000000..e3d7556 --- /dev/null +++ b/crates/nvisy-pipeline/src/prelude.rs @@ -0,0 +1,21 @@ +//! Convenience re-exports for common nvisy-pipeline types. 
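// Rough shape of the Action trait these re-exports target, inferred from the
// impls elsewhere in this patch; the real definition lives in src/action.rs,
// which is not shown here, so the trait bounds below are assumptions.
//
//   #[async_trait::async_trait]
//   pub trait Action: Sized + Send + Sync + 'static {
//       type Params: serde::de::DeserializeOwned + Send;
//       type Input: Send;
//       type Output: Send;
//
//       fn id(&self) -> &str;
//       async fn connect(params: Self::Params) -> Result<Self, Error>;
//       async fn execute(&self, input: Self::Input) -> Result<Self::Output, Error>;
//   }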
+ +pub use crate::action::Action; +pub use crate::provider::{ConnectedInstance, Provider}; + +pub use crate::actions::detect_regex::{DetectRegexAction, DetectRegexParams}; +pub use crate::actions::detect_dictionary::{DetectDictionaryAction, DetectDictionaryParams, DictionaryDef}; +pub use crate::actions::detect_tabular::{DetectTabularAction, DetectTabularParams, ColumnRule}; +pub use crate::actions::detect_manual::{DetectManualAction, DetectManualParams}; +pub use crate::actions::detect_checksum::{DetectChecksumAction, DetectChecksumParams}; +pub use crate::actions::classify::{ClassifyAction, ClassificationResult}; +pub use crate::actions::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; +pub use crate::actions::apply_redaction::ApplyRedactionAction; +pub use crate::actions::emit_audit::{EmitAuditAction, EmitAuditParams}; +pub use crate::actions::apply_tabular_redaction::{ApplyTabularRedactionAction, ApplyTabularRedactionParams}; +pub use crate::actions::apply_audio_redaction::{ApplyAudioRedactionAction, ApplyAudioRedactionParams}; + +#[cfg(feature = "image-redaction")] +pub use crate::actions::apply_image_redaction::{ApplyImageRedactionAction, ApplyImageRedactionParams}; +#[cfg(feature = "pdf-redaction")] +pub use crate::actions::apply_pdf_redaction::{ApplyPdfRedactionAction, ApplyPdfRedactionParams}; diff --git a/crates/nvisy-core/src/registry/provider.rs b/crates/nvisy-pipeline/src/provider.rs similarity index 82% rename from crates/nvisy-core/src/registry/provider.rs rename to crates/nvisy-pipeline/src/provider.rs index 132b58c..b14ee26 100644 --- a/crates/nvisy-core/src/registry/provider.rs +++ b/crates/nvisy-pipeline/src/provider.rs @@ -1,11 +1,11 @@ -//! Provider factory trait for creating authenticated client connections. +//! Provider trait for creating authenticated client connections. use std::future::Future; use std::pin::Pin; use serde::de::DeserializeOwned; -use crate::error::Error; +use nvisy_core::error::Error; /// A connected provider instance holding a typed client and an /// optional async disconnect callback. @@ -21,7 +21,7 @@ pub struct ConnectedInstance<C> { /// Implementations handle credential validation, connectivity verification, /// and client construction for a specific provider (e.g. S3, OpenAI). #[async_trait::async_trait] -pub trait ProviderFactory: Send + Sync + 'static { +pub trait Provider: Send + Sync + 'static { /// Strongly-typed credentials for this provider. type Credentials: DeserializeOwned + Send; /// The client type produced by [`connect`](Self::connect). @@ -37,5 +37,8 @@ pub trait ProviderFactory: Send + Sync + 'static { async fn verify(&self, creds: &Self::Credentials) -> Result<(), Error>; /// Create a connected instance. 
- async fn connect(&self, creds: &Self::Credentials) -> Result<ConnectedInstance<Self::Client>, Error>; + async fn connect( + &self, + creds: &Self::Credentials, + ) -> Result<ConnectedInstance<Self::Client>, Error>; } diff --git a/crates/nvisy-media/src/render/block.rs b/crates/nvisy-pipeline/src/render/block.rs similarity index 100% rename from crates/nvisy-media/src/render/block.rs rename to crates/nvisy-pipeline/src/render/block.rs diff --git a/crates/nvisy-media/src/render/blur.rs b/crates/nvisy-pipeline/src/render/blur.rs similarity index 100% rename from crates/nvisy-media/src/render/blur.rs rename to crates/nvisy-pipeline/src/render/blur.rs diff --git a/crates/nvisy-media/src/render/mod.rs b/crates/nvisy-pipeline/src/render/mod.rs similarity index 100% rename from crates/nvisy-media/src/render/mod.rs rename to crates/nvisy-pipeline/src/render/mod.rs diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index f93488a..7e999e1 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -25,6 +25,8 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true } +nvisy-pipeline = { workspace = true } +nvisy-ingest = { workspace = true } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index aca3c12..089c47e 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -9,13 +9,13 @@ pub mod ocr; use serde::Deserialize; -use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::Document; -use nvisy_core::datatypes::document::ImageData; +use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::Entity; use nvisy_core::error::Error; -use nvisy_core::registry::action::Action; +use nvisy_core::io::ContentData; +use nvisy_pipeline::action::Action; use crate::bridge::PythonBridge; use crate::ner::{self, NerConfig}; @@ -49,133 +49,133 @@ fn default_provider() -> String { "openai".to_string() } /// Pipeline action that detects named entities in text documents. /// -/// If the incoming [`Blob`] carries `"documents"` artifacts, each document's -/// text is sent through the NER model. Otherwise the raw blob content is -/// interpreted as UTF-8 text. Detected entities are stored as `"entities"` -/// artifacts on the blob. +/// Each document's text is sent through the NER model. If no documents are +/// provided, the raw content is interpreted as UTF-8 text. Detected entities +/// are returned directly. pub struct DetectNerAction { /// Python bridge used to call the NER model. pub bridge: PythonBridge, + params: DetectNerParams, +} + +impl DetectNerAction { + /// Replace the default bridge with a pre-configured one. 
+ pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { + self.bridge = bridge; + self + } } #[async_trait::async_trait] impl Action for DetectNerAction { type Params = DetectNerParams; + type Input = (ContentData, Vec<Document<FormatHandler>>); + type Output = Vec<Entity>; fn id(&self) -> &str { "detect-ner" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { bridge: PythonBridge::default(), params }) } async fn execute( &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let config = ner_config_from_params(¶ms); - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let documents: Vec<Document> = blob.get_artifacts("documents") - .map_err(|e| Error::runtime(format!("Failed to get document artifacts: {}", e), "python/ner", false))?; - - let docs = if documents.is_empty() { - let text = String::from_utf8(blob.content.to_vec()) - .map_err(|e| Error::runtime(format!("Blob content is not valid UTF-8: {}", e), "python/ner", false))?; - vec![Document::new(text)] - } else { - documents - }; - - for doc in &docs { - let entities = ner::detect_ner(&self.bridge, &doc.content, &config).await?; - for entity in &entities { - blob.add_artifact("entities", entity) - .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner", false))?; - count += 1; - } - } - - if output.send(blob).await.is_err() { - return Ok(count); + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (content, documents) = input; + let config = ner_config_from_params(&self.params); + + let docs = if documents.is_empty() { + let text = content.as_str() + .map_err(|e| Error::runtime( + format!("Content is not valid UTF-8: {}", e), + "python/ner", + false, + ))?; + vec![Document::new(FormatHandler::Plaintext(PlaintextHandler)).with_text(text)] + } else { + documents + }; + + let mut all_entities = Vec::new(); + for doc in &docs { + if let Some(ref content) = doc.content { + let entities = ner::detect_ner(&self.bridge, content, &config).await?; + all_entities.extend(entities); } } - Ok(count) + Ok(all_entities) } } /// Pipeline action that detects named entities in images. /// -/// If the incoming [`Blob`] carries `"images"` artifacts, each image is -/// processed individually. Otherwise the raw blob content is treated as a -/// single image whose MIME type is inferred from the blob metadata. -/// Detected entities are stored as `"entities"` artifacts on the blob. +/// Each image is processed individually through NER. If no images are +/// provided, the raw content is treated as a single image whose MIME type +/// is inferred from the content metadata. Detected entities are returned +/// directly. pub struct DetectNerImageAction { /// Python bridge used to call the NER model. pub bridge: PythonBridge, + params: DetectNerParams, +} + +impl DetectNerImageAction { + /// Replace the default bridge with a pre-configured one. 
+ pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { + self.bridge = bridge; + self + } } #[async_trait::async_trait] impl Action for DetectNerImageAction { type Params = DetectNerParams; + type Input = (ContentData, Vec<Document<FormatHandler>>); + type Output = Vec<Entity>; fn id(&self) -> &str { "detect-ner-image" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { bridge: PythonBridge::default(), params }) } async fn execute( &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { - let config = ner_config_from_params(¶ms); - let mut count = 0u64; - - while let Some(mut blob) = input.recv().await { - let images: Vec<ImageData> = blob.get_artifacts("images") - .map_err(|e| Error::runtime(format!("Failed to get image artifacts: {}", e), "python/ner-image", false))?; - - if images.is_empty() { - let mime_type = blob.content_type().unwrap_or("application/octet-stream").to_string(); - let entities = ner::detect_ner_image( - &self.bridge, - &blob.content, - &mime_type, - &config, - ).await?; - for entity in &entities { - blob.add_artifact("entities", entity) - .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner-image", false))?; - count += 1; - } - } else { - for img in &images { + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (content, images) = input; + let config = ner_config_from_params(&self.params); + + let mut all_entities = Vec::new(); + + if images.is_empty() { + let mime_type = content.content_type() + .unwrap_or("application/octet-stream") + .to_string(); + let entities = ner::detect_ner_image( + &self.bridge, + content.as_bytes(), + &mime_type, + &config, + ).await?; + all_entities.extend(entities); + } else { + for doc in &images { + if let (Some(data), Some(mime)) = (&doc.data, &doc.mime_type) { let entities = ner::detect_ner_image( &self.bridge, - &img.image_data, - &img.mime_type, + data, + mime, &config, ).await?; - for entity in &entities { - blob.add_artifact("entities", entity) - .map_err(|e| Error::runtime(format!("Failed to add entity artifact: {}", e), "python/ner-image", false))?; - count += 1; - } + all_entities.extend(entities); } } - - if output.send(blob).await.is_err() { - return Ok(count); - } } - Ok(count) + Ok(all_entities) } } diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs index 7670a0f..763d700 100644 --- a/crates/nvisy-python/src/actions/ocr.rs +++ b/crates/nvisy-python/src/actions/ocr.rs @@ -1,12 +1,13 @@ //! OCR detection pipeline action. 
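// Hedged usage sketch, outside the diff itself: how the refactored OCR action
// is expected to be driven after this change. `params`, `bridge`, `content`,
// and `images` are caller-supplied placeholders, not values defined in this
// patch.
//
//   let action = OcrDetectAction::connect(params).await?.with_bridge(bridge);
//   let (entities, documents) = action.execute((content, images)).await?;
//
// The returned documents carry the concatenated OCR text so downstream
// regex/dictionary/NER actions can process it.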
use serde::Deserialize; -use tokio::sync::mpsc; -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::datatypes::document::{Document, ImageData}; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::registry::action::Action; +use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::document::Document; +use nvisy_ontology::ontology::entity::Entity; +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; +use nvisy_pipeline::action::Action; use crate::bridge::PythonBridge; use crate::ocr::{self, OcrConfig}; @@ -36,101 +37,83 @@ fn default_confidence() -> f64 { } /// Pipeline action that performs OCR on images and produces entities -/// with bounding boxes, plus a `Document` artifact from concatenated +/// with bounding boxes, plus `Document` artifacts from concatenated /// OCR text so downstream regex/dictionary/NER can process it. pub struct OcrDetectAction { /// Python bridge used to call the OCR backend. pub bridge: PythonBridge, + params: OcrDetectParams, +} + +impl OcrDetectAction { + /// Replace the default bridge with a pre-configured one. + pub fn with_bridge(mut self, bridge: PythonBridge) -> Self { + self.bridge = bridge; + self + } } #[async_trait::async_trait] impl Action for OcrDetectAction { type Params = OcrDetectParams; + type Input = (ContentData, Vec<Document<FormatHandler>>); + type Output = (Vec<Entity>, Vec<Document<FormatHandler>>); fn id(&self) -> &str { "detect-ocr" } - fn validate_params(&self, _params: &Self::Params) -> Result<(), Error> { - Ok(()) + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { bridge: PythonBridge::default(), params }) } async fn execute( &self, - mut input: mpsc::Receiver<Blob>, - output: mpsc::Sender<Blob>, - params: Self::Params, - ) -> Result<u64, Error> { + input: Self::Input, + ) -> Result<Self::Output, Error> { + let (content, images) = input; let config = OcrConfig { - language: params.language, - engine: params.engine, - confidence_threshold: params.confidence_threshold, + language: self.params.language.clone(), + engine: self.params.engine.clone(), + confidence_threshold: self.params.confidence_threshold, }; - let mut count = 0u64; - while let Some(mut blob) = input.recv().await { - let images: Vec<ImageData> = blob.get_artifacts("images").map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to read images artifact: {e}"), - ) - })?; + let mut all_entities = Vec::new(); + let mut all_ocr_text = Vec::new(); - let mut all_ocr_text = Vec::new(); - - if images.is_empty() { - // Treat blob content as a single image - let mime_type = blob - .content_type() - .unwrap_or("application/octet-stream") - .to_string(); - let entities = - ocr::detect_ocr(&self.bridge, &blob.content, &mime_type, &config).await?; - for entity in &entities { - all_ocr_text.push(entity.value.clone()); - blob.add_artifact("entities", entity).map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to add entity: {e}"), - ) - })?; - count += 1; - } - } else { - for img in &images { + if images.is_empty() { + // Treat content as a single image + let mime_type = content + .content_type() + .unwrap_or("application/octet-stream") + .to_string(); + let entities = + ocr::detect_ocr(&self.bridge, content.as_bytes(), &mime_type, &config).await?; + for entity in &entities { + all_ocr_text.push(entity.value.clone()); + } + all_entities.extend(entities); + } else { + for doc in &images { + if let (Some(data), Some(mime)) = (&doc.data, &doc.mime_type) { let entities = - 
ocr::detect_ocr(&self.bridge, &img.image_data, &img.mime_type, &config) + ocr::detect_ocr(&self.bridge, data, mime, &config) .await?; for entity in &entities { all_ocr_text.push(entity.value.clone()); - blob.add_artifact("entities", entity).map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to add entity: {e}"), - ) - })?; - count += 1; } + all_entities.extend(entities); } } + } - // Create a Document from concatenated OCR text for downstream processing - if !all_ocr_text.is_empty() { - let ocr_doc = Document::new(all_ocr_text.join("\n")) - .with_source_format("ocr"); - blob.add_artifact("documents", &ocr_doc).map_err(|e| { - Error::new( - ErrorKind::Runtime, - format!("failed to add OCR document: {e}"), - ) - })?; - } - - if output.send(blob).await.is_err() { - return Ok(count); - } + // Create a Document from concatenated OCR text for downstream processing + let mut documents = Vec::new(); + if !all_ocr_text.is_empty() { + let ocr_doc = Document::new(FormatHandler::Plaintext(PlaintextHandler)).with_text(all_ocr_text.join("\n")); + documents.push(ocr_doc); } - Ok(count) + Ok((all_entities, documents)) } } diff --git a/crates/nvisy-python/src/lib.rs b/crates/nvisy-python/src/lib.rs index 97cc933..00f0411 100644 --- a/crates/nvisy-python/src/lib.rs +++ b/crates/nvisy-python/src/lib.rs @@ -2,8 +2,8 @@ //! //! This crate embeds a CPython interpreter via PyO3 and delegates named-entity //! recognition (NER) to a Python module (`nvisy_ai`). It exposes pipeline -//! [`Action`](nvisy_core::registry::action::Action) implementations as well as a -//! [`ProviderFactory`](nvisy_core::registry::provider::ProviderFactory) for the +//! [`Action`](nvisy_pipeline::action::Action) implementations as well as a +//! [`Provider`](nvisy_pipeline::provider::Provider) for the //! `"ai"` provider. #![deny(unsafe_code)] diff --git a/crates/nvisy-python/src/prelude.rs b/crates/nvisy-python/src/prelude.rs index f88b3dd..325beb1 100644 --- a/crates/nvisy-python/src/prelude.rs +++ b/crates/nvisy-python/src/prelude.rs @@ -2,4 +2,4 @@ pub use crate::actions::{DetectNerAction, DetectNerImageAction}; pub use crate::actions::ocr::OcrDetectAction; pub use crate::bridge::PythonBridge; -pub use crate::provider::AiProviderFactory; +pub use crate::provider::AiProvider; diff --git a/crates/nvisy-python/src/provider/mod.rs b/crates/nvisy-python/src/provider/mod.rs index fdacca0..efae0f7 100644 --- a/crates/nvisy-python/src/provider/mod.rs +++ b/crates/nvisy-python/src/provider/mod.rs @@ -6,7 +6,7 @@ use serde::Deserialize; use nvisy_core::error::Error; -use nvisy_core::registry::provider::{ConnectedInstance, ProviderFactory}; +use nvisy_pipeline::provider::{ConnectedInstance, Provider}; use crate::bridge::PythonBridge; /// Typed credentials for the AI provider. @@ -21,10 +21,10 @@ pub struct AiCredentials { /// /// The Python interpreter is **not** initialized at connection time; it is /// lazily loaded on the first NER call. 
-pub struct AiProviderFactory; +pub struct AiProvider; #[async_trait::async_trait] -impl ProviderFactory for AiProviderFactory { +impl Provider for AiProvider { type Credentials = AiCredentials; type Client = PythonBridge; diff --git a/crates/nvisy-server/Cargo.toml b/crates/nvisy-server/Cargo.toml deleted file mode 100644 index c095a2b..0000000 --- a/crates/nvisy-server/Cargo.toml +++ /dev/null @@ -1,73 +0,0 @@ -# https://doc.rust-lang.org/cargo/reference/manifest.html - -[package] -name = "nvisy-server" -description = "Axum HTTP server for the Nvisy data protection platform" -keywords = ["nvisy", "server", "http", "axum"] -categories = ["web-programming::http-server"] - -version = { workspace = true } -rust-version = { workspace = true } -edition = { workspace = true } -license = { workspace = true } -publish = { workspace = true } - -authors = { workspace = true } -repository = { workspace = true } -homepage = { workspace = true } -documentation = { workspace = true } - -[[bin]] -name = "nvisy-server" -path = "src/main.rs" - -[package.metadata.docs.rs] -all-features = true -rustdoc-args = ["--cfg", "docsrs"] - -[dependencies] -# Internal crates -nvisy-core = { workspace = true, features = ["schema"] } -nvisy-detect = { workspace = true } -nvisy-engine = { workspace = true, features = ["schema"] } -nvisy-ingest = { workspace = true } -nvisy-media = { workspace = true } -nvisy-ontology = { workspace = true } -nvisy-python = { workspace = true } - -# JSON Schema generation -schemars = { workspace = true } - -# (De)serialization -serde = { workspace = true, features = ["derive"] } -serde_json = { workspace = true, features = [] } - -# Async runtime -tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } - -# HTTP server -axum = { workspace = true, features = ["http2", "macros", "multipart"] } -tower = { workspace = true, features = ["full"] } -tower-http = { workspace = true, features = ["cors", "trace", "request-id", "limit"] } - -# OpenAPI / Documentation -utoipa = { workspace = true, features = ["axum_extras"] } -utoipa-scalar = { workspace = true, features = ["axum"] } - -# Primitive datatypes -uuid = { workspace = true, features = ["v4"] } -chrono = { workspace = true, features = [] } - -# Observability -tracing = { workspace = true, features = [] } -tracing-subscriber = { workspace = true, features = ["fmt", "ansi", "json", "env-filter"] } - -# Encoding -base64 = { workspace = true } - -# Binary data -bytes = { workspace = true } - -# Error handling -thiserror = { workspace = true, features = [] } -anyhow = { workspace = true, features = ["backtrace"] } diff --git a/crates/nvisy-server/README.md b/crates/nvisy-server/README.md deleted file mode 100644 index a774adb..0000000 --- a/crates/nvisy-server/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# nvisy-server - -Axum-based HTTP server for the Nvisy runtime. Exposes REST endpoints for graph execution, data redaction, policy management, and audit log queries, with dependency-injected service layer. diff --git a/crates/nvisy-server/src/app/mod.rs b/crates/nvisy-server/src/app/mod.rs deleted file mode 100644 index 45392f7..0000000 --- a/crates/nvisy-server/src/app/mod.rs +++ /dev/null @@ -1,46 +0,0 @@ -//! HTTP application bootstrap and route composition. -//! -//! The [`build_app`] function wires together all Axum routers, middleware -//! (CORS, tracing), and shared application state into a single [`Router`]. 
- -use axum::Router; -use std::sync::Arc; -use tower_http::cors::{Any, CorsLayer}; -use tower_http::trace::TraceLayer; -use utoipa::OpenApi; -use utoipa_scalar::{Scalar, Servable}; - -use crate::handler; -use crate::service::{AuditStore, AppState, PolicyStore, ServerConfig}; -use nvisy_engine::runs::RunManager; - -/// Build a fully configured Axum [`Router`] with all handlers and middleware. -/// -/// This constructs the shared [`AppState`], applies CORS and HTTP tracing -/// layers, and merges the health, graphs, redact, policies, audit, and -/// Scalar API-docs routes. -pub async fn build_app(_config: &ServerConfig) -> anyhow::Result<Router> { - let state = AppState { - run_manager: Arc::new(RunManager::new()), - policy_store: Arc::new(PolicyStore::new()), - audit_store: Arc::new(AuditStore::new()), - }; - - let cors = CorsLayer::new() - .allow_origin(Any) - .allow_methods(Any) - .allow_headers(Any); - - let app = Router::new() - .merge(handler::health::router()) - .merge(handler::graphs::router()) - .merge(handler::redact::router()) - .merge(handler::policies::router()) - .merge(handler::audit::router()) - .merge(Scalar::with_url("/scalar", handler::ApiDoc::openapi())) - .layer(TraceLayer::new_for_http()) - .layer(cors) - .with_state(state); - - Ok(app) -} diff --git a/crates/nvisy-server/src/handler/audit.rs b/crates/nvisy-server/src/handler/audit.rs deleted file mode 100644 index eb85032..0000000 --- a/crates/nvisy-server/src/handler/audit.rs +++ /dev/null @@ -1,69 +0,0 @@ -use axum::{ - Router, - extract::{Path, Query, State}, - routing::get, - Json, -}; -use std::sync::Arc; -use uuid::Uuid; -use crate::service::AuditStore; -use crate::service::AppState; - -#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::IntoParams)] -struct AuditQuery { - #[serde(rename = "runId")] - run_id: Option<String>, - action: Option<String>, - #[serde(rename = "sourceId")] - source_id: Option<String>, - limit: Option<usize>, - offset: Option<usize>, -} - -/// List audit records with optional filters. -#[utoipa::path( - get, - path = "/api/v1/audit", - params(AuditQuery), - responses( - (status = 200, description = "List of audit records") - ) -)] -async fn list_audit( - State(audit_store): State<Arc<AuditStore>>, - Query(query): Query<AuditQuery>, -) -> Json<serde_json::Value> { - let records = audit_store.query( - query.run_id.as_deref(), - query.action.as_deref(), - query.source_id.as_deref(), - query.limit.unwrap_or(100), - query.offset.unwrap_or(0), - ); - Json(serde_json::to_value(&records).unwrap_or_default()) -} - -/// Get audit records for a specific run. 
-#[utoipa::path( - get, - path = "/api/v1/audit/{run_id}", - params( - ("run_id" = Uuid, Path, description = "Run ID") - ), - responses( - (status = 200, description = "Audit records for the run") - ) -)] -async fn get_audit_by_run( - State(audit_store): State<Arc<AuditStore>>, - Path(run_id): Path<Uuid>, -) -> Json<serde_json::Value> { - let records = audit_store.get_by_run_id(run_id); - Json(serde_json::to_value(&records).unwrap_or_default()) -} - -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/audit", get(list_audit)) - .route("/api/v1/audit/{run_id}", get(get_audit_by_run)) -} diff --git a/crates/nvisy-server/src/handler/graphs.rs b/crates/nvisy-server/src/handler/graphs.rs deleted file mode 100644 index 482f35f..0000000 --- a/crates/nvisy-server/src/handler/graphs.rs +++ /dev/null @@ -1,121 +0,0 @@ -use axum::{ - Router, - extract::{Path, State}, - routing::{delete, get, post}, - Json, -}; -use std::sync::Arc; -use uuid::Uuid; -use nvisy_engine::runs::RunManager; -use crate::service::AppState; - -/// Submit a graph for execution. -#[utoipa::path( - post, - path = "/api/v1/graphs/execute", - request_body = serde_json::Value, - responses( - (status = 202, description = "Graph execution accepted") - ) -)] -async fn execute_graph( - State(run_manager): State<Arc<RunManager>>, - Json(_body): Json<serde_json::Value>, -) -> (axum::http::StatusCode, Json<serde_json::Value>) { - let (run_id, _cancel_token) = run_manager.create_run().await; - run_manager.set_running(run_id).await; - - // TODO: spawn actual graph execution - ( - axum::http::StatusCode::ACCEPTED, - Json(serde_json::json!({ - "runId": run_id.to_string(), - "status": "accepted" - })), - ) -} - -/// Validate a graph definition without executing. -#[utoipa::path( - post, - path = "/api/v1/graphs/validate", - request_body = serde_json::Value, - responses( - (status = 200, description = "Validation result") - ) -)] -async fn validate_graph( - Json(_body): Json<serde_json::Value>, -) -> Json<serde_json::Value> { - // TODO: validate graph against registry - Json(serde_json::json!({ "valid": true, "errors": [] })) -} - -/// List all runs. -#[utoipa::path( - get, - path = "/api/v1/graphs", - responses( - (status = 200, description = "List of runs") - ) -)] -async fn list_runs( - State(run_manager): State<Arc<RunManager>>, -) -> Json<serde_json::Value> { - let runs = run_manager.list(None).await; - Json(serde_json::to_value(&runs).unwrap_or_default()) -} - -/// Get status of a single run. -#[utoipa::path( - get, - path = "/api/v1/graphs/{run_id}", - params( - ("run_id" = Uuid, Path, description = "Run ID") - ), - responses( - (status = 200, description = "Run details"), - (status = 404, description = "Run not found") - ) -)] -async fn get_run( - State(run_manager): State<Arc<RunManager>>, - Path(run_id): Path<Uuid>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match run_manager.get(run_id).await { - Some(run) => Ok(Json(serde_json::to_value(&run).unwrap_or_default())), - None => Err(axum::http::StatusCode::NOT_FOUND), - } -} - -/// Cancel a running execution. 
-#[utoipa::path( - delete, - path = "/api/v1/graphs/{run_id}", - params( - ("run_id" = Uuid, Path, description = "Run ID") - ), - responses( - (status = 200, description = "Run cancelled"), - (status = 404, description = "Run not found") - ) -)] -async fn cancel_run( - State(run_manager): State<Arc<RunManager>>, - Path(run_id): Path<Uuid>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - if run_manager.cancel(run_id).await { - Ok(Json(serde_json::json!({ "cancelled": true }))) - } else { - Err(axum::http::StatusCode::NOT_FOUND) - } -} - -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/graphs/execute", post(execute_graph)) - .route("/api/v1/graphs/validate", post(validate_graph)) - .route("/api/v1/graphs", get(list_runs)) - .route("/api/v1/graphs/{run_id}", get(get_run)) - .route("/api/v1/graphs/{run_id}", delete(cancel_run)) -} diff --git a/crates/nvisy-server/src/handler/health.rs b/crates/nvisy-server/src/handler/health.rs deleted file mode 100644 index 2dca70c..0000000 --- a/crates/nvisy-server/src/handler/health.rs +++ /dev/null @@ -1,32 +0,0 @@ -use axum::{Router, routing::get, Json}; -use crate::service::AppState; - -/// Health check response. -#[utoipa::path( - get, - path = "/health", - responses( - (status = 200, description = "Service is healthy") - ) -)] -async fn health() -> Json<serde_json::Value> { - Json(serde_json::json!({ "status": "ok" })) -} - -/// Readiness check response. -#[utoipa::path( - get, - path = "/ready", - responses( - (status = 200, description = "Service is ready") - ) -)] -async fn ready() -> Json<serde_json::Value> { - Json(serde_json::json!({ "status": "ready" })) -} - -pub fn router() -> Router<AppState> { - Router::new() - .route("/health", get(health)) - .route("/ready", get(ready)) -} diff --git a/crates/nvisy-server/src/handler/mod.rs b/crates/nvisy-server/src/handler/mod.rs deleted file mode 100644 index 1b68946..0000000 --- a/crates/nvisy-server/src/handler/mod.rs +++ /dev/null @@ -1,34 +0,0 @@ -pub mod audit; -pub mod graphs; -pub mod health; -pub mod policies; -pub mod redact; - -use utoipa::OpenApi; - -#[derive(OpenApi)] -#[openapi( - paths( - health::health, - health::ready, - graphs::execute_graph, - graphs::validate_graph, - graphs::list_runs, - graphs::get_run, - graphs::cancel_run, - redact::redact, - policies::create_policy, - policies::list_policies, - policies::get_policy, - policies::update_policy, - policies::delete_policy, - audit::list_audit, - audit::get_audit_by_run, - ), - components(schemas( - redact::RedactResponse, - policies::CreatePolicyRequest, - policies::UpdatePolicyRequest, - )) -)] -pub struct ApiDoc; diff --git a/crates/nvisy-server/src/handler/policies.rs b/crates/nvisy-server/src/handler/policies.rs deleted file mode 100644 index 9f0155c..0000000 --- a/crates/nvisy-server/src/handler/policies.rs +++ /dev/null @@ -1,156 +0,0 @@ -use axum::{ - Router, - extract::{Path, State}, - routing::{delete, get, post, put}, - Json, -}; -use std::sync::Arc; -use uuid::Uuid; -use crate::service::PolicyStore; -use crate::service::AppState; - -#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] -pub(crate) struct CreatePolicyRequest { - name: String, - #[serde(default)] - rules: Vec<serde_json::Value>, - #[serde(rename = "defaultMethod", default = "default_method")] - default_method: String, - #[serde(rename = "defaultConfidenceThreshold", default = "default_threshold")] - default_confidence_threshold: f64, -} - -fn default_method() -> String { "mask".to_string() } -fn 
default_threshold() -> f64 { 0.5 } - -#[derive(serde::Deserialize, schemars::JsonSchema, utoipa::ToSchema)] -pub(crate) struct UpdatePolicyRequest { - #[serde(default)] - name: Option<String>, - #[serde(default)] - rules: Option<Vec<serde_json::Value>>, - #[serde(rename = "defaultMethod")] - #[serde(default)] - default_method: Option<String>, - #[serde(rename = "defaultConfidenceThreshold")] - #[serde(default)] - default_confidence_threshold: Option<f64>, -} - -/// Create a new policy. -#[utoipa::path( - post, - path = "/api/v1/policies", - request_body = CreatePolicyRequest, - responses( - (status = 201, description = "Policy created") - ) -)] -async fn create_policy( - State(policy_store): State<Arc<PolicyStore>>, - Json(body): Json<CreatePolicyRequest>, -) -> (axum::http::StatusCode, Json<serde_json::Value>) { - let policy = policy_store.create( - body.name, - body.rules, - body.default_method, - body.default_confidence_threshold, - ); - ( - axum::http::StatusCode::CREATED, - Json(serde_json::to_value(&policy).unwrap_or_default()), - ) -} - -/// List all policies. -#[utoipa::path( - get, - path = "/api/v1/policies", - responses( - (status = 200, description = "List of policies") - ) -)] -async fn list_policies( - State(policy_store): State<Arc<PolicyStore>>, -) -> Json<serde_json::Value> { - let policies = policy_store.list(); - Json(serde_json::to_value(&policies).unwrap_or_default()) -} - -/// Get a policy by ID. -#[utoipa::path( - get, - path = "/api/v1/policies/{id}", - params( - ("id" = Uuid, Path, description = "Policy ID") - ), - responses( - (status = 200, description = "Policy details"), - (status = 404, description = "Policy not found") - ) -)] -async fn get_policy( - State(policy_store): State<Arc<PolicyStore>>, - Path(id): Path<Uuid>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match policy_store.get(id) { - Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), - None => Err(axum::http::StatusCode::NOT_FOUND), - } -} - -/// Update an existing policy. -#[utoipa::path( - put, - path = "/api/v1/policies/{id}", - params( - ("id" = Uuid, Path, description = "Policy ID") - ), - request_body = UpdatePolicyRequest, - responses( - (status = 200, description = "Policy updated"), - (status = 404, description = "Policy not found") - ) -)] -async fn update_policy( - State(policy_store): State<Arc<PolicyStore>>, - Path(id): Path<Uuid>, - Json(body): Json<UpdatePolicyRequest>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - match policy_store.update(id, body.name, body.rules, body.default_method, body.default_confidence_threshold) { - Some(policy) => Ok(Json(serde_json::to_value(&policy).unwrap_or_default())), - None => Err(axum::http::StatusCode::NOT_FOUND), - } -} - -/// Delete a policy. 
-#[utoipa::path( - delete, - path = "/api/v1/policies/{id}", - params( - ("id" = Uuid, Path, description = "Policy ID") - ), - responses( - (status = 200, description = "Policy deleted"), - (status = 404, description = "Policy not found") - ) -)] -async fn delete_policy( - State(policy_store): State<Arc<PolicyStore>>, - Path(id): Path<Uuid>, -) -> Result<Json<serde_json::Value>, axum::http::StatusCode> { - if policy_store.delete(id) { - Ok(Json(serde_json::json!({ "deleted": true }))) - } else { - Err(axum::http::StatusCode::NOT_FOUND) - } -} - -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/policies", post(create_policy)) - .route("/api/v1/policies", get(list_policies)) - .route("/api/v1/policies/{id}", get(get_policy)) - .route("/api/v1/policies/{id}", put(update_policy)) - .route("/api/v1/policies/{id}", delete(delete_policy)) -} diff --git a/crates/nvisy-server/src/handler/redact.rs b/crates/nvisy-server/src/handler/redact.rs deleted file mode 100644 index ef1aa78..0000000 --- a/crates/nvisy-server/src/handler/redact.rs +++ /dev/null @@ -1,225 +0,0 @@ -use axum::{ - Router, - extract::{Multipart, Query, State}, - routing::post, - Json, - http::{StatusCode, HeaderMap, header}, - response::IntoResponse, -}; -use bytes::Bytes; -use std::sync::Arc; -use nvisy_ontology::redaction::RedactionContext; -use nvisy_engine::runs::RunManager; -use nvisy_detect::actions::detect_dictionary::DictionaryDef; -use crate::service::AppState; -use crate::service::pipeline; - -/// Query parameters for the redact endpoint. -#[derive(Debug, serde::Deserialize)] -pub(crate) struct RedactQuery { - /// Response format: `"json"` (default) or `"binary"`. - #[serde(default)] - pub format: Option<String>, -} - -/// JSON response for the redact endpoint. -#[derive(Debug, serde::Serialize, schemars::JsonSchema, utoipa::ToSchema)] -pub(crate) struct RedactResponse { - /// Unique run identifier. - pub run_id: String, - /// Base64-encoded redacted file content. - pub file: String, - /// Output file name. - pub file_name: String, - /// Content type of the output. - pub content_type: String, - /// Pipeline execution summary. - pub summary: pipeline::PipelineSummary, - /// Audit trail entries. - pub audit_trail: Vec<serde_json::Value>, -} - -/// Submit a file for redaction via multipart upload. -/// -/// Parts: -/// - `file` (binary, required): The file to redact -/// - `context` (JSON, optional): RedactionContext with categories, rules, etc. 
-/// - `dictionaries` (JSON, optional): Array of DictionaryDef for dictionary matching -#[utoipa::path( - post, - path = "/api/v1/redact", - request_body(content_type = "multipart/form-data"), - params( - ("format" = Option<String>, Query, description = "Response format: json (default) or binary") - ), - responses( - (status = 200, description = "Redaction completed", body = RedactResponse), - (status = 400, description = "Bad request"), - (status = 500, description = "Internal server error") - ) -)] -async fn redact( - State(run_manager): State<Arc<RunManager>>, - Query(query): Query<RedactQuery>, - mut multipart: Multipart, -) -> Result<impl IntoResponse, (StatusCode, Json<serde_json::Value>)> { - let (run_id, _cancel_token) = run_manager.create_run().await; - run_manager.set_running(run_id).await; - - let mut file_bytes: Option<Bytes> = None; - let mut file_name = String::from("upload"); - let mut content_type = String::from("application/octet-stream"); - let mut context = RedactionContext::default(); - let mut dictionaries: Vec<DictionaryDef> = Vec::new(); - - // Parse multipart parts - while let Ok(Some(field)) = multipart.next_field().await { - let name = field.name().unwrap_or("").to_string(); - - match name.as_str() { - "file" => { - if let Some(fname) = field.file_name() { - file_name = fname.to_string(); - } - if let Some(ct) = field.content_type() { - content_type = ct.to_string(); - } - let data = field.bytes().await.map_err(|e| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": format!("Failed to read file: {e}") })), - ) - })?; - file_bytes = Some(data); - } - "context" => { - let data = field.bytes().await.map_err(|e| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": format!("Failed to read context: {e}") })), - ) - })?; - context = serde_json::from_slice(&data).map_err(|e| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": format!("Invalid context JSON: {e}") })), - ) - })?; - } - "dictionaries" => { - let data = field.bytes().await.map_err(|e| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": format!("Failed to read dictionaries: {e}") })), - ) - })?; - dictionaries = serde_json::from_slice(&data).map_err(|e| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": format!("Invalid dictionaries JSON: {e}") })), - ) - })?; - } - _ => { - // Skip unknown fields - } - } - } - - let file_bytes = file_bytes.ok_or_else(|| { - ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({ "error": "Missing 'file' part in multipart upload" })), - ) - })?; - - // Detect content type from file extension if not provided - if content_type == "application/octet-stream" { - if let Some(ext) = file_name.rsplit('.').next() { - content_type = match ext.to_lowercase().as_str() { - "pdf" => "application/pdf", - "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "html" | "htm" => "text/html", - "csv" => "text/csv", - "json" => "application/json", - "txt" => "text/plain", - "xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "xls" => "application/vnd.ms-excel", - "parquet" => "application/x-parquet", - "jpg" | "jpeg" => "image/jpeg", - "png" => "image/png", - "tiff" => "image/tiff", - "bmp" => "image/bmp", - "webp" => "image/webp", - "mp3" => "audio/mpeg", - "wav" => "audio/wav", - _ => "application/octet-stream", - } - .to_string(); - } - } - - // Execute the pipeline - let result = pipeline::execute_pipeline( - file_bytes, - &file_name, - 
&content_type, - &context, - &dictionaries, - ) - .await - .map_err(|e| { - ( - StatusCode::INTERNAL_SERVER_ERROR, - Json(serde_json::json!({ "error": format!("Pipeline failed: {e}") })), - ) - })?; - - // Return binary or JSON based on format query param - if query.format.as_deref() == Some("binary") { - let mut headers = HeaderMap::new(); - headers.insert( - header::CONTENT_TYPE, - result.content_type.parse().unwrap_or(header::HeaderValue::from_static("application/octet-stream")), - ); - headers.insert( - header::CONTENT_DISPOSITION, - format!("attachment; filename=\"{}\"", result.file_name) - .parse() - .unwrap_or(header::HeaderValue::from_static("attachment")), - ); - headers.insert( - "x-nvisy-run-id", - run_id.to_string().parse().unwrap(), - ); - headers.insert( - "x-nvisy-total-entities", - result.summary.total_entities.to_string().parse().unwrap(), - ); - headers.insert( - "x-nvisy-total-redactions", - result.summary.total_redactions.to_string().parse().unwrap(), - ); - - Ok((StatusCode::OK, headers, result.content).into_response()) - } else { - use base64::Engine; - let encoded = base64::engine::general_purpose::STANDARD.encode(&result.content); - - let response = RedactResponse { - run_id: run_id.to_string(), - file: encoded, - file_name: result.file_name, - content_type: result.content_type, - summary: result.summary, - audit_trail: result.audit_trail, - }; - - Ok((StatusCode::OK, Json(response)).into_response()) - } -} - -pub fn router() -> Router<AppState> { - Router::new() - .route("/api/v1/redact", post(redact)) -} diff --git a/crates/nvisy-server/src/main.rs b/crates/nvisy-server/src/main.rs deleted file mode 100644 index 273c3fe..0000000 --- a/crates/nvisy-server/src/main.rs +++ /dev/null @@ -1,30 +0,0 @@ -#![forbid(unsafe_code)] -#![cfg_attr(docsrs, feature(doc_cfg))] -#![doc = include_str!("../README.md")] - -mod app; -mod middleware; -mod handler; -mod service; - -use tracing_subscriber::EnvFilter; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - // Initialize tracing - tracing_subscriber::fmt() - .with_env_filter(EnvFilter::from_default_env().add_directive("nvisy=info".parse()?)) - .json() - .init(); - - let config = service::ServerConfig::from_env(); - tracing::info!(host = %config.host, port = config.port, "Starting nvisy-server"); - - let app = app::build_app(&config).await?; - - let listener = tokio::net::TcpListener::bind(format!("{}:{}", config.host, config.port)).await?; - tracing::info!("Listening on {}:{}", config.host, config.port); - - axum::serve(listener, app).await?; - Ok(()) -} diff --git a/crates/nvisy-server/src/middleware/mod.rs b/crates/nvisy-server/src/middleware/mod.rs deleted file mode 100644 index 5713713..0000000 --- a/crates/nvisy-server/src/middleware/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -// Middleware is applied in app.rs via tower layers. -// Custom middleware can be added here as needed. 
diff --git a/crates/nvisy-server/src/service/audit_store.rs b/crates/nvisy-server/src/service/audit_store.rs deleted file mode 100644 index 9945612..0000000 --- a/crates/nvisy-server/src/service/audit_store.rs +++ /dev/null @@ -1,72 +0,0 @@ -use std::sync::RwLock; -use uuid::Uuid; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] -pub struct StoredAudit { - pub id: Uuid, - pub action: String, - pub timestamp: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub entity_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub redaction_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub policy_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub source_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub run_id: Option<Uuid>, - #[serde(skip_serializing_if = "Option::is_none")] - pub actor: Option<String>, - #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option<serde_json::Value>, -} - -pub struct AuditStore { - records: RwLock<Vec<StoredAudit>>, -} - -impl AuditStore { - pub fn new() -> Self { - Self { - records: RwLock::new(Vec::new()), - } - } - - pub fn add(&self, record: StoredAudit) { - self.records.write().unwrap().push(record); - } - - pub fn query( - &self, - run_id: Option<&str>, - action: Option<&str>, - source_id: Option<&str>, - limit: usize, - offset: usize, - ) -> Vec<StoredAudit> { - let records = self.records.read().unwrap(); - let mut results: Vec<&StoredAudit> = records.iter().collect(); - - if let Some(rid) = run_id { - if let Ok(uid) = rid.parse::<Uuid>() { - results.retain(|r| r.run_id == Some(uid)); - } - } - if let Some(act) = action { - results.retain(|r| r.action == act); - } - if let Some(sid) = source_id { - if let Ok(uid) = sid.parse::<Uuid>() { - results.retain(|r| r.source_id == Some(uid)); - } - } - - results.into_iter().skip(offset).take(limit).cloned().collect() - } - - pub fn get_by_run_id(&self, run_id: Uuid) -> Vec<StoredAudit> { - let records = self.records.read().unwrap(); - records.iter().filter(|r| r.run_id == Some(run_id)).cloned().collect() - } -} diff --git a/crates/nvisy-server/src/service/config.rs b/crates/nvisy-server/src/service/config.rs deleted file mode 100644 index 2169fe4..0000000 --- a/crates/nvisy-server/src/service/config.rs +++ /dev/null @@ -1,19 +0,0 @@ -/// Server configuration loaded from environment variables. -pub struct ServerConfig { - pub host: String, - pub port: u16, - pub cors_origin: String, -} - -impl ServerConfig { - pub fn from_env() -> Self { - Self { - host: std::env::var("NVISY_HOST").unwrap_or_else(|_| "0.0.0.0".to_string()), - port: std::env::var("NVISY_PORT") - .ok() - .and_then(|p| p.parse().ok()) - .unwrap_or(8080), - cors_origin: std::env::var("NVISY_CORS_ORIGIN").unwrap_or_else(|_| "*".to_string()), - } - } -} diff --git a/crates/nvisy-server/src/service/mod.rs b/crates/nvisy-server/src/service/mod.rs deleted file mode 100644 index c026656..0000000 --- a/crates/nvisy-server/src/service/mod.rs +++ /dev/null @@ -1,35 +0,0 @@ -//! Shared application services, configuration, and state. -//! -//! This module re-exports the primary service types and implements Axum's -//! [`FromRef`](axum::extract::FromRef) for each sub-state field so that -//! handlers can extract individual services directly. 
- -pub mod audit_store; -pub mod config; -pub mod pipeline; -pub mod policy_store; -pub mod state; - -use std::sync::Arc; - -// Re-exports for convenience -pub use audit_store::AuditStore; -pub use config::ServerConfig; -pub use policy_store::PolicyStore; -pub use state::AppState; - -macro_rules! impl_di { - ($($f:ident: $t:ty),+) => {$( - impl axum::extract::FromRef<AppState> for $t { - fn from_ref(state: &AppState) -> Self { - state.$f.clone() - } - } - )+}; -} - -impl_di! { - run_manager: Arc<nvisy_engine::runs::RunManager>, - policy_store: Arc<PolicyStore>, - audit_store: Arc<AuditStore> -} diff --git a/crates/nvisy-server/src/service/pipeline.rs b/crates/nvisy-server/src/service/pipeline.rs deleted file mode 100644 index 7f8c811..0000000 --- a/crates/nvisy-server/src/service/pipeline.rs +++ /dev/null @@ -1,332 +0,0 @@ -//! Pipeline builder and executor for the `/redact` endpoint. -//! -//! Auto-detects file type and constructs the correct action sequence, -//! then executes actions sequentially via mpsc channels. - -use bytes::Bytes; -use std::collections::HashMap; -use tokio::sync::mpsc; - -use nvisy_core::datatypes::blob::Blob; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::ontology::redaction::Redaction; -use nvisy_ontology::redaction::RedactionContext; -use nvisy_core::registry::action::Action; -use nvisy_ingest::loaders::{Loader, LoaderOutput}; - -use nvisy_detect::actions::detect_dictionary::{DetectDictionaryAction, DetectDictionaryParams, DictionaryDef}; -use nvisy_detect::actions::detect_manual::{DetectManualAction, DetectManualParams}; -use nvisy_detect::actions::detect_regex::{DetectRegexAction, DetectRegexParams}; -use nvisy_detect::actions::detect_tabular::{DetectTabularAction, DetectTabularParams}; -use nvisy_detect::actions::detect_checksum::DetectChecksumParams; -use nvisy_detect::actions::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; -use nvisy_detect::actions::emit_audit::EmitAuditParams; - -use nvisy_media::actions::apply_image_redaction::{ApplyImageRedactionAction, ApplyImageRedactionParams}; -use nvisy_media::actions::apply_tabular_redaction::{ApplyTabularRedactionAction, ApplyTabularRedactionParams}; -use nvisy_media::actions::apply_pdf_redaction::{ApplyPdfRedactionAction, ApplyPdfRedactionParams}; - -/// Result of a pipeline execution. -#[derive(Debug, serde::Serialize)] -pub struct PipelineResult { - /// Redacted file content. - #[serde(skip)] - pub content: Bytes, - /// Output file name. - pub file_name: String, - /// Content type of the output. - pub content_type: String, - /// Execution summary. - pub summary: PipelineSummary, - /// Audit trail entries. - pub audit_trail: Vec<serde_json::Value>, -} - -/// Summary statistics for a pipeline run. -#[derive(Debug, serde::Serialize, schemars::JsonSchema, utoipa::ToSchema)] -pub struct PipelineSummary { - pub total_entities: usize, - pub total_redactions: usize, - pub entities_by_category: HashMap<String, usize>, - pub processing_time_ms: u64, -} - -/// Execute the full redaction pipeline for a file. 
-pub async fn execute_pipeline( - file_bytes: Bytes, - file_name: &str, - content_type: &str, - context: &RedactionContext, - dictionaries: &[DictionaryDef], -) -> Result<PipelineResult, Error> { - let start = std::time::Instant::now(); - - // Create blob - let mut blob = Blob::new(file_name, file_bytes); - blob = blob.with_content_type(content_type); - - // Step 1: Load file - blob = run_loader(&blob, content_type, file_name).await?; - - // Step 2: Inject manual entities if present - if !context.manual_entities.is_empty() { - for ann in &context.manual_entities { - blob.add_artifact("manual_entities", ann).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add manual entity: {e}")) - })?; - } - } - - // Step 3: Run detection actions - blob = run_action(&DetectRegexAction, blob, DetectRegexParams { - confidence_threshold: context.min_confidence, - patterns: None, - }).await?; - - // Dictionary detection - if !dictionaries.is_empty() { - blob = run_action(&DetectDictionaryAction, blob, DetectDictionaryParams { - dictionaries: dictionaries.to_vec(), - confidence: 0.85, - }).await?; - } - - // Tabular detection if we have tabular data - let has_tabular = blob.has_artifacts("tabular"); - if has_tabular { - blob = run_action(&DetectTabularAction, blob, DetectTabularParams { - column_rules: vec![], - }).await?; - } - - // Manual entity detection - if !context.manual_entities.is_empty() { - blob = run_action(&DetectManualAction, blob, DetectManualParams {}).await?; - } - - // Checksum validation - blob = run_action(&nvisy_detect::actions::detect_checksum::DetectChecksumAction, blob, DetectChecksumParams { - drop_invalid: true, - confidence_boost: 0.05, - }).await?; - - // Classification - blob = run_action(&nvisy_detect::actions::classify::ClassifyAction, blob, ()).await?; - - // Step 4: Policy evaluation - blob = run_action(&EvaluatePolicyAction, blob, EvaluatePolicyParams { - rules: context.rules.iter().map(|r| { - nvisy_ontology::redaction::PolicyRule { - id: r.entity_type.clone(), - name: r.entity_type.clone(), - categories: vec![], - entity_types: vec![r.entity_type.clone()], - confidence_threshold: context.min_confidence, - method: r.method, - replacement_template: r.replacement.clone().unwrap_or_default(), - enabled: true, - priority: 0, - } - }).collect(), - default_method: context.default_method, - default_confidence_threshold: context.min_confidence, - }).await?; - - // Step 5: Apply redactions - blob = run_action(&nvisy_detect::actions::apply_redaction::ApplyRedactionAction, blob, ()).await?; - - // Apply image redaction if we have images - let has_images = blob.has_artifacts("images"); - if has_images { - blob = run_action(&ApplyImageRedactionAction, blob, ApplyImageRedactionParams { - blur_sigma: 15.0, - block_color: [0, 0, 0, 255], - }).await?; - } - - // Apply tabular redaction - if has_tabular { - blob = run_action(&ApplyTabularRedactionAction, blob, ApplyTabularRedactionParams {}).await?; - } - - // Apply PDF reassembly if this is a PDF - if content_type == "application/pdf" { - blob = run_action(&ApplyPdfRedactionAction, blob, ApplyPdfRedactionParams {}).await?; - } - - // Step 6: Audit - blob = run_action(&nvisy_detect::actions::emit_audit::EmitAuditAction, blob, EmitAuditParams { - run_id: None, - actor: None, - }).await?; - - // Collect results - let entities: Vec<Entity> = blob.get_artifacts("entities").unwrap_or_default(); - let redactions: Vec<Redaction> = blob.get_artifacts("redactions").unwrap_or_default(); - let audit_trail: Vec<serde_json::Value> = 
blob.get_artifacts("audit").unwrap_or_default(); - - let mut entities_by_category: HashMap<String, usize> = HashMap::new(); - for entity in &entities { - *entities_by_category - .entry(format!("{:?}", entity.category).to_lowercase()) - .or_insert(0) += 1; - } - - let elapsed = start.elapsed(); - - let output_file_name = format!("redacted_{}", file_name); - - Ok(PipelineResult { - content: blob.content, - file_name: output_file_name, - content_type: content_type.to_string(), - summary: PipelineSummary { - total_entities: entities.len(), - total_redactions: redactions.len(), - entities_by_category, - processing_time_ms: elapsed.as_millis() as u64, - }, - audit_trail, - }) -} - -/// Run a file loader based on content type and extension. -async fn run_loader(blob: &Blob, content_type: &str, file_name: &str) -> Result<Blob, Error> { - let mut result_blob = blob.clone(); - let ext = file_name - .rsplit('.') - .next() - .unwrap_or("") - .to_lowercase(); - - let outputs: Vec<LoaderOutput> = match (content_type, ext.as_str()) { - ("application/pdf", _) | (_, "pdf") => { - let loader = nvisy_ingest::loaders::pdf_loader::PdfLoader; - let params = nvisy_ingest::loaders::pdf_loader::PdfLoaderParams { - extract_images: true, - max_pages: None, - }; - loader.load(blob, ¶ms).await? - } - (ct, _) if ct.contains("wordprocessingml") => { - let loader = nvisy_ingest::loaders::docx_loader::DocxLoader; - let params = nvisy_ingest::loaders::docx_loader::DocxLoaderParams { - extract_images: true, - }; - loader.load(blob, ¶ms).await? - } - (_, "docx") => { - let loader = nvisy_ingest::loaders::docx_loader::DocxLoader; - let params = nvisy_ingest::loaders::docx_loader::DocxLoaderParams { - extract_images: true, - }; - loader.load(blob, ¶ms).await? - } - ("text/html", _) | (_, "html") | (_, "htm") => { - let loader = nvisy_ingest::loaders::html_loader::HtmlLoader; - let params = nvisy_ingest::loaders::html_loader::HtmlLoaderParams {}; - loader.load(blob, ¶ms).await? - } - (ct, _) if ct.starts_with("image/") => { - let loader = nvisy_ingest::loaders::image_loader::ImageLoader; - let params = nvisy_ingest::loaders::image_loader::ImageLoaderParams {}; - loader.load(blob, ¶ms).await? - } - (_, "jpg") | (_, "jpeg") | (_, "png") | (_, "tiff") | (_, "bmp") | (_, "webp") => { - let loader = nvisy_ingest::loaders::image_loader::ImageLoader; - let params = nvisy_ingest::loaders::image_loader::ImageLoaderParams {}; - loader.load(blob, ¶ms).await? - } - (_, "parquet") => { - let loader = nvisy_ingest::loaders::parquet_loader::ParquetLoader; - let params = nvisy_ingest::loaders::parquet_loader::ParquetLoaderParams { - max_rows: None, - }; - loader.load(blob, ¶ms).await? - } - (ct, _) if ct.contains("spreadsheetml") || ct.contains("ms-excel") => { - let loader = nvisy_ingest::loaders::xlsx_loader::XlsxLoader; - let params = nvisy_ingest::loaders::xlsx_loader::XlsxLoaderParams { - max_rows: None, - sheets: vec![], - }; - loader.load(blob, ¶ms).await? - } - (_, "xlsx") | (_, "xls") => { - let loader = nvisy_ingest::loaders::xlsx_loader::XlsxLoader; - let params = nvisy_ingest::loaders::xlsx_loader::XlsxLoaderParams { - max_rows: None, - sheets: vec![], - }; - loader.load(blob, ¶ms).await? - } - ("text/csv", _) | (_, "csv") => { - let loader = nvisy_ingest::loaders::csv_loader::CsvLoader; - loader.load(blob, &()).await? - } - ("application/json", _) | (_, "json") => { - let loader = nvisy_ingest::loaders::json_loader::JsonLoader; - loader.load(blob, &()).await? 
- } - (ct, _) if ct.starts_with("audio/") => { - let loader = nvisy_ingest::loaders::audio_loader::AudioLoader; - let params = nvisy_ingest::loaders::audio_loader::AudioLoaderParams {}; - loader.load(blob, ¶ms).await? - } - (_, "mp3") | (_, "wav") | (_, "flac") | (_, "ogg") | (_, "m4a") => { - let loader = nvisy_ingest::loaders::audio_loader::AudioLoader; - let params = nvisy_ingest::loaders::audio_loader::AudioLoaderParams {}; - loader.load(blob, ¶ms).await? - } - // Default: treat as plain text - _ => { - let loader = nvisy_ingest::loaders::plaintext::PlaintextLoader; - loader.load(blob, &()).await? - } - }; - - // Add loader outputs as artifacts - for output in outputs { - match output { - LoaderOutput::Document(doc) => { - result_blob.add_artifact("documents", &doc).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add document: {e}")) - })?; - } - LoaderOutput::Image(img) => { - result_blob.add_artifact("images", &img).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add image: {e}")) - })?; - } - LoaderOutput::Tabular(tab) => { - result_blob.add_artifact("tabular", &tab).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("failed to add tabular: {e}")) - })?; - } - } - } - - Ok(result_blob) -} - -/// Run a single action on a blob, returning the processed blob. -async fn run_action<A: Action>( - action: &A, - blob: Blob, - params: A::Params, -) -> Result<Blob, Error> { - let (tx_in, rx_in) = mpsc::channel(1); - let (tx_out, mut rx_out) = mpsc::channel(1); - - tx_in.send(blob).await.map_err(|_| { - Error::new(ErrorKind::Runtime, "failed to send blob to action") - })?; - drop(tx_in); - - action.execute(rx_in, tx_out, params).await?; - - rx_out.recv().await.ok_or_else(|| { - Error::new(ErrorKind::Runtime, "action produced no output") - }) -} diff --git a/crates/nvisy-server/src/service/policy_store.rs b/crates/nvisy-server/src/service/policy_store.rs deleted file mode 100644 index cf8468e..0000000 --- a/crates/nvisy-server/src/service/policy_store.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::collections::HashMap; -use std::sync::RwLock; -use uuid::Uuid; - -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, schemars::JsonSchema)] -pub struct StoredPolicy { - pub id: Uuid, - pub name: String, - pub rules: Vec<serde_json::Value>, - pub default_method: String, - pub default_confidence_threshold: f64, - pub created_at: String, - pub updated_at: String, -} - -pub struct PolicyStore { - policies: RwLock<HashMap<Uuid, StoredPolicy>>, -} - -impl PolicyStore { - pub fn new() -> Self { - Self { - policies: RwLock::new(HashMap::new()), - } - } - - pub fn create( - &self, - name: String, - rules: Vec<serde_json::Value>, - default_method: String, - default_confidence_threshold: f64, - ) -> StoredPolicy { - let id = Uuid::new_v4(); - let now = chrono::Utc::now().to_rfc3339(); - let policy = StoredPolicy { - id, - name, - rules, - default_method, - default_confidence_threshold, - created_at: now.clone(), - updated_at: now, - }; - self.policies.write().unwrap().insert(id, policy.clone()); - policy - } - - pub fn get(&self, id: Uuid) -> Option<StoredPolicy> { - self.policies.read().unwrap().get(&id).cloned() - } - - pub fn list(&self) -> Vec<StoredPolicy> { - self.policies.read().unwrap().values().cloned().collect() - } - - pub fn update( - &self, - id: Uuid, - name: Option<String>, - rules: Option<Vec<serde_json::Value>>, - default_method: Option<String>, - default_confidence_threshold: Option<f64>, - ) -> Option<StoredPolicy> { - let mut policies = 
self.policies.write().unwrap(); - let existing = policies.get_mut(&id)?; - if let Some(n) = name { existing.name = n; } - if let Some(r) = rules { existing.rules = r; } - if let Some(m) = default_method { existing.default_method = m; } - if let Some(t) = default_confidence_threshold { existing.default_confidence_threshold = t; } - existing.updated_at = chrono::Utc::now().to_rfc3339(); - Some(existing.clone()) - } - - pub fn delete(&self, id: Uuid) -> bool { - self.policies.write().unwrap().remove(&id).is_some() - } -} diff --git a/crates/nvisy-server/src/service/state.rs b/crates/nvisy-server/src/service/state.rs deleted file mode 100644 index 5446ad5..0000000 --- a/crates/nvisy-server/src/service/state.rs +++ /dev/null @@ -1,19 +0,0 @@ -//! Central application state shared across all HTTP handlers. - -use std::sync::Arc; -use nvisy_engine::runs::RunManager; -use super::audit_store::AuditStore; -use super::policy_store::PolicyStore; - -/// Shared application state passed to every Axum handler via [`axum::extract::State`]. -/// -/// Each field is wrapped in an [`Arc`] so cloning the state is cheap. -#[derive(Clone)] -pub struct AppState { - /// Manages in-flight and completed pipeline runs. - pub run_manager: Arc<RunManager>, - /// In-memory store of policy definitions. - pub policy_store: Arc<PolicyStore>, - /// In-memory store of audit log entries. - pub audit_store: Arc<AuditStore>, -} From 08759cabd2ec42aaafcfc19ff2df431a3ed1191a Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Fri, 13 Feb 2026 15:43:05 +0100 Subject: [PATCH 13/17] feat: add Engine trait, EngineInput/EngineOutput, and supporting ontology types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the top-level Engine trait in nvisy-engine defining the redaction pipeline contract (ContentHandler + policies + DAG graph → redacted output + audit trail + full breakdown). Add supporting ontology types: ClassificationResult, PolicyEvaluation, RedactionSummary. Extract ModelInfo/ModelKind into entity/model.rs. Migrate pipeline classify action to use ontology SensitivityLevel enum. 
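
A rough consumer sketch of the new contract (illustrative only; the
`run_redaction` helper, the "svc-redactor" actor string, and the exact
import paths are assumptions, not part of this patch):

    use nvisy_core::error::Error;
    use nvisy_core::fs::ContentHandler;
    use nvisy_engine::connections::Connections;
    use nvisy_engine::prelude::{Engine, EngineInput, Graph};
    use nvisy_ontology::policy::Policy;

    /// Drive one end-to-end redaction run through any `Engine` impl.
    async fn run_redaction(
        engine: &impl Engine,
        source: ContentHandler,
        policy: Policy,
        graph: Graph,
        connections: Connections,
    ) -> Result<(), Error> {
        let input = EngineInput {
            source,
            policies: vec![policy],
            graph,
            connections,
            actor: Some("svc-redactor".to_string()),
        };
        // One call yields the redacted output handler plus the full
        // per-phase breakdown and audit trail.
        let output = engine.run(input).await?;
        println!(
            "run {}: {} entities, {} audit records",
            output.run_id,
            output.detection.entities.len(),
            output.audits.len(),
        );
        Ok(())
    }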
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 6 + Cargo.toml | 5 +- crates/nvisy-engine/Cargo.toml | 1 + crates/nvisy-engine/src/engine.rs | 70 +++++++ crates/nvisy-engine/src/lib.rs | 1 + crates/nvisy-engine/src/prelude.rs | 3 +- crates/nvisy-ontology/Cargo.toml | 13 +- .../nvisy-ontology/src/audit/explanation.rs | 48 +++++ .../src/{ontology/audit.rs => audit/mod.rs} | 39 +++- crates/nvisy-ontology/src/audit/retention.rs | 32 +++ .../src/detection/annotation.rs | 64 ++++++ .../src/detection/classification.rs | 18 ++ crates/nvisy-ontology/src/detection/mod.rs | 77 +++++++ crates/nvisy-ontology/src/entity/location.rs | 180 +++++++++++++++++ .../src/{ontology/entity.rs => entity/mod.rs} | 111 +++++------ crates/nvisy-ontology/src/entity/model.rs | 30 +++ crates/nvisy-ontology/src/entity/selector.rs | 65 ++++++ crates/nvisy-ontology/src/lib.rs | 5 +- crates/nvisy-ontology/src/ontology/mod.rs | 15 -- .../nvisy-ontology/src/ontology/redaction.rs | 85 -------- .../nvisy-ontology/src/policy/evaluation.rs | 27 +++ crates/nvisy-ontology/src/policy/mod.rs | 113 +++++++++++ .../nvisy-ontology/src/policy/regulation.rs | 26 +++ crates/nvisy-ontology/src/policy/rule.rs | 85 ++++++++ crates/nvisy-ontology/src/prelude.rs | 22 +- .../nvisy-ontology/src/redaction/context.rs | 188 ------------------ crates/nvisy-ontology/src/redaction/method.rs | 109 ++++++++++ crates/nvisy-ontology/src/redaction/mod.rs | 114 ++++++++++- crates/nvisy-ontology/src/redaction/output.rs | 152 ++++++++++++++ crates/nvisy-ontology/src/redaction/policy.rs | 117 ----------- crates/nvisy-ontology/src/redaction/review.rs | 35 ++++ crates/nvisy-ontology/src/redaction/spec.rs | 168 ++++++++++++++++ .../nvisy-ontology/src/redaction/summary.rs | 18 ++ crates/nvisy-pattern/src/patterns/mod.rs | 4 +- .../src/actions/apply_image_redaction.rs | 18 +- .../src/actions/apply_redaction.rs | 25 ++- .../src/actions/apply_tabular_redaction.rs | 29 ++- crates/nvisy-pipeline/src/actions/classify.rs | 44 ++-- .../src/actions/detect_checksum.rs | 4 +- .../src/actions/detect_dictionary.rs | 34 ++-- .../src/actions/detect_manual.rs | 44 ++-- .../src/actions/detect_regex.rs | 14 +- .../src/actions/detect_tabular.rs | 22 +- .../nvisy-pipeline/src/actions/emit_audit.rs | 16 +- .../src/actions/evaluate_policy.rs | 170 +++++++++++----- crates/nvisy-pipeline/src/render/block.rs | 2 +- crates/nvisy-pipeline/src/render/blur.rs | 2 +- crates/nvisy-python/src/actions/mod.rs | 2 +- crates/nvisy-python/src/actions/ocr.rs | 2 +- crates/nvisy-python/src/ner/mod.rs | 17 +- crates/nvisy-python/src/ocr/mod.rs | 18 +- docs/DETECTION.md | 35 +++- docs/INGESTION.md | 49 +++-- 53 files changed, 1890 insertions(+), 703 deletions(-) create mode 100644 crates/nvisy-engine/src/engine.rs create mode 100644 crates/nvisy-ontology/src/audit/explanation.rs rename crates/nvisy-ontology/src/{ontology/audit.rs => audit/mod.rs} (76%) create mode 100644 crates/nvisy-ontology/src/audit/retention.rs create mode 100644 crates/nvisy-ontology/src/detection/annotation.rs create mode 100644 crates/nvisy-ontology/src/detection/classification.rs create mode 100644 crates/nvisy-ontology/src/detection/mod.rs create mode 100644 crates/nvisy-ontology/src/entity/location.rs rename crates/nvisy-ontology/src/{ontology/entity.rs => entity/mod.rs} (61%) create mode 100644 crates/nvisy-ontology/src/entity/model.rs create mode 100644 crates/nvisy-ontology/src/entity/selector.rs delete mode 100644 crates/nvisy-ontology/src/ontology/mod.rs delete mode 100644 
crates/nvisy-ontology/src/ontology/redaction.rs create mode 100644 crates/nvisy-ontology/src/policy/evaluation.rs create mode 100644 crates/nvisy-ontology/src/policy/mod.rs create mode 100644 crates/nvisy-ontology/src/policy/regulation.rs create mode 100644 crates/nvisy-ontology/src/policy/rule.rs delete mode 100644 crates/nvisy-ontology/src/redaction/context.rs create mode 100644 crates/nvisy-ontology/src/redaction/method.rs create mode 100644 crates/nvisy-ontology/src/redaction/output.rs delete mode 100644 crates/nvisy-ontology/src/redaction/policy.rs create mode 100644 crates/nvisy-ontology/src/redaction/review.rs create mode 100644 crates/nvisy-ontology/src/redaction/spec.rs create mode 100644 crates/nvisy-ontology/src/redaction/summary.rs diff --git a/Cargo.lock b/Cargo.lock index c780d56..951d3b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2372,6 +2372,7 @@ dependencies = [ "anyhow", "jiff", "nvisy-core", + "nvisy-ontology", "petgraph", "rand 0.9.2", "schemars", @@ -2435,6 +2436,7 @@ dependencies = [ "jiff", "nvisy-core", "schemars", + "semver", "serde", "serde_json", "uuid", @@ -3368,6 +3370,10 @@ name = "semver" version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +dependencies = [ + "serde", + "serde_core", +] [[package]] name = "seq-macro" diff --git a/Cargo.toml b/Cargo.toml index f6f854f..07ca39e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,7 @@ serde_json = { version = "1.0", features = [] } # Error handling thiserror = { version = "2.0", features = [] } anyhow = { version = "1.0", features = [] } -derive_more = { version = "1", features = ["display"] } +derive_more = { version = "1", features = ["display", "from"] } # Primitive datatypes uuid = { version = "1", features = ["serde", "v4", "v7"] } @@ -106,6 +106,9 @@ hipstr = "0.6" sha2 = "0.10" hex = "0.4" +# Semantic versioning +semver = { version = "1", features = ["serde"] } + # Enum derives strum = { version = "0.26", features = ["derive"] } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index 2206600..e30a2d4 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -20,6 +20,7 @@ documentation = { workspace = true } [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } +nvisy-ontology = { workspace = true, features = [] } # JSON Schema generation schemars = { workspace = true } diff --git a/crates/nvisy-engine/src/engine.rs b/crates/nvisy-engine/src/engine.rs new file mode 100644 index 0000000..b7b1a54 --- /dev/null +++ b/crates/nvisy-engine/src/engine.rs @@ -0,0 +1,70 @@ +//! Top-level engine contract and I/O types. +//! +//! The [`Engine`] trait defines the high-level redaction pipeline contract: +//! given a content handler, policies, and an execution graph, produce redacted +//! output together with a full audit trail and per-phase breakdown. + +use std::future::Future; + +use uuid::Uuid; + +use nvisy_core::error::Error; +use nvisy_core::fs::ContentHandler; +use nvisy_ontology::audit::Audit; +use nvisy_ontology::detection::{ClassificationResult, DetectionResult}; +use nvisy_ontology::policy::{Policy, PolicyEvaluation}; +use nvisy_ontology::redaction::RedactionSummary; + +use crate::compiler::graph::Graph; +use crate::connections::Connections; +use crate::executor::runner::RunResult; + +/// Everything the caller must provide to run a redaction pipeline. 
+pub struct EngineInput { + /// Handle to the managed directory containing the files to process. + pub source: ContentHandler, + /// Policies to apply (at least one). + pub policies: Vec<Policy>, + /// Execution graph defining the pipeline DAG. + pub graph: Graph, + /// External service connections for source/target nodes. + pub connections: Connections, + /// Human or service account identity. + pub actor: Option<String>, +} + +/// Full result of a pipeline run. +/// +/// Contains a content handler for the redacted output, per-phase breakdown +/// (detection, classification, policy evaluation), per-source summaries, +/// audit records, and the raw DAG execution result. +pub struct EngineOutput { + /// Unique run identifier. + pub run_id: Uuid, + /// Handle to the managed directory containing redacted output files. + pub output: ContentHandler, + /// Full detection result (entities, sensitivity, risk). + pub detection: DetectionResult, + /// Sensitivity classification. + pub classification: ClassificationResult, + /// Policy evaluation breakdown (redactions, reviews, suppressions, blocks, alerts). + pub evaluation: PolicyEvaluation, + /// Per-source redaction summaries. + pub summaries: Vec<RedactionSummary>, + /// Immutable audit trail. + pub audits: Vec<Audit>, + /// Per-node execution results from the DAG runner. + pub run_result: RunResult, +} + +/// The top-level redaction engine contract. +/// +/// Takes a content handler, policies, and an execution graph; returns redacted +/// output, audit records, and a full breakdown of every pipeline phase. +pub trait Engine: Send + Sync { + /// Execute a full redaction pipeline. + fn run( + &self, + input: EngineInput, + ) -> impl Future<Output = Result<EngineOutput, Error>> + Send; +} diff --git a/crates/nvisy-engine/src/lib.rs b/crates/nvisy-engine/src/lib.rs index c096c86..b3e6edc 100644 --- a/crates/nvisy-engine/src/lib.rs +++ b/crates/nvisy-engine/src/lib.rs @@ -10,6 +10,7 @@ pub mod compiler; pub mod connections; +pub mod engine; pub mod executor; pub mod policies; pub mod runs; diff --git a/crates/nvisy-engine/src/prelude.rs b/crates/nvisy-engine/src/prelude.rs index f0da8ad..eddd8f5 100644 --- a/crates/nvisy-engine/src/prelude.rs +++ b/crates/nvisy-engine/src/prelude.rs @@ -1,5 +1,6 @@ //! Convenience re-exports. 
+pub use crate::compiler::graph::{Graph, GraphEdge, GraphNode}; pub use crate::compiler::plan::{build_plan, ExecutionPlan, ResolvedNode}; +pub use crate::engine::{Engine, EngineInput, EngineOutput}; pub use crate::executor::runner::{run_graph, RunResult}; pub use crate::runs::{RunManager, RunState, RunStatus, RunSummary}; -pub use crate::compiler::graph::{Graph, GraphEdge, GraphNode}; diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml index 0dd29c1..ffae204 100644 --- a/crates/nvisy-ontology/Cargo.toml +++ b/crates/nvisy-ontology/Cargo.toml @@ -17,12 +17,16 @@ repository = { workspace = true } homepage = { workspace = true } documentation = { workspace = true } +[features] +default = [] +jsonschema = ["dep:schemars"] + [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -# JSON Schema generation -schemars = { workspace = true } +# JSON Schema generation (optional) +schemars = { workspace = true, optional = true } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -32,5 +36,8 @@ serde_json = { workspace = true, features = [] } uuid = { workspace = true, features = ["serde", "v4"] } jiff = { workspace = true } +# Semantic versioning +semver = { workspace = true } + # Error handling -derive_more = { workspace = true, features = ["display"] } +derive_more = { workspace = true, features = ["display", "from"] } diff --git a/crates/nvisy-ontology/src/audit/explanation.rs b/crates/nvisy-ontology/src/audit/explanation.rs new file mode 100644 index 0000000..c4b70fd --- /dev/null +++ b/crates/nvisy-ontology/src/audit/explanation.rs @@ -0,0 +1,48 @@ +//! Explainability metadata for data protection decisions. +//! +//! An [`Explanation`] records why an action was taken — which model, rule, +//! and confidence level were involved. Types that carry this metadata +//! implement the [`Explainable`] trait. + +use semver::Version; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use crate::entity::{DetectionMethod, ModelInfo}; + +/// Types that carry explainability metadata. +pub trait Explainable { + /// Why this action was taken. + fn explanation(&self) -> Option<&Explanation>; +} + +/// Structured explainability metadata for a data protection decision. +/// +/// Records why an action was taken, which model and rule were involved, +/// and who reviewed it. Complements the freeform `details` field on [`Audit`](super::Audit). +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Explanation { + /// Detection model that produced the decision. + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option<ModelInfo>, + /// Identifier of the policy rule that triggered the action. + #[serde(skip_serializing_if = "Option::is_none")] + pub rule_id: Option<Uuid>, + /// Detection confidence score. + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, + /// Detection method used. + #[serde(skip_serializing_if = "Option::is_none")] + pub detection_method: Option<DetectionMethod>, + /// Human-readable reason for the action. + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option<String>, + /// Version of the policy that was evaluated. + #[serde(skip_serializing_if = "Option::is_none")] + #[cfg_attr(feature = "jsonschema", schemars(with = "Option<String>"))] + pub policy_version: Option<Version>, + /// Identifier of the reviewer who approved/rejected. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub reviewer_id: Option<String>, +} diff --git a/crates/nvisy-ontology/src/ontology/audit.rs b/crates/nvisy-ontology/src/audit/mod.rs similarity index 76% rename from crates/nvisy-ontology/src/ontology/audit.rs rename to crates/nvisy-ontology/src/audit/mod.rs index 9e2e247..9fe2a83 100644 --- a/crates/nvisy-ontology/src/ontology/audit.rs +++ b/crates/nvisy-ontology/src/audit/mod.rs @@ -1,25 +1,38 @@ //! Audit trail records for data protection events. +//! +//! An [`Audit`] entry records an immutable event in the data protection +//! pipeline, carrying structured [`Explanation`] metadata for compliance. + +pub mod explanation; +pub mod retention; + +pub use explanation::{Explainable, Explanation}; +pub use retention::{RetentionPolicy, RetentionScope}; use jiff::Timestamp; use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; use uuid::Uuid; + use nvisy_core::path::ContentSource; +/// Types that emit audit records. +pub trait Auditable { + /// Produce an audit record for this event. + fn to_audit(&self) -> Audit; +} + /// Kind of auditable action recorded in an [`Audit`] entry. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum AuditAction { /// A sensitive entity was detected. Detection, /// A redaction was applied to an entity. Redaction, - /// A policy was evaluated against detected entities. - PolicyEval, - /// A blob or document was accessed. - Access, - /// Processed content was exported to an external system. - Export, + /// A human review was performed on a redaction. + Review, } /// An immutable audit record tracking a data protection event. @@ -27,7 +40,7 @@ pub enum AuditAction { /// Audit entries are emitted by pipeline actions and form a tamper-evident /// log of all detection, redaction, and policy decisions. #[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Audit { /// Content source identity and lineage. #[serde(flatten)] @@ -35,7 +48,7 @@ pub struct Audit { /// The kind of event this audit entry records. pub action: AuditAction, /// UTC timestamp when the event occurred. - #[schemars(with = "String")] + #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] pub timestamp: Timestamp, /// Identifier of the related entity, if applicable. #[serde(skip_serializing_if = "Option::is_none")] @@ -55,9 +68,12 @@ pub struct Audit { /// Human or service account that triggered the event. #[serde(skip_serializing_if = "Option::is_none")] pub actor: Option<String>, + /// Structured explainability metadata. + #[serde(skip_serializing_if = "Option::is_none")] + pub explanation: Option<Explanation>, /// Additional unstructured details about the event. #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option<serde_json::Map<String, serde_json::Value>>, + pub details: Option<Map<String, Value>>, } impl Audit { @@ -73,6 +89,7 @@ impl Audit { source_id: None, run_id: None, actor: None, + explanation: None, details: None, } } @@ -102,7 +119,7 @@ impl Audit { } /// Attach additional unstructured details to this audit entry. 
- pub fn with_details(mut self, details: serde_json::Map<String, serde_json::Value>) -> Self { + pub fn with_details(mut self, details: Map<String, Value>) -> Self { self.details = Some(details); self } diff --git a/crates/nvisy-ontology/src/audit/retention.rs b/crates/nvisy-ontology/src/audit/retention.rs new file mode 100644 index 0000000..8987eb6 --- /dev/null +++ b/crates/nvisy-ontology/src/audit/retention.rs @@ -0,0 +1,32 @@ +//! Data retention policy types. + +use serde::{Deserialize, Serialize}; + +/// What class of data a retention policy applies to. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RetentionScope { + /// Original ingested content before redaction. + OriginalContent, + /// Redacted output artifacts. + RedactedOutput, + /// Audit log entries. + AuditLogs, +} + +/// A retention policy governing how long data is kept. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct RetentionPolicy { + /// What class of data this policy applies to. + pub scope: RetentionScope, + /// Maximum number of days to retain data. `None` means indefinite. + #[serde(skip_serializing_if = "Option::is_none")] + pub max_duration_days: Option<u64>, + /// If true, delete data immediately after processing (zero-retention mode). + pub zero_retention: bool, + /// Description of the retention policy. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option<String>, +} diff --git a/crates/nvisy-ontology/src/detection/annotation.rs b/crates/nvisy-ontology/src/detection/annotation.rs new file mode 100644 index 0000000..a241cc6 --- /dev/null +++ b/crates/nvisy-ontology/src/detection/annotation.rs @@ -0,0 +1,64 @@ +//! Annotation types for pre-identified regions and classification labels. +//! +//! Annotations allow users and upstream systems to mark regions of content +//! before detection runs. They replace the previous `ManualAnnotation` type +//! with a unified model supporting three kinds: inclusions (pre-identified +//! sensitive regions), exclusions (known-safe regions to skip), and +//! classification labels. + +use serde::{Deserialize, Serialize}; + +use crate::entity::{EntityCategory, EntityLocation}; + +/// The kind of annotation applied to a content region. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum AnnotationKind { + /// Pre-identified sensitive region that should be treated as a detection. + Inclusion, + /// Known-safe region that detection should skip. + Exclusion, + /// Classification label attached to a document or region. + Label, +} + +/// A classification label attached to a document or region. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct AnnotationLabel { + /// Label name (e.g. `"contains-phi"`, `"gdpr-request"`). + pub name: String, + /// Scope of the label: `"document"` or a region identifier. + #[serde(skip_serializing_if = "Option::is_none")] + pub scope: Option<String>, + /// Confidence of the label assignment. + #[serde(skip_serializing_if = "Option::is_none")] + pub confidence: Option<f64>, +} + +/// A user-provided or upstream annotation on a content region. 
+/// +/// Replaces the previous `ManualAnnotation` with a unified type that +/// supports inclusions, exclusions, and classification labels. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Annotation { + /// What kind of annotation this is. + pub kind: AnnotationKind, + /// Entity category, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub category: Option<EntityCategory>, + /// Entity type label, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub entity_type: Option<String>, + /// The annotated text or value. + #[serde(skip_serializing_if = "Option::is_none")] + pub value: Option<String>, + /// Location of the annotated region. + #[serde(skip_serializing_if = "Option::is_none")] + pub location: Option<EntityLocation>, + /// Classification labels attached to this annotation. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub labels: Vec<AnnotationLabel>, +} diff --git a/crates/nvisy-ontology/src/detection/classification.rs b/crates/nvisy-ontology/src/detection/classification.rs new file mode 100644 index 0000000..779a121 --- /dev/null +++ b/crates/nvisy-ontology/src/detection/classification.rs @@ -0,0 +1,18 @@ +//! Sensitivity classification result. + +use serde::{Deserialize, Serialize}; + +use super::SensitivityLevel; + +/// Result of sensitivity classification over a set of detected entities. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct ClassificationResult { + /// The computed sensitivity level. + pub sensitivity_level: SensitivityLevel, + /// Total number of entities considered. + pub total_entities: usize, + /// Re-identification risk score in the range `[0.0, 1.0]`, if computed. + #[serde(skip_serializing_if = "Option::is_none")] + pub risk_score: Option<f64>, +} diff --git a/crates/nvisy-ontology/src/detection/mod.rs b/crates/nvisy-ontology/src/detection/mod.rs new file mode 100644 index 0000000..01cc06a --- /dev/null +++ b/crates/nvisy-ontology/src/detection/mod.rs @@ -0,0 +1,77 @@ +//! Detection result types. +//! +//! A [`DetectionResult`] groups the output of a detection pass as a +//! first-class type, carrying the detected entities alongside pipeline +//! and policy metadata. + +pub mod annotation; +pub mod classification; + +pub use annotation::{Annotation, AnnotationKind, AnnotationLabel}; +pub use classification::ClassificationResult; + +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use nvisy_core::path::ContentSource; + +use crate::entity::Entity; + +/// Sensitivity classification assigned to a document or content region. +/// +/// Drives downstream policy: rules can be scoped to specific sensitivity +/// levels via [`RuleCondition`](crate::policy::RuleCondition). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum SensitivityLevel { + /// No sensitive data detected or all data is publicly available. + Public, + /// Internal use only — not intended for external distribution. + Internal, + /// Contains sensitive data requiring access controls. + Confidential, + /// Highly sensitive — regulated data requiring strict controls. + Restricted, +} + +/// Types that can be submitted for sensitive data detection. +pub trait Detectable: Send + Sync { + /// Content as text for text-based detection. 
+ fn text_content(&self) -> Option<&str>; + /// Binary content for image/audio/video detection. + fn binary_content(&self) -> Option<&[u8]>; + /// MIME type of the content. + fn mime_type(&self) -> Option<&str>; + /// Source identity for lineage. + fn source(&self) -> &ContentSource; +} + +/// The output of a detection pass over a single content source. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct DetectionResult { + /// Content source identity and lineage. + #[serde(flatten)] + pub source: ContentSource, + /// Entities detected in the content. + pub entities: Vec<Entity>, + /// Identifier of the policy that governed detection. + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_id: Option<Uuid>, + /// Identifier of the pipeline run that produced this result. + #[serde(skip_serializing_if = "Option::is_none")] + pub run_id: Option<Uuid>, + /// Processing time in milliseconds. + #[serde(skip_serializing_if = "Option::is_none")] + pub duration_ms: Option<u64>, + /// Overall sensitivity classification derived from the detected entities. + #[serde(skip_serializing_if = "Option::is_none")] + pub sensitivity_level: Option<SensitivityLevel>, + /// Re-identification risk score in the range `[0.0, 1.0]`. + /// + /// Estimates the likelihood that a data subject could be re-identified + /// from the entities remaining after redaction. Computed post-transform. + #[serde(skip_serializing_if = "Option::is_none")] + pub risk_score: Option<f64>, +} diff --git a/crates/nvisy-ontology/src/entity/location.rs b/crates/nvisy-ontology/src/entity/location.rs new file mode 100644 index 0000000..592b92f --- /dev/null +++ b/crates/nvisy-ontology/src/entity/location.rs @@ -0,0 +1,180 @@ +//! Spatial and temporal location types for entity positions. + +use derive_more::From; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +/// A time interval within an audio or video stream. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct TimeSpan { + /// Start time in seconds from the beginning of the stream. + pub start_secs: f64, + /// End time in seconds from the beginning of the stream. + pub end_secs: f64, +} + +/// Axis-aligned bounding box for image-based entity locations. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct BoundingBox { + /// Horizontal offset of the top-left corner (pixels or normalized). + pub x: f64, + /// Vertical offset of the top-left corner (pixels or normalized). + pub y: f64, + /// Width of the bounding box. + pub width: f64, + /// Height of the bounding box. + pub height: f64, +} + +/// Location of an entity within text content. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct TextLocation { + /// Byte or character offset where the entity starts. + pub start_offset: usize, + /// Byte or character offset where the entity ends. + pub end_offset: usize, + /// Start offset of the surrounding context window for redaction. + /// + /// When set, the redaction may expand to cover surrounding text + /// (e.g. +/- N characters around an SSN) to prevent contextual + /// re-identification. + #[serde(skip_serializing_if = "Option::is_none")] + pub context_start_offset: Option<usize>, + /// End offset of the surrounding context window for redaction. 
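+    ///
+    /// # Example
+    ///
+    /// A sketch with hypothetical offsets (not taken from the crate's tests):
+    /// an SSN spanning bytes 120..131 with a ±10-byte context window recorded
+    /// for redaction.
+    ///
+    /// ```ignore
+    /// let loc = TextLocation {
+    ///     start_offset: 120,
+    ///     end_offset: 131,
+    ///     context_start_offset: Some(110),
+    ///     context_end_offset: Some(141),
+    ///     element_id: None,
+    ///     page_number: None,
+    /// };
+    /// ```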
+ #[serde(skip_serializing_if = "Option::is_none")] + pub context_end_offset: Option<usize>, + /// Identifier of the document element containing this entity. + #[serde(skip_serializing_if = "Option::is_none")] + pub element_id: Option<String>, + /// 1-based page number where the entity was found. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, +} + +/// Location of an entity within an image. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct ImageLocation { + /// Bounding box of the entity in the image. + pub bounding_box: BoundingBox, + /// Links this entity to a specific image document. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_id: Option<Uuid>, + /// 1-based page number (for multi-page documents like PDFs). + #[serde(skip_serializing_if = "Option::is_none")] + pub page_number: Option<u32>, +} + +/// Location of an entity within tabular data. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct TabularLocation { + /// Row index (0-based). + pub row_index: usize, + /// Column index (0-based). + pub column_index: usize, + /// Byte offset within the cell where the entity starts, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub start_offset: Option<usize>, + /// Byte offset within the cell where the entity ends, if applicable. + #[serde(skip_serializing_if = "Option::is_none")] + pub end_offset: Option<usize>, +} + +/// Location of an entity within an audio stream. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct AudioLocation { + /// Time interval of the entity. + pub time_span: TimeSpan, + /// Speaker identifier from diarization. + #[serde(skip_serializing_if = "Option::is_none")] + pub speaker_id: Option<String>, +} + +/// Location of an entity within a video stream. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct VideoLocation { + /// Bounding box of the entity in the frame. + pub bounding_box: BoundingBox, + /// 0-based frame number where the entity was detected. + pub frame_number: u64, + /// Time interval of the entity in the video. + #[serde(skip_serializing_if = "Option::is_none")] + pub time_span: Option<TimeSpan>, + /// Tracking identifier for an entity across multiple frames. + #[serde(skip_serializing_if = "Option::is_none")] + pub track_id: Option<String>, + /// Speaker identifier from diarization (for audio track). + #[serde(skip_serializing_if = "Option::is_none")] + pub speaker_id: Option<String>, +} + +/// Location of an entity within its source content. +/// +/// Each variant is specific to a content modality, carrying only the +/// fields that make sense for that modality. +#[derive(Debug, Clone, From, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum EntityLocation { + /// Entity found in text content (plain text, HTML, PDF text layer, etc.). + Text(TextLocation), + /// Entity found in an image. + Image(ImageLocation), + /// Entity found in a tabular data cell. + Tabular(TabularLocation), + /// Entity found in an audio stream. + Audio(AudioLocation), + /// Entity found in a video stream. 
+ Video(VideoLocation), +} + +impl EntityLocation { + /// Text start offset, if this is a text or tabular location. + pub fn start_offset(&self) -> Option<usize> { + match self { + Self::Text(t) => Some(t.start_offset), + Self::Tabular(t) => t.start_offset, + _ => None, + } + } + + /// Text end offset, if this is a text or tabular location. + pub fn end_offset(&self) -> Option<usize> { + match self { + Self::Text(t) => Some(t.end_offset), + Self::Tabular(t) => t.end_offset, + _ => None, + } + } + + /// Bounding box, if this is an image or video location. + pub fn bounding_box(&self) -> Option<&BoundingBox> { + match self { + Self::Image(i) => Some(&i.bounding_box), + Self::Video(v) => Some(&v.bounding_box), + _ => None, + } + } + + /// Row index, if this is a tabular location. + pub fn row_index(&self) -> Option<usize> { + match self { + Self::Tabular(t) => Some(t.row_index), + _ => None, + } + } + + /// Column index, if this is a tabular location. + pub fn column_index(&self) -> Option<usize> { + match self { + Self::Tabular(t) => Some(t.column_index), + _ => None, + } + } +} diff --git a/crates/nvisy-ontology/src/ontology/entity.rs b/crates/nvisy-ontology/src/entity/mod.rs similarity index 61% rename from crates/nvisy-ontology/src/ontology/entity.rs rename to crates/nvisy-ontology/src/entity/mod.rs index 9186a8a..f98d45b 100644 --- a/crates/nvisy-ontology/src/ontology/entity.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -1,12 +1,28 @@ //! Sensitive-data entity types and detection metadata. +//! +//! An [`Entity`] represents a single occurrence of sensitive data detected +//! within a document. Entities are produced by detection actions and consumed +//! by redaction and audit stages of the pipeline. + +pub mod location; +pub mod model; +pub mod selector; + +pub use location::{ + AudioLocation, BoundingBox, EntityLocation, ImageLocation, TabularLocation, + TextLocation, TimeSpan, VideoLocation, +}; +pub use model::{ModelInfo, ModelKind}; +pub use selector::EntitySelector; use serde::{Deserialize, Serialize}; -use uuid::Uuid; +use serde_json::{Map, Value}; + use nvisy_core::path::ContentSource; /// Category of sensitive data an entity belongs to. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum EntityCategory { /// Personally Identifiable Information (names, SSNs, addresses, etc.). @@ -17,82 +33,49 @@ pub enum EntityCategory { Financial, /// Secrets and credentials (API keys, passwords, tokens). Credentials, + /// Legal documents and privileged communications. + Legal, + /// Biometric data (fingerprints, iris scans, voiceprints). + Biometric, /// User-defined or plugin-specific category. - Custom, + Custom(String), } /// Method used to detect a sensitive entity. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] #[serde(rename_all = "snake_case")] pub enum DetectionMethod { /// Regular expression pattern matching. Regex, - /// Named-entity recognition via AI model. - AiNer, - /// Lookup in a known-value dictionary. - Dictionary, /// Checksum or Luhn-algorithm validation. Checksum, - /// Multiple methods combined to produce a single detection. - Composite, + /// Lookup in a known-value dictionary. 
+ Dictionary, + /// Named-entity recognition via AI model. + Ner, + /// Contextual NLP analysis (discourse-level understanding). + ContextualNlp, /// OCR text extraction with bounding boxes. Ocr, + /// Face detection in images or video frames. + FaceDetection, + /// Object detection in images or video frames. + ObjectDetection, + /// Entity detection from speech transcription. + SpeechTranscript, + /// Multiple methods combined to produce a single detection. + Composite, /// User-provided annotations. Manual, } -/// Axis-aligned bounding box for image-based entity locations. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct BoundingBox { - /// Horizontal offset of the top-left corner (pixels or normalized). - pub x: f64, - /// Vertical offset of the top-left corner (pixels or normalized). - pub y: f64, - /// Width of the bounding box. - pub width: f64, - /// Height of the bounding box. - pub height: f64, -} - -/// Location of an entity within its source document or image. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct EntityLocation { - /// Byte or character offset where the entity starts in the text. - pub start_offset: usize, - /// Byte or character offset where the entity ends in the text. - pub end_offset: usize, - /// Identifier of the document element containing this entity. - #[serde(skip_serializing_if = "Option::is_none")] - pub element_id: Option<String>, - /// 1-based page number where the entity was found. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, - /// Bounding box for image-based detections. - #[serde(skip_serializing_if = "Option::is_none")] - pub bounding_box: Option<BoundingBox>, - /// Tabular row index (0-based). - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub row_index: Option<usize>, - /// Tabular column index (0-based). - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub column_index: Option<usize>, - /// Links this entity to a specific image document. - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub image_id: Option<Uuid>, -} - /// A detected sensitive data occurrence within a document. /// /// Entities are produced by detection actions (regex, NER, checksum, etc.) /// and later consumed by redaction and audit actions. #[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Entity { /// Content source identity and lineage. #[serde(flatten)] @@ -107,8 +90,17 @@ pub struct Entity { pub detection_method: DetectionMethod, /// Detection confidence score in the range `[0.0, 1.0]`. pub confidence: f64, - /// Where this entity was found in the source document. + /// Where this entity was found in the source content. pub location: EntityLocation, + /// BCP-47 language tag of the detected content. + #[serde(skip_serializing_if = "Option::is_none")] + pub language: Option<String>, + /// Detection model that produced this entity. + #[serde(skip_serializing_if = "Option::is_none")] + pub model: Option<ModelInfo>, + /// Additional unstructured metadata. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Map<String, Value>>, } impl Entity { @@ -129,6 +121,9 @@ impl Entity { detection_method, confidence, location, + language: None, + model: None, + metadata: None, } } diff --git a/crates/nvisy-ontology/src/entity/model.rs b/crates/nvisy-ontology/src/entity/model.rs new file mode 100644 index 0000000..a372704 --- /dev/null +++ b/crates/nvisy-ontology/src/entity/model.rs @@ -0,0 +1,30 @@ +//! Detection model identity and provenance. + +use serde::{Deserialize, Serialize}; + +/// Provenance or licensing classification of a detection model. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum ModelKind { + /// Open-source model (e.g. spaCy, Hugging Face community models). + OpenSource, + /// Proprietary model (e.g. vendor-specific NER). + Proprietary, + /// Model accessed through a third-party API gateway. + Gateway, + /// Self-hosted model served behind an internal endpoint. + SelfHosted, +} + +/// Identity and version of the model used for detection. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct ModelInfo { + /// Model name (e.g. `"spacy-en-core-web-lg"`, `"gpt-4"`). + pub name: String, + /// Provenance / licensing classification. + pub kind: ModelKind, + /// Model version string. + pub version: String, +} diff --git a/crates/nvisy-ontology/src/entity/selector.rs b/crates/nvisy-ontology/src/entity/selector.rs new file mode 100644 index 0000000..19bfb51 --- /dev/null +++ b/crates/nvisy-ontology/src/entity/selector.rs @@ -0,0 +1,65 @@ +//! Entity selection criteria for policy rules. +//! +//! An [`EntitySelector`] describes which entities a policy rule or redaction +//! applies to, based on category, type, and confidence constraints. + +use serde::{Deserialize, Serialize}; + +use super::EntityCategory; + +/// Criteria for selecting which entities a policy rule applies to. +/// +/// All fields use "empty means all" semantics: an empty `categories` list +/// matches every category, an empty `entity_types` list matches every type, +/// and so on. When multiple fields are set, they are combined with AND logic. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct EntitySelector { + /// Entity categories this selector matches. Empty means all categories. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub categories: Vec<EntityCategory>, + /// Specific entity type names this selector matches. Empty means all types. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub entity_types: Vec<String>, + /// Minimum detection confidence required. Entities below this threshold + /// are not matched. + #[serde(default = "default_confidence_threshold")] + pub confidence_threshold: f64, +} + +fn default_confidence_threshold() -> f64 { + 0.0 +} + +impl Default for EntitySelector { + fn default() -> Self { + Self { + categories: Vec::new(), + entity_types: Vec::new(), + confidence_threshold: default_confidence_threshold(), + } + } +} + +impl EntitySelector { + /// Create a selector that matches all entities. + pub fn all() -> Self { + Self::default() + } + + /// Returns `true` if the given entity properties match this selector. 
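+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of the matching semantics; the `"credit_card"` type
+    /// name is hypothetical and not defined by this crate.
+    ///
+    /// ```ignore
+    /// let selector = EntitySelector {
+    ///     categories: vec![EntityCategory::Financial],
+    ///     entity_types: vec!["credit_card".to_string()],
+    ///     confidence_threshold: 0.8,
+    /// };
+    /// // Category, type, and confidence are combined with AND logic.
+    /// assert!(selector.matches(&EntityCategory::Financial, "credit_card", 0.9));
+    /// // Below the confidence threshold: no match.
+    /// assert!(!selector.matches(&EntityCategory::Financial, "credit_card", 0.5));
+    /// // Empty lists mean "match all": the default selector accepts anything.
+    /// assert!(EntitySelector::all().matches(&EntityCategory::Financial, "name", 0.1));
+    /// ```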
+ pub fn matches(&self, category: &EntityCategory, entity_type: &str, confidence: f64) -> bool { + if confidence < self.confidence_threshold { + return false; + } + if !self.categories.is_empty() && !self.categories.contains(category) { + return false; + } + if !self.entity_types.is_empty() + && !self.entity_types.iter().any(|t| t == entity_type) + { + return false; + } + true + } +} diff --git a/crates/nvisy-ontology/src/lib.rs b/crates/nvisy-ontology/src/lib.rs index 0fafae8..48686f2 100644 --- a/crates/nvisy-ontology/src/lib.rs +++ b/crates/nvisy-ontology/src/lib.rs @@ -2,7 +2,10 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub mod ontology; +pub mod audit; +pub mod detection; +pub mod entity; +pub mod policy; pub mod redaction; #[doc(hidden)] diff --git a/crates/nvisy-ontology/src/ontology/mod.rs b/crates/nvisy-ontology/src/ontology/mod.rs deleted file mode 100644 index 060a638..0000000 --- a/crates/nvisy-ontology/src/ontology/mod.rs +++ /dev/null @@ -1,15 +0,0 @@ -//! Detection and redaction domain types. -//! -//! Types in this module represent the core ontology of the nvisy pipeline: -//! entities (detected sensitive data), redactions (how entities are masked), -//! and audit records (immutable event log). - -pub mod audit; -pub mod entity; -pub mod redaction; - -pub use audit::{Audit, AuditAction}; -pub use entity::{ - BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, -}; -pub use redaction::{Redaction, RedactionMethod}; diff --git a/crates/nvisy-ontology/src/ontology/redaction.rs b/crates/nvisy-ontology/src/ontology/redaction.rs deleted file mode 100644 index ea75cc1..0000000 --- a/crates/nvisy-ontology/src/ontology/redaction.rs +++ /dev/null @@ -1,85 +0,0 @@ -//! Redaction methods and records. - -use serde::{Deserialize, Serialize}; -use uuid::Uuid; -use nvisy_core::path::ContentSource; - -/// Strategy used to redact or obfuscate a detected entity. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -pub enum RedactionMethod { - /// Replace characters with a mask character (e.g. `***-**-1234`). - Mask, - /// Substitute with a fixed placeholder string. - Replace, - /// Replace with a one-way hash of the original value. - Hash, - /// Encrypt the value so it can be recovered later with a key. - Encrypt, - /// Remove the value entirely from the output. - Remove, - /// Blur a region in an image. - Blur, - /// Overlay an opaque block over a region in an image. - Block, - /// Replace with a synthetically generated realistic value. - Synthesize, -} - -/// A redaction decision recording how a specific entity was (or will be) redacted. -/// -/// Each `Redaction` is linked to exactly one [`Entity`](super::entity::Entity) -/// via `entity_id`. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct Redaction { - /// Content source identity and lineage. - #[serde(flatten)] - pub source: ContentSource, - /// Identifier of the entity being redacted. - pub entity_id: Uuid, - /// Redaction strategy applied to the entity. - pub method: RedactionMethod, - /// The string that replaces the original value in the output. - pub replacement_value: String, - /// The original sensitive value, retained for audit purposes. - #[serde(skip_serializing_if = "Option::is_none")] - pub original_value: Option<String>, - /// Identifier of the policy rule that triggered this redaction. 
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub policy_rule_id: Option<String>,
-    /// Whether the redaction has been applied to the output content.
-    pub applied: bool,
-}
-
-impl Redaction {
-    /// Create a new pending redaction for the given entity.
-    pub fn new(
-        entity_id: Uuid,
-        method: RedactionMethod,
-        replacement_value: impl Into<String>,
-    ) -> Self {
-        Self {
-            source: ContentSource::new(),
-            entity_id,
-            method,
-            replacement_value: replacement_value.into(),
-            original_value: None,
-            policy_rule_id: None,
-            applied: false,
-        }
-    }
-
-    /// Record the original sensitive value for audit trail purposes.
-    pub fn with_original_value(mut self, value: impl Into<String>) -> Self {
-        self.original_value = Some(value.into());
-        self
-    }
-
-    /// Associate this redaction with the policy rule that triggered it.
-    pub fn with_policy_rule_id(mut self, id: impl Into<String>) -> Self {
-        self.policy_rule_id = Some(id.into());
-        self
-    }
-}
diff --git a/crates/nvisy-ontology/src/policy/evaluation.rs b/crates/nvisy-ontology/src/policy/evaluation.rs
new file mode 100644
index 0000000..d0f540b
--- /dev/null
+++ b/crates/nvisy-ontology/src/policy/evaluation.rs
@@ -0,0 +1,27 @@
+//! Policy evaluation outcome.
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::redaction::Redaction;
+
+/// Full outcome of evaluating a [`Policy`](crate::policy::Policy) against a set of entities.
+///
+/// Captures every rule kind's effect: redactions to apply, entities pending
+/// human review, entities suppressed from output, blocked entities, and alerts.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))]
+pub struct PolicyEvaluation {
+    /// Identifier of the policy that was evaluated.
+    pub policy_id: Uuid,
+    /// Redactions produced by `Redaction` rules.
+    pub redactions: Vec<Redaction>,
+    /// Entity IDs routed to human review by `Review` rules.
+    pub pending_review: Vec<Uuid>,
+    /// Entity IDs suppressed from output by `Suppress` rules.
+    pub suppressed: Vec<Uuid>,
+    /// Entity IDs blocked from processing by `Block` rules.
+    pub blocked: Vec<Uuid>,
+    /// Entity IDs that triggered alert notifications via `Alert` rules.
+    pub alerted: Vec<Uuid>,
+}
diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-ontology/src/policy/mod.rs
new file mode 100644
index 0000000..834de59
--- /dev/null
+++ b/crates/nvisy-ontology/src/policy/mod.rs
@@ -0,0 +1,113 @@
+//! Redaction policies and rules.
+//!
+//! A [`Policy`] is a named, versioned set of [`PolicyRule`]s that govern
+//! how detected entities are redacted. Policies may be associated with a
+//! [`RegulationKind`] and support inheritance via the `extends` field.
+
+pub mod evaluation;
+pub mod regulation;
+pub mod rule;
+
+pub use evaluation::PolicyEvaluation;
+pub use regulation::RegulationKind;
+pub use rule::{PolicyRule, RuleCondition, RuleKind};
+
+use semver::Version;
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::entity::EntityCategory;
+use crate::redaction::{RedactionSpec, TextRedactionSpec};
+
+/// A named redaction policy containing an ordered set of rules.
+///
+/// Policies are pure configuration — they describe *what* to detect and
+/// *how* to handle it, independent of any specific content source.
+///
+/// Policies are evaluated by [`find_matching_rule`](Policy::find_matching_rule),
+/// which returns the first matching enabled rule sorted by priority.
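+///
+/// # Example
+///
+/// A minimal sketch of constructing a policy with no rules; the `"api_key"`
+/// type name is hypothetical. With an empty rule set, lookup falls back to
+/// the policy defaults.
+///
+/// ```ignore
+/// use semver::Version;
+///
+/// let policy = Policy::new("baseline", Version::new(1, 0, 0), Vec::new())
+///     .with_default_confidence_threshold(0.8);
+///
+/// // No rule matches, so callers fall back to `default_spec` and
+/// // `default_confidence_threshold`.
+/// assert!(policy
+///     .find_matching_rule(&EntityCategory::Credentials, "api_key", 0.95)
+///     .is_none());
+/// ```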
+#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Policy { + /// Unique identifier for this policy. + pub id: Uuid, + /// Human-readable policy name. + pub name: String, + /// Policy version. + #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] + pub version: Version, + /// Description of the policy's purpose. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option<String>, + /// Parent policy identifier for inheritance. + #[serde(skip_serializing_if = "Option::is_none")] + pub extends: Option<Uuid>, + /// Compliance regulation this policy targets. + #[serde(skip_serializing_if = "Option::is_none")] + pub regulation: Option<RegulationKind>, + /// Ordered list of rules. + pub rules: Vec<PolicyRule>, + /// Fallback redaction specification when no rule matches. + pub default_spec: RedactionSpec, + /// Fallback confidence threshold when no rule matches. + pub default_confidence_threshold: f64, +} + +impl Policy { + /// Create a new policy with the given name, version, and rules, using default + /// fallback spec ([`TextRedactionSpec::Mask`]) and threshold (0.5). + pub fn new( + name: impl Into<String>, + version: Version, + rules: Vec<PolicyRule>, + ) -> Self { + Self { + id: Uuid::new_v4(), + name: name.into(), + version, + description: None, + extends: None, + regulation: None, + rules, + default_spec: RedactionSpec::Text(TextRedactionSpec::Mask { mask_char: '*' }), + default_confidence_threshold: 0.5, + } + } + + /// Override the fallback redaction specification. + pub fn with_default_spec(mut self, spec: RedactionSpec) -> Self { + self.default_spec = spec; + self + } + + /// Override the fallback confidence threshold. + pub fn with_default_confidence_threshold(mut self, threshold: f64) -> Self { + self.default_confidence_threshold = threshold; + self + } + + /// Find the first matching enabled rule for a given entity. + /// + /// Rules are sorted by priority (ascending). A rule matches when it is + /// enabled and its [`EntitySelector`] matches the given entity properties. + pub fn find_matching_rule( + &self, + category: &EntityCategory, + entity_type: &str, + confidence: f64, + ) -> Option<&PolicyRule> { + let mut sorted: Vec<&PolicyRule> = self.rules.iter().collect(); + sorted.sort_by_key(|r| r.priority); + + for rule in sorted { + if !rule.enabled { + continue; + } + if rule.selector.matches(category, entity_type, confidence) { + return Some(rule); + } + } + + None + } +} diff --git a/crates/nvisy-ontology/src/policy/regulation.rs b/crates/nvisy-ontology/src/policy/regulation.rs new file mode 100644 index 0000000..8007b1f --- /dev/null +++ b/crates/nvisy-ontology/src/policy/regulation.rs @@ -0,0 +1,26 @@ +//! Regulatory framework identifiers. + +use serde::{Deserialize, Serialize}; + +/// A compliance regulation or framework that a policy targets. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RegulationKind { + /// Health Insurance Portability and Accountability Act. + Hipaa, + /// General Data Protection Regulation (EU). + Gdpr, + /// California Consumer Privacy Act. + Ccpa, + /// Payment Card Industry Data Security Standard. + PciDss, + /// Criminal Justice Information Services Security Policy. + Cjis, + /// Family Educational Rights and Privacy Act. + Ferpa, + /// Sarbanes-Oxley Act. + Sox, + /// User-defined regulation or framework. 
+ Custom(String), +} diff --git a/crates/nvisy-ontology/src/policy/rule.rs b/crates/nvisy-ontology/src/policy/rule.rs new file mode 100644 index 0000000..b68bfcf --- /dev/null +++ b/crates/nvisy-ontology/src/policy/rule.rs @@ -0,0 +1,85 @@ +//! Policy rule types. +//! +//! A [`PolicyRule`] defines when and how a specific redaction is applied, +//! based on entity categories, types, and confidence thresholds. + +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; +use uuid::Uuid; + +use crate::detection::SensitivityLevel; +use crate::entity::EntitySelector; +use crate::redaction::RedactionSpec; + +/// Conditions that must be met for a [`PolicyRule`] to apply. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct RuleCondition { + /// MIME types of documents this rule applies to. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub document_types: Vec<String>, + /// User roles this rule applies to. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub roles: Vec<String>, + /// Labels that must be present on the document. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub required_labels: Vec<String>, + /// Sensitivity levels this rule applies to. Empty means all levels. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub sensitivity_levels: Vec<SensitivityLevel>, +} + +/// Classifies what a policy rule does when it matches. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RuleKind { + /// Apply a redaction to the matched entity. + Redaction, + /// Require human review before any action is taken. + Review, + /// Flag the entity without redacting (for reporting / alerting). + Alert, + /// Block processing of the entire document. + Block, + /// Suppress a detection (treat as false positive). + Suppress, +} + +/// A single rule within a redaction [`Policy`](super::Policy). +/// +/// Rules specify which entity categories and types they match, the minimum +/// confidence threshold, and the action to take. Rules are evaluated in +/// ascending priority order. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct PolicyRule { + /// Unique identifier for this rule. + pub id: Uuid, + /// Human-readable name for display purposes. + pub name: String, + /// Description of the rule's purpose. + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option<String>, + /// What this rule does when it matches. + pub kind: RuleKind, + /// Which entities this rule applies to. + pub selector: EntitySelector, + /// Redaction specification to apply when this rule matches (relevant when `kind` is `Redaction`). + pub spec: RedactionSpec, + /// Template string for the replacement value (e.g. `"[REDACTED]"`). + pub replacement_template: String, + /// Whether this rule is active. Disabled rules are skipped during evaluation. + pub enabled: bool, + /// Evaluation priority (lower numbers are evaluated first). + pub priority: i32, + /// Additional conditions for this rule to apply. + #[serde(skip_serializing_if = "Option::is_none")] + pub conditions: Option<RuleCondition>, + /// Regulatory citation or notes explaining the rule. + #[serde(skip_serializing_if = "Option::is_none")] + pub context: Option<String>, + /// Additional unstructured metadata. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub metadata: Option<Map<String, Value>>, +} diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index 1b91419..5968662 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -1,9 +1,23 @@ //! Convenience re-exports for common nvisy-ontology types. -pub use crate::ontology::{ - Audit, AuditAction, BoundingBox, DetectionMethod, Entity, EntityCategory, - EntityLocation, Redaction, RedactionMethod, +pub use crate::audit::{ + Audit, AuditAction, Auditable, Explainable, Explanation, RetentionPolicy, RetentionScope, +}; +pub use crate::detection::{ + Annotation, AnnotationKind, AnnotationLabel, ClassificationResult, Detectable, + DetectionResult, SensitivityLevel, +}; +pub use crate::entity::{ + AudioLocation, BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, + EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, TextLocation, TimeSpan, + VideoLocation, +}; +pub use crate::policy::{ + Policy, PolicyEvaluation, PolicyRule, RegulationKind, RuleCondition, RuleKind, }; pub use crate::redaction::{ - EntityRedactionRule, ManualAnnotation, Policy, PolicyRule, RedactionContext, + AudioRedactionMethod, AudioRedactionOutput, AudioRedactionSpec, ImageRedactionMethod, + ImageRedactionOutput, ImageRedactionSpec, Redactable, Redaction, RedactionMethod, + RedactionOutput, RedactionSpec, RedactionSummary, ReviewDecision, ReviewStatus, + TextRedactionMethod, TextRedactionOutput, TextRedactionSpec, }; diff --git a/crates/nvisy-ontology/src/redaction/context.rs b/crates/nvisy-ontology/src/redaction/context.rs deleted file mode 100644 index 8a2b089..0000000 --- a/crates/nvisy-ontology/src/redaction/context.rs +++ /dev/null @@ -1,188 +0,0 @@ -//! Request-scoped redaction context for per-invocation control. - -use serde::{Deserialize, Serialize}; -use crate::ontology::entity::{BoundingBox, EntityCategory}; -use crate::ontology::redaction::RedactionMethod; - -/// Per-entity-type override for the redaction method. -/// -/// When included in a [`RedactionContext`], this rule overrides the -/// default redaction method for a specific entity type. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct EntityRedactionRule { - /// The entity type this override applies to (e.g. `"ssn"`, `"email"`). - pub entity_type: String, - /// Redaction strategy to use for this entity type. - pub method: RedactionMethod, - /// Optional custom replacement string for this entity type. - #[serde(skip_serializing_if = "Option::is_none")] - pub replacement: Option<String>, -} - -/// A user-provided annotation identifying a sensitive region. -/// -/// Manual annotations bypass automated detection — each is converted -/// directly into an [`Entity`](crate::ontology::entity::Entity) with -/// `DetectionMethod::Manual` and confidence 1.0. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct ManualAnnotation { - /// Broad classification of the annotated data. - pub category: EntityCategory, - /// Specific type label (e.g. `"ssn"`, `"name"`). - pub entity_type: String, - /// The matched or annotated text value. - #[serde(default)] - pub value: String, - /// 1-based page number, if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, - /// Bounding box for image-based annotations. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub bounding_box: Option<BoundingBox>, - /// Start byte offset in text. - #[serde(skip_serializing_if = "Option::is_none")] - pub start_offset: Option<usize>, - /// End byte offset in text. - #[serde(skip_serializing_if = "Option::is_none")] - pub end_offset: Option<usize>, - /// Tabular row index (0-based). - #[serde(skip_serializing_if = "Option::is_none")] - pub row_index: Option<usize>, - /// Tabular column index (0-based). - #[serde(skip_serializing_if = "Option::is_none")] - pub column_index: Option<usize>, -} - -/// Request-scoped description of what to redact. -/// -/// Acts as the per-request equivalent of a stored [`Policy`](super::policy::Policy), -/// specifying categories, entity types, confidence thresholds, and -/// redaction methods for a single redaction invocation. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct RedactionContext { - /// Entity categories to scan for. Empty = all. - #[serde(default)] - pub categories: Vec<EntityCategory>, - /// Specific entity type names (e.g. "ssn", "face", "address"). Empty = all within categories. - #[serde(default)] - pub entity_types: Vec<String>, - /// Per-entity-type overrides for redaction method. - #[serde(default)] - pub rules: Vec<EntityRedactionRule>, - /// Default method when no per-type rule matches. - #[serde(default = "default_method")] - pub default_method: RedactionMethod, - /// Minimum confidence (0.0-1.0). Below = ignored. - #[serde(default = "default_min_confidence")] - pub min_confidence: f64, - /// Enable image-based detection (faces, license plates). - #[serde(default)] - pub detect_images: bool, - /// Free-form labels (e.g. "gdpr-request"). - #[serde(default)] - pub labels: Vec<String>, - /// User-provided manual annotations to treat as detected entities. - #[serde(default)] - pub manual_entities: Vec<ManualAnnotation>, -} - -fn default_method() -> RedactionMethod { - RedactionMethod::Mask -} - -fn default_min_confidence() -> f64 { - 0.5 -} - -impl Default for RedactionContext { - fn default() -> Self { - Self { - categories: Vec::new(), - entity_types: Vec::new(), - rules: Vec::new(), - default_method: RedactionMethod::Mask, - min_confidence: 0.5, - detect_images: false, - labels: Vec::new(), - manual_entities: Vec::new(), - } - } -} - -impl RedactionContext { - /// Create a new context with default settings (mask method, 0.5 min confidence). - pub fn new() -> Self { - Self::default() - } - - /// Restrict processing to the given entity categories. - pub fn with_categories(mut self, categories: Vec<EntityCategory>) -> Self { - self.categories = categories; - self - } - - /// Restrict processing to the given entity type names. - pub fn with_entity_types(mut self, entity_types: Vec<String>) -> Self { - self.entity_types = entity_types; - self - } - - /// Add a per-entity-type redaction method override. - pub fn with_rule(mut self, rule: EntityRedactionRule) -> Self { - self.rules.push(rule); - self - } - - /// Set the fallback redaction method when no per-type rule matches. - pub fn with_default_method(mut self, method: RedactionMethod) -> Self { - self.default_method = method; - self - } - - /// Set the minimum confidence threshold. Entities below this are ignored. - pub fn with_min_confidence(mut self, confidence: f64) -> Self { - self.min_confidence = confidence; - self - } - - /// Enable or disable image-based detection (faces, license plates, etc.). 
- pub fn with_detect_images(mut self, detect: bool) -> Self { - self.detect_images = detect; - self - } - - /// Return the redaction method for a given entity type. - /// - /// Checks per-type rules first, falls back to `default_method`. - pub fn method_for(&self, entity_type: &str) -> RedactionMethod { - self.rules - .iter() - .find(|r| r.entity_type == entity_type) - .map(|r| r.method) - .unwrap_or(self.default_method) - } - - /// Whether a detected entity should be processed given the context filters. - pub fn should_process( - &self, - category: EntityCategory, - entity_type: &str, - confidence: f64, - ) -> bool { - if confidence < self.min_confidence { - return false; - } - if !self.categories.is_empty() && !self.categories.contains(&category) { - return false; - } - if !self.entity_types.is_empty() - && !self.entity_types.iter().any(|t| t == entity_type) - { - return false; - } - true - } -} diff --git a/crates/nvisy-ontology/src/redaction/method.rs b/crates/nvisy-ontology/src/redaction/method.rs new file mode 100644 index 0000000..13290a2 --- /dev/null +++ b/crates/nvisy-ontology/src/redaction/method.rs @@ -0,0 +1,109 @@ +//! Plain-tag redaction method enums. +//! +//! These are lightweight identifiers that name a redaction algorithm without +//! carrying any configuration data. For a data-carrying request see +//! [`RedactionSpec`](super::RedactionSpec); for a data-carrying result see +//! [`RedactionOutput`](super::RedactionOutput). + +use derive_more::From; +use serde::{Deserialize, Serialize}; + +/// Redaction strategies for text and tabular content. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum TextRedactionMethod { + /// Replace characters with a mask character (e.g. `***-**-1234`). + Mask, + /// Substitute with a fixed placeholder string. + Replace, + /// Replace with a one-way hash of the original value. + Hash, + /// Encrypt the value so it can be recovered later with a key. + Encrypt, + /// Remove the value entirely from the output. + Remove, + /// Replace with a synthetically generated realistic value. + Synthesize, + /// Replace with a consistent pseudonym across the document. + Pseudonymize, + /// Replace with a vault-backed reversible token (e.g. `USER_001`). + Tokenize, + /// Aggregate value into a range or bucket (e.g. age 34 → 30-39). + Aggregate, + /// Generalize to a less precise value (e.g. street → city → country). + Generalize, + /// Shift dates by a random but consistent offset, preserving intervals. + DateShift, +} + +/// Redaction strategies for image and video regions. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum ImageRedactionMethod { + /// Apply a gaussian blur to the region. + Blur, + /// Overlay an opaque block over the region. + Block, + /// Apply pixelation to the region (mosaic effect). + Pixelate, + /// Replace with a synthetically generated region. + Synthesize, +} + +/// Redaction strategies for audio segments. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum AudioRedactionMethod { + /// Replace the audio segment with silence. + Silence, + /// Remove the audio segment entirely. 
+ Remove, + /// Replace with synthetically generated audio. + Synthesize, +} + +/// Unified redaction strategy tag that wraps modality-specific methods. +/// +/// This is a lightweight identifier — it names the algorithm but carries no +/// configuration data. For a data-carrying request use [`RedactionSpec`](super::RedactionSpec); +/// for a data-carrying result use [`RedactionOutput`](super::RedactionOutput). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, From, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RedactionMethod { + /// Text/tabular redaction strategy. + Text(TextRedactionMethod), + /// Image/video redaction strategy. + Image(ImageRedactionMethod), + /// Audio redaction strategy. + Audio(AudioRedactionMethod), +} + +impl RedactionMethod { + /// Returns the text redaction method if this is a text variant. + pub fn as_text(&self) -> Option<TextRedactionMethod> { + match self { + Self::Text(m) => Some(*m), + _ => None, + } + } + + /// Returns the image redaction method if this is an image variant. + pub fn as_image(&self) -> Option<ImageRedactionMethod> { + match self { + Self::Image(m) => Some(*m), + _ => None, + } + } + + /// Returns the audio redaction method if this is an audio variant. + pub fn as_audio(&self) -> Option<AudioRedactionMethod> { + match self { + Self::Audio(m) => Some(*m), + _ => None, + } + } +} diff --git a/crates/nvisy-ontology/src/redaction/mod.rs b/crates/nvisy-ontology/src/redaction/mod.rs index a75b33c..5b735d1 100644 --- a/crates/nvisy-ontology/src/redaction/mod.rs +++ b/crates/nvisy-ontology/src/redaction/mod.rs @@ -1,7 +1,111 @@ -//! Redaction context and policy types. +//! Redaction methods, specifications, outputs, and records. +//! +//! This module contains three layers of redaction types: +//! +//! 1. **Method** ([`RedactionMethod`]) — a plain tag enum naming a redaction +//! strategy. Used as a lightweight identifier (e.g. in logs, serialized +//! references, or when the caller only needs to know *which* algorithm). +//! +//! 2. **Spec** ([`RedactionSpec`]) — a data-carrying enum that describes a +//! redaction request submitted to the engine: which method to apply and +//! the configuration parameters it needs (mask char, blur sigma, key id, +//! etc.). Used on [`PolicyRule`](crate::policy::PolicyRule) and +//! [`Policy`](crate::policy::Policy). +//! +//! 3. **Output** ([`RedactionOutput`]) — a data-carrying enum that records +//! what was actually done and the result data (replacement string, +//! ciphertext, shifted date, etc.). Stored on [`Redaction`]. +//! +//! All three are organized by modality: +//! - Text / tabular: [`TextRedactionMethod`], [`TextRedactionSpec`], [`TextRedactionOutput`] +//! - Image / video: [`ImageRedactionMethod`], [`ImageRedactionSpec`], [`ImageRedactionOutput`] +//! 
- Audio: [`AudioRedactionMethod`], [`AudioRedactionSpec`], [`AudioRedactionOutput`] -pub mod context; -pub mod policy; +pub mod method; +pub mod output; +pub mod review; +pub mod spec; +pub mod summary; -pub use context::{EntityRedactionRule, ManualAnnotation, RedactionContext}; -pub use policy::{Policy, PolicyRule}; +pub use method::{ + AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, +}; +pub use output::{ + AudioRedactionOutput, ImageRedactionOutput, RedactionOutput, TextRedactionOutput, +}; +pub use review::{ReviewDecision, ReviewStatus}; +pub use spec::{AudioRedactionSpec, ImageRedactionSpec, RedactionSpec, TextRedactionSpec}; +pub use summary::RedactionSummary; + +use serde::{Deserialize, Serialize}; +use uuid::Uuid; + +use nvisy_core::path::ContentSource; + +use crate::entity::Entity; +use crate::policy::Policy; + +/// Types that produce redaction decisions. +pub trait Redactable { + /// The entities detected in this content. + fn entities(&self) -> &[Entity]; + /// The policy governing redaction. + fn policy(&self) -> Option<&Policy>; +} + +/// A redaction decision recording how a specific entity was (or will be) redacted. +/// +/// Each `Redaction` is linked to exactly one [`Entity`](crate::entity::Entity) +/// via `entity_id`. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Redaction { + /// Content source identity and lineage. + #[serde(flatten)] + pub source: ContentSource, + /// Identifier of the entity being redacted. + pub entity_id: Uuid, + /// Redaction output recording the method used and its result data. + pub output: RedactionOutput, + /// The original sensitive value, retained for audit purposes. + #[serde(skip_serializing_if = "Option::is_none")] + pub original_value: Option<String>, + /// Identifier of the policy rule that triggered this redaction. + #[serde(skip_serializing_if = "Option::is_none")] + pub policy_rule_id: Option<Uuid>, + /// Whether the redaction has been applied to the output content. + pub applied: bool, + /// Version of this redaction record (starts at 1, incremented on modification). + pub version: u32, + /// Human review decision, if any. + #[serde(skip_serializing_if = "Option::is_none")] + pub review: Option<ReviewDecision>, +} + +impl Redaction { + /// Create a new pending redaction for the given entity. + pub fn new(entity_id: Uuid, output: impl Into<RedactionOutput>) -> Self { + Self { + source: ContentSource::new(), + entity_id, + output: output.into(), + original_value: None, + policy_rule_id: None, + applied: false, + version: 1, + review: None, + } + } + + /// Record the original sensitive value for audit trail purposes. + pub fn with_original_value(mut self, value: impl Into<String>) -> Self { + self.original_value = Some(value.into()); + self + } + + /// Associate this redaction with the policy rule that triggered it. + pub fn with_policy_rule_id(mut self, id: Uuid) -> Self { + self.policy_rule_id = Some(id); + self + } +} diff --git a/crates/nvisy-ontology/src/redaction/output.rs b/crates/nvisy-ontology/src/redaction/output.rs new file mode 100644 index 0000000..5392b9c --- /dev/null +++ b/crates/nvisy-ontology/src/redaction/output.rs @@ -0,0 +1,152 @@ +//! Data-carrying redaction output enums recording what was done. +//! +//! A [`RedactionOutput`] records the method that was applied and its result +//! data (replacement string, ciphertext, blur sigma, etc.). Stored on +//! [`Redaction`](super::Redaction). 
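+//!
+//! # Example
+//!
+//! A minimal sketch with hypothetical values (not taken from the crate's
+//! tests): recording a fixed-placeholder replacement and reading it back.
+//!
+//! ```ignore
+//! let output = RedactionOutput::Text(TextRedactionOutput::Replace {
+//!     replacement: "[REDACTED]".to_string(),
+//! });
+//! assert_eq!(output.replacement_value(), Some("[REDACTED]"));
+//! assert_eq!(output.method(), RedactionMethod::Text(TextRedactionMethod::Replace));
+//! ```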
+ +use derive_more::From; +use serde::{Deserialize, Serialize}; + +use super::method::{ + AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, +}; + +/// Text redaction output — records the method used and its replacement data. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum TextRedactionOutput { + /// Characters replaced with a mask character. + Mask { + replacement: String, + mask_char: char, + }, + /// Substituted with a fixed placeholder string. + Replace { replacement: String }, + /// Replaced with a one-way hash. + Hash { hash_value: String }, + /// Encrypted; recoverable with the referenced key. + Encrypt { ciphertext: String, key_id: String }, + /// Removed entirely from the output. + Remove, + /// Replaced with a synthetically generated value. + Synthesize { replacement: String }, + /// Replaced with a consistent pseudonym. + Pseudonymize { pseudonym: String }, + /// Replaced with a vault-backed reversible token. + Tokenize { + token: String, + vault_id: Option<String>, + }, + /// Aggregated into a range or bucket. + Aggregate { replacement: String }, + /// Generalized to a less precise value. + Generalize { + replacement: String, + level: Option<u32>, + }, + /// Date shifted by a consistent offset. + DateShift { + replacement: String, + offset_days: i64, + }, +} + +/// Image redaction output — records the method used and its parameters. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum ImageRedactionOutput { + /// Gaussian blur applied to the region. + Blur { sigma: f32 }, + /// Opaque block overlay on the region. + Block { color: [u8; 4] }, + /// Pixelation (mosaic) applied to the region. + Pixelate { block_size: u32 }, + /// Region replaced with a synthetic image. + Synthesize, +} + +/// Audio redaction output — records the method used. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum AudioRedactionOutput { + /// Segment replaced with silence. + Silence, + /// Segment removed entirely. + Remove, + /// Segment replaced with synthetic audio. + Synthesize, +} + +/// Unified redaction output that wraps modality-specific output variants. +/// +/// Carries method-specific result data (replacement strings, ciphertext, +/// blur sigma, etc.). Stored on [`Redaction`](super::Redaction). +#[derive(Debug, Clone, PartialEq, From, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RedactionOutput { + /// Text/tabular redaction output. + Text(TextRedactionOutput), + /// Image/video redaction output. + Image(ImageRedactionOutput), + /// Audio redaction output. + Audio(AudioRedactionOutput), +} + +impl RedactionOutput { + /// Returns the text replacement string, regardless of specific method. + /// + /// Used by apply actions that just need to know "what string goes here". + /// Returns `None` for image and audio outputs, or text `Remove`. + pub fn replacement_value(&self) -> Option<&str> { + match self { + Self::Text(t) => match t { + TextRedactionOutput::Mask { replacement, .. 
} => Some(replacement), + TextRedactionOutput::Replace { replacement } => Some(replacement), + TextRedactionOutput::Hash { hash_value } => Some(hash_value), + TextRedactionOutput::Encrypt { ciphertext, .. } => Some(ciphertext), + TextRedactionOutput::Remove => None, + TextRedactionOutput::Synthesize { replacement } => Some(replacement), + TextRedactionOutput::Pseudonymize { pseudonym } => Some(pseudonym), + TextRedactionOutput::Tokenize { token, .. } => Some(token), + TextRedactionOutput::Aggregate { replacement } => Some(replacement), + TextRedactionOutput::Generalize { replacement, .. } => Some(replacement), + TextRedactionOutput::DateShift { replacement, .. } => Some(replacement), + }, + Self::Image(_) | Self::Audio(_) => None, + } + } + + /// Returns the [`RedactionMethod`] tag this output corresponds to. + pub fn method(&self) -> RedactionMethod { + match self { + Self::Text(t) => RedactionMethod::Text(match t { + TextRedactionOutput::Mask { .. } => TextRedactionMethod::Mask, + TextRedactionOutput::Replace { .. } => TextRedactionMethod::Replace, + TextRedactionOutput::Hash { .. } => TextRedactionMethod::Hash, + TextRedactionOutput::Encrypt { .. } => TextRedactionMethod::Encrypt, + TextRedactionOutput::Remove => TextRedactionMethod::Remove, + TextRedactionOutput::Synthesize { .. } => TextRedactionMethod::Synthesize, + TextRedactionOutput::Pseudonymize { .. } => TextRedactionMethod::Pseudonymize, + TextRedactionOutput::Tokenize { .. } => TextRedactionMethod::Tokenize, + TextRedactionOutput::Aggregate { .. } => TextRedactionMethod::Aggregate, + TextRedactionOutput::Generalize { .. } => TextRedactionMethod::Generalize, + TextRedactionOutput::DateShift { .. } => TextRedactionMethod::DateShift, + }), + Self::Image(i) => RedactionMethod::Image(match i { + ImageRedactionOutput::Blur { .. } => ImageRedactionMethod::Blur, + ImageRedactionOutput::Block { .. } => ImageRedactionMethod::Block, + ImageRedactionOutput::Pixelate { .. } => ImageRedactionMethod::Pixelate, + ImageRedactionOutput::Synthesize => ImageRedactionMethod::Synthesize, + }), + Self::Audio(a) => RedactionMethod::Audio(match a { + AudioRedactionOutput::Silence => AudioRedactionMethod::Silence, + AudioRedactionOutput::Remove => AudioRedactionMethod::Remove, + AudioRedactionOutput::Synthesize => AudioRedactionMethod::Synthesize, + }), + } + } +} diff --git a/crates/nvisy-ontology/src/redaction/policy.rs b/crates/nvisy-ontology/src/redaction/policy.rs deleted file mode 100644 index 5e37676..0000000 --- a/crates/nvisy-ontology/src/redaction/policy.rs +++ /dev/null @@ -1,117 +0,0 @@ -//! Redaction policies and rules. - -use serde::{Deserialize, Serialize}; -use nvisy_core::path::ContentSource; -use crate::ontology::entity::EntityCategory; -use crate::ontology::redaction::RedactionMethod; - -/// A single rule within a redaction [`Policy`]. -/// -/// Rules specify which entity categories and types they match, the minimum -/// confidence threshold, and the redaction method to apply. Rules are -/// evaluated in ascending priority order. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct PolicyRule { - /// Unique identifier for this rule within its policy. - pub id: String, - /// Human-readable name for display purposes. - pub name: String, - /// Entity categories this rule applies to. Empty means all categories. - pub categories: Vec<EntityCategory>, - /// Specific entity type names this rule applies to. Empty means all types. 
- pub entity_types: Vec<String>, - /// Minimum detection confidence required for this rule to trigger. - pub confidence_threshold: f64, - /// Redaction strategy to apply when this rule matches. - pub method: RedactionMethod, - /// Template string for the replacement value (e.g. `"[REDACTED]"`). - pub replacement_template: String, - /// Whether this rule is active. Disabled rules are skipped during evaluation. - pub enabled: bool, - /// Evaluation priority (lower numbers are evaluated first). - pub priority: i32, -} - -/// A named redaction policy containing an ordered set of rules. -/// -/// Policies are evaluated by [`find_matching_rule`](Policy::find_matching_rule) -/// which returns the first matching enabled rule sorted by priority. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct Policy { - /// Content source identity and lineage. - #[serde(flatten)] - pub source: ContentSource, - /// Human-readable policy name. - pub name: String, - /// Ordered list of redaction rules. - pub rules: Vec<PolicyRule>, - /// Fallback redaction method when no rule matches. - pub default_method: RedactionMethod, - /// Fallback confidence threshold when no rule matches. - pub default_confidence_threshold: f64, -} - -impl Policy { - /// Create a new policy with the given name and rules, using default - /// fallback method ([`Mask`](RedactionMethod::Mask)) and threshold (0.5). - pub fn new(name: impl Into<String>, rules: Vec<PolicyRule>) -> Self { - Self { - source: ContentSource::new(), - name: name.into(), - rules, - default_method: RedactionMethod::Mask, - default_confidence_threshold: 0.5, - } - } - - /// Override the fallback redaction method. - pub fn with_default_method(mut self, method: RedactionMethod) -> Self { - self.default_method = method; - self - } - - /// Override the fallback confidence threshold. - pub fn with_default_confidence_threshold(mut self, threshold: f64) -> Self { - self.default_confidence_threshold = threshold; - self - } - - /// Find the first matching enabled rule for a given entity. - /// - /// Rules are sorted by priority (ascending). A rule matches when: - /// - It is enabled - /// - The entity's confidence meets the rule's threshold - /// - The entity's category is in the rule's categories (or categories is empty) - /// - The entity's type is in the rule's entityTypes (or entityTypes is empty) - pub fn find_matching_rule( - &self, - category: EntityCategory, - entity_type: &str, - confidence: f64, - ) -> Option<&PolicyRule> { - let mut sorted: Vec<&PolicyRule> = self.rules.iter().collect(); - sorted.sort_by_key(|r| r.priority); - - for rule in sorted { - if !rule.enabled { - continue; - } - if confidence < rule.confidence_threshold { - continue; - } - if !rule.categories.is_empty() && !rule.categories.contains(&category) { - continue; - } - if !rule.entity_types.is_empty() - && !rule.entity_types.iter().any(|t| t == entity_type) - { - continue; - } - return Some(rule); - } - - None - } -} diff --git a/crates/nvisy-ontology/src/redaction/review.rs b/crates/nvisy-ontology/src/redaction/review.rs new file mode 100644 index 0000000..d2e25fd --- /dev/null +++ b/crates/nvisy-ontology/src/redaction/review.rs @@ -0,0 +1,35 @@ +//! Human-in-the-loop review types. + +use jiff::Timestamp; +use serde::{Deserialize, Serialize}; + +/// Status of a human review on a redaction decision. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum ReviewStatus { + /// Awaiting human review. + Pending, + /// A human reviewer approved the redaction. + Approved, + /// A human reviewer rejected the redaction. + Rejected, + /// Automatically approved by policy (no human review required). + AutoApproved, +} + +/// A review decision recorded against a redaction. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct ReviewDecision { + /// Outcome of the review. + pub status: ReviewStatus, + /// Identifier of the reviewer (human or service account). + pub reviewer_id: String, + /// When the review decision was made. + #[cfg_attr(feature = "jsonschema", schemars(with = "String"))] + pub timestamp: Timestamp, + /// Optional reason for the decision. + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option<String>, +} diff --git a/crates/nvisy-ontology/src/redaction/spec.rs b/crates/nvisy-ontology/src/redaction/spec.rs new file mode 100644 index 0000000..5c7a039 --- /dev/null +++ b/crates/nvisy-ontology/src/redaction/spec.rs @@ -0,0 +1,168 @@ +//! Data-carrying redaction specifications submitted to the engine. +//! +//! A [`RedactionSpec`] describes *how* to redact — which method to apply and +//! the configuration parameters it needs (mask char, blur sigma, encryption +//! key id, etc.). Used on [`PolicyRule`](crate::policy::PolicyRule) and +//! [`Policy`](crate::policy::Policy). + +use derive_more::From; +use serde::{Deserialize, Serialize}; + +use super::method::{ + AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, +}; + +/// Text redaction specification with method-specific configuration. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum TextRedactionSpec { + /// Replace characters with a mask character. + Mask { + /// Character used for masking (default `'*'`). + #[serde(default = "default_mask_char")] + mask_char: char, + }, + /// Substitute with a fixed placeholder string. + Replace { + /// Template for the replacement (supports `{entityType}`, `{category}`, `{value}`). + #[serde(default)] + placeholder: String, + }, + /// Replace with a one-way hash. + Hash, + /// Encrypt the value; recoverable with the referenced key. + Encrypt { + /// Identifier of the encryption key to use. + key_id: String, + }, + /// Remove the value entirely. + Remove, + /// Replace with a synthetically generated value. + Synthesize, + /// Replace with a consistent pseudonym. + Pseudonymize, + /// Replace with a vault-backed reversible token. + Tokenize { + /// Identifier of the token vault. + #[serde(default)] + vault_id: Option<String>, + }, + /// Aggregate into a range or bucket. + Aggregate, + /// Generalize to a less precise value. + Generalize { + /// Generalization level (1 = city, 2 = state, etc.). + #[serde(default)] + level: Option<u32>, + }, + /// Shift dates by a consistent offset. + DateShift { + /// Fixed offset in days (0 = engine picks a random offset). + #[serde(default)] + offset_days: i64, + }, +} + +fn default_mask_char() -> char { + '*' +} + +/// Image redaction specification with method-specific configuration. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum ImageRedactionSpec { + /// Apply a gaussian blur. + Blur { + /// Blur sigma value. + #[serde(default = "default_sigma")] + sigma: f32, + }, + /// Overlay an opaque block. + Block { + /// RGBA color for the block. + #[serde(default = "default_block_color")] + color: [u8; 4], + }, + /// Apply pixelation (mosaic). + Pixelate { + /// Pixel block size. + #[serde(default = "default_block_size")] + block_size: u32, + }, + /// Replace with a synthetic region. + Synthesize, +} + +fn default_sigma() -> f32 { + 15.0 +} +fn default_block_color() -> [u8; 4] { + [0, 0, 0, 255] +} +fn default_block_size() -> u32 { + 10 +} + +/// Audio redaction specification. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(tag = "method", rename_all = "snake_case")] +pub enum AudioRedactionSpec { + /// Replace with silence. + Silence, + /// Remove the segment entirely. + Remove, + /// Replace with synthetic audio. + Synthesize, +} + +/// Unified redaction specification submitted to the engine. +/// +/// Carries the method to apply and its configuration parameters. +/// Used on [`PolicyRule`](crate::policy::PolicyRule) and +/// [`Policy`](crate::policy::Policy). +#[derive(Debug, Clone, PartialEq, From, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum RedactionSpec { + /// Text/tabular redaction specification. + Text(TextRedactionSpec), + /// Image/video redaction specification. + Image(ImageRedactionSpec), + /// Audio redaction specification. + Audio(AudioRedactionSpec), +} + +impl RedactionSpec { + /// Returns the [`RedactionMethod`] tag this spec corresponds to. + pub fn method(&self) -> RedactionMethod { + match self { + Self::Text(t) => RedactionMethod::Text(match t { + TextRedactionSpec::Mask { .. } => TextRedactionMethod::Mask, + TextRedactionSpec::Replace { .. } => TextRedactionMethod::Replace, + TextRedactionSpec::Hash => TextRedactionMethod::Hash, + TextRedactionSpec::Encrypt { .. } => TextRedactionMethod::Encrypt, + TextRedactionSpec::Remove => TextRedactionMethod::Remove, + TextRedactionSpec::Synthesize => TextRedactionMethod::Synthesize, + TextRedactionSpec::Pseudonymize => TextRedactionMethod::Pseudonymize, + TextRedactionSpec::Tokenize { .. } => TextRedactionMethod::Tokenize, + TextRedactionSpec::Aggregate => TextRedactionMethod::Aggregate, + TextRedactionSpec::Generalize { .. } => TextRedactionMethod::Generalize, + TextRedactionSpec::DateShift { .. } => TextRedactionMethod::DateShift, + }), + Self::Image(i) => RedactionMethod::Image(match i { + ImageRedactionSpec::Blur { .. } => ImageRedactionMethod::Blur, + ImageRedactionSpec::Block { .. } => ImageRedactionMethod::Block, + ImageRedactionSpec::Pixelate { .. 
} => ImageRedactionMethod::Pixelate, + ImageRedactionSpec::Synthesize => ImageRedactionMethod::Synthesize, + }), + Self::Audio(a) => RedactionMethod::Audio(match a { + AudioRedactionSpec::Silence => AudioRedactionMethod::Silence, + AudioRedactionSpec::Remove => AudioRedactionMethod::Remove, + AudioRedactionSpec::Synthesize => AudioRedactionMethod::Synthesize, + }), + } + } +} diff --git a/crates/nvisy-ontology/src/redaction/summary.rs b/crates/nvisy-ontology/src/redaction/summary.rs new file mode 100644 index 0000000..5246387 --- /dev/null +++ b/crates/nvisy-ontology/src/redaction/summary.rs @@ -0,0 +1,18 @@ +//! Per-source redaction summary. + +use serde::{Deserialize, Serialize}; + +use nvisy_core::path::ContentSource; + +/// Summary of redactions applied to a single content source. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct RedactionSummary { + /// The content source these counts apply to. + #[serde(flatten)] + pub source: ContentSource, + /// Number of redactions successfully applied. + pub redactions_applied: usize, + /// Number of redactions skipped (e.g. due to review holds or errors). + pub redactions_skipped: usize, +} diff --git a/crates/nvisy-pattern/src/patterns/mod.rs b/crates/nvisy-pattern/src/patterns/mod.rs index 2a3cc90..f4b79f2 100644 --- a/crates/nvisy-pattern/src/patterns/mod.rs +++ b/crates/nvisy-pattern/src/patterns/mod.rs @@ -9,7 +9,7 @@ pub mod validators; use std::collections::HashMap; use std::sync::LazyLock; -use nvisy_ontology::ontology::entity::EntityCategory; +use nvisy_ontology::entity::EntityCategory; /// JSON representation of a pattern loaded from disk. #[derive(Debug, Clone, serde::Deserialize)] @@ -52,7 +52,7 @@ fn parse_category(s: &str) -> EntityCategory { "phi" => EntityCategory::Phi, "financial" => EntityCategory::Financial, "credentials" => EntityCategory::Credentials, - _ => EntityCategory::Custom, + other => EntityCategory::Custom(other.to_string()), } } diff --git a/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs index 3b48527..0079853 100644 --- a/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs +++ b/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs @@ -5,8 +5,8 @@ use serde::Deserialize; use nvisy_ingest::handler::{FormatHandler, ImageHandler}; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::{BoundingBox, Entity}; -use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_ontology::entity::{BoundingBox, Entity}; +use nvisy_ontology::redaction::{ImageRedactionOutput, Redaction, RedactionOutput}; use nvisy_core::error::{Error, ErrorKind}; use crate::action::Action; @@ -69,12 +69,16 @@ impl Action for ApplyImageRedactionAction { let mut block_regions: Vec<BoundingBox> = Vec::new(); for entity in &entities { - if let Some(bbox) = &entity.location.bounding_box { + if let Some(bbox) = entity.location.bounding_box() { if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { - match redaction.method { - RedactionMethod::Blur => blur_regions.push(bbox.clone()), - RedactionMethod::Block => block_regions.push(bbox.clone()), - // Default non-image methods to block for images + match &redaction.output { + RedactionOutput::Image(ImageRedactionOutput::Blur { .. }) => { + blur_regions.push(bbox.clone()) + } + RedactionOutput::Image(ImageRedactionOutput::Block { .. 
}) => { + block_regions.push(bbox.clone()) + } + // Default non-image methods, pixelate, and synthesize to block _ => block_regions.push(bbox.clone()), } } diff --git a/crates/nvisy-pipeline/src/actions/apply_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_redaction.rs index cdce89a..f19480e 100644 --- a/crates/nvisy-pipeline/src/actions/apply_redaction.rs +++ b/crates/nvisy-pipeline/src/actions/apply_redaction.rs @@ -5,8 +5,8 @@ use uuid::Uuid; use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::ontology::redaction::Redaction; +use nvisy_ontology::entity::Entity; +use nvisy_ontology::redaction::Redaction; use nvisy_core::error::Error; use crate::action::Action; @@ -79,10 +79,25 @@ impl Action for ApplyRedactionAction { continue; } + let start_offset = match entity.location.start_offset() { + Some(s) => s, + None => continue, + }; + let end_offset = match entity.location.end_offset() { + Some(e) => e, + None => continue, + }; + + let replacement_value = redaction + .output + .replacement_value() + .unwrap_or("") + .to_string(); + pending.push(PendingRedaction { - start_offset: entity.location.start_offset, - end_offset: entity.location.end_offset, - replacement_value: redaction.replacement_value.clone(), + start_offset, + end_offset, + replacement_value, }); } diff --git a/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs index 686c5cc..aff358f 100644 --- a/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs +++ b/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs @@ -4,8 +4,8 @@ use serde::Deserialize; use nvisy_ingest::handler::FormatHandler; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_ontology::entity::Entity; +use nvisy_ontology::redaction::{Redaction, RedactionOutput, TextRedactionOutput}; use nvisy_core::error::Error; use crate::action::Action; @@ -53,18 +53,14 @@ impl Action for ApplyTabularRedactionAction { for entity in &entities { if let (Some(row_idx), Some(col_idx)) = - (entity.location.row_index, entity.location.column_index) + (entity.location.row_index(), entity.location.column_index()) { if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { for doc in &mut documents { if let Some(rows) = &mut doc.rows { if let Some(row) = rows.get_mut(row_idx) { if let Some(cell) = row.get_mut(col_idx) { - *cell = apply_cell_redaction( - cell, - redaction.method, - &redaction.replacement_value, - ); + *cell = apply_cell_redaction(cell, &redaction.output); } } } @@ -77,26 +73,25 @@ impl Action for ApplyTabularRedactionAction { } } -fn apply_cell_redaction(cell: &str, method: RedactionMethod, replacement: &str) -> String { - match method { - RedactionMethod::Mask => { +fn apply_cell_redaction(cell: &str, output: &RedactionOutput) -> String { + match output { + RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { // Mask all but last 4 characters if cell.len() > 4 { format!( "{}{}", - "*".repeat(cell.len() - 4), + mask_char.to_string().repeat(cell.len() - 4), &cell[cell.len() - 4..] 
) } else { - "*".repeat(cell.len()) + mask_char.to_string().repeat(cell.len()) } } - RedactionMethod::Replace => replacement.to_string(), - RedactionMethod::Remove => String::new(), - RedactionMethod::Hash => { + RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), + RedactionOutput::Text(TextRedactionOutput::Hash { .. }) => { format!("[HASH:{:x}]", hash_string(cell)) } - _ => replacement.to_string(), + _ => output.replacement_value().unwrap_or("").to_string(), } } diff --git a/crates/nvisy-pipeline/src/actions/classify.rs b/crates/nvisy-pipeline/src/actions/classify.rs index a6387c0..4b6cc3b 100644 --- a/crates/nvisy-pipeline/src/actions/classify.rs +++ b/crates/nvisy-pipeline/src/actions/classify.rs @@ -1,23 +1,16 @@ //! Sensitivity classification action. -use nvisy_ontology::ontology::entity::Entity; +pub use nvisy_ontology::detection::ClassificationResult; +use nvisy_ontology::detection::SensitivityLevel; +use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use crate::action::Action; -/// Result of sensitivity classification. -pub struct ClassificationResult { - /// The computed sensitivity level (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`). - pub sensitivity_level: String, - /// Total number of entities considered. - pub total_entities: usize, -} - /// Assigns a sensitivity level based on detected entities. /// -/// The action inspects the entities, computes a sensitivity level -/// (`"none"`, `"low"`, `"medium"`, `"high"`, or `"critical"`), and returns -/// a [`ClassificationResult`]. +/// The action inspects the entities, computes a [`SensitivityLevel`], and +/// returns a [`ClassificationResult`]. pub struct ClassifyAction; #[async_trait::async_trait] @@ -44,40 +37,37 @@ impl Action for ClassifyAction { Ok(ClassificationResult { sensitivity_level, total_entities, + risk_score: None, }) } } -/// Computes a sensitivity level string from a set of detected entities. +/// Computes a sensitivity level from a set of detected entities. /// /// The heuristic is: -/// - `"none"` -- no entities. -/// - `"critical"` -- at least one high-confidence (>= 0.9) credential, SSN, or credit card. -/// - `"high"` -- any critical type present, or more than 10 entities total. -/// - `"medium"` -- more than 3 entities. -/// - `"low"` -- 1-3 non-critical entities. -fn compute_sensitivity_level(entities: &[Entity]) -> String { +/// - [`Public`](SensitivityLevel::Public) — no entities. +/// - [`Restricted`](SensitivityLevel::Restricted) — at least one high-confidence (>= 0.9) credential, SSN, or credit card. +/// - [`Confidential`](SensitivityLevel::Confidential) — any critical type present, or more than 10 entities total. +/// - [`Internal`](SensitivityLevel::Internal) — 1–10 non-critical entities. 
+fn compute_sensitivity_level(entities: &[Entity]) -> SensitivityLevel { if entities.is_empty() { - return "none".to_string(); + return SensitivityLevel::Public; } let has_high_confidence = entities.iter().any(|e| e.confidence >= 0.9); let has_critical_types = entities.iter().any(|e| { matches!( e.category, - nvisy_ontology::ontology::entity::EntityCategory::Credentials + nvisy_ontology::entity::EntityCategory::Credentials ) || e.entity_type == "ssn" || e.entity_type == "credit_card" }); if has_critical_types && has_high_confidence { - return "critical".to_string(); + return SensitivityLevel::Restricted; } if has_critical_types || entities.len() > 10 { - return "high".to_string(); - } - if entities.len() > 3 { - return "medium".to_string(); + return SensitivityLevel::Confidential; } - "low".to_string() + SensitivityLevel::Internal } diff --git a/crates/nvisy-pipeline/src/actions/detect_checksum.rs b/crates/nvisy-pipeline/src/actions/detect_checksum.rs index aafb39d..e28a302 100644 --- a/crates/nvisy-pipeline/src/actions/detect_checksum.rs +++ b/crates/nvisy-pipeline/src/actions/detect_checksum.rs @@ -2,7 +2,7 @@ use serde::Deserialize; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity}; +use nvisy_ontology::entity::{DetectionMethod, Entity}; use nvisy_core::error::Error; use nvisy_pattern::patterns::validators::luhn_check; @@ -72,7 +72,7 @@ impl Action for DetectChecksumAction { if is_valid { let mut boosted = Entity::new( - entity.category, + entity.category.clone(), &entity.entity_type, &entity.value, DetectionMethod::Checksum, diff --git a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs index c5a75b9..c74bf75 100644 --- a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs +++ b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs @@ -5,7 +5,9 @@ use serde::Deserialize; use nvisy_ingest::handler::FormatHandler; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; +use nvisy_ontology::entity::{ + DetectionMethod, Entity, EntityCategory, EntityLocation, TabularLocation, TextLocation, +}; use nvisy_core::error::{Error, ErrorKind}; use nvisy_pattern::dictionaries; @@ -86,21 +88,19 @@ impl Action for DetectDictionaryAction { for mat in ac.find_iter(content) { let value = &values[mat.pattern().as_usize()]; let entity = Entity::new( - def.category, + def.category.clone(), &def.entity_type, value.as_str(), DetectionMethod::Dictionary, confidence, - EntityLocation { + EntityLocation::Text(TextLocation { start_offset: mat.start(), end_offset: mat.end(), + context_start_offset: None, + context_end_offset: None, element_id: None, page_number: None, - bounding_box: None, - row_index: None, - column_index: None, - image_id: None, - }, + }), ) .with_parent(&doc.source); entities.push(entity); @@ -119,21 +119,17 @@ impl Action for DetectDictionaryAction { for mat in ac.find_iter(cell) { let value = &values[mat.pattern().as_usize()]; let entity = Entity::new( - def.category, + def.category.clone(), &def.entity_type, value.as_str(), DetectionMethod::Dictionary, confidence, - EntityLocation { - start_offset: mat.start(), - end_offset: mat.end(), - element_id: None, - page_number: None, - bounding_box: None, - row_index: Some(row_idx), - column_index: Some(col_idx), - image_id: None, - }, + EntityLocation::Tabular(TabularLocation { + row_index: row_idx, + column_index: col_idx, + start_offset: Some(mat.start()), + end_offset: Some(mat.end()), 
+ }), ) .with_parent(&doc.source); entities.push(entity); diff --git a/crates/nvisy-pipeline/src/actions/detect_manual.rs b/crates/nvisy-pipeline/src/actions/detect_manual.rs index 121b9ec..1dafc3a 100644 --- a/crates/nvisy-pipeline/src/actions/detect_manual.rs +++ b/crates/nvisy-pipeline/src/actions/detect_manual.rs @@ -1,11 +1,11 @@ //! Manual annotation detection action. //! -//! Converts user-provided [`ManualAnnotation`]s into full [`Entity`] objects. +//! Converts user-provided inclusion [`Annotation`]s into full [`Entity`] objects. use serde::Deserialize; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; -use nvisy_ontology::redaction::ManualAnnotation; +use nvisy_ontology::entity::{DetectionMethod, Entity}; +use nvisy_ontology::detection::{Annotation, AnnotationKind}; use nvisy_core::error::Error; use crate::action::Action; @@ -15,7 +15,7 @@ use crate::action::Action; #[serde(rename_all = "camelCase")] pub struct DetectManualParams {} -/// Converts each [`ManualAnnotation`] into a full [`Entity`] with +/// Converts each inclusion [`Annotation`] into a full [`Entity`] with /// `DetectionMethod::Manual` and confidence 1.0. pub struct DetectManualAction { params: DetectManualParams, @@ -24,7 +24,7 @@ pub struct DetectManualAction { #[async_trait::async_trait] impl Action for DetectManualAction { type Params = DetectManualParams; - type Input = Vec<ManualAnnotation>; + type Input = Vec<Annotation>; type Output = Vec<Entity>; fn id(&self) -> &str { @@ -42,22 +42,30 @@ impl Action for DetectManualAction { let mut entities = Vec::new(); for ann in &annotations { + if ann.kind != AnnotationKind::Inclusion { + continue; + } + let category = match &ann.category { + Some(c) => c.clone(), + None => continue, + }; + let entity_type = match &ann.entity_type { + Some(t) => t.clone(), + None => continue, + }; + let value = ann.value.clone().unwrap_or_default(); + let location = match &ann.location { + Some(l) => l.clone(), + None => continue, + }; + let entity = Entity::new( - ann.category, - &ann.entity_type, - &ann.value, + category, + entity_type, + value, DetectionMethod::Manual, 1.0, - EntityLocation { - start_offset: ann.start_offset.unwrap_or(0), - end_offset: ann.end_offset.unwrap_or(0), - element_id: None, - page_number: ann.page_number, - bounding_box: ann.bounding_box.clone(), - row_index: ann.row_index, - column_index: ann.column_index, - image_id: None, - }, + location, ); entities.push(entity); diff --git a/crates/nvisy-pipeline/src/actions/detect_regex.rs b/crates/nvisy-pipeline/src/actions/detect_regex.rs index e3c59f7..bd0ee3d 100644 --- a/crates/nvisy-pipeline/src/actions/detect_regex.rs +++ b/crates/nvisy-pipeline/src/actions/detect_regex.rs @@ -5,7 +5,7 @@ use serde::Deserialize; use nvisy_ingest::handler::FormatHandler; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityLocation}; +use nvisy_ontology::entity::{DetectionMethod, Entity, EntityLocation, TextLocation}; use nvisy_core::error::Error; use nvisy_pattern::patterns::{self, PatternDefinition}; @@ -75,21 +75,19 @@ impl Action for DetectRegexAction { } let entity = Entity::new( - pattern.category, + pattern.category.clone(), &pattern.entity_type, value, DetectionMethod::Regex, pattern.confidence, - EntityLocation { + EntityLocation::Text(TextLocation { start_offset: mat.start(), end_offset: mat.end(), + context_start_offset: None, + context_end_offset: None, element_id: None, page_number: None, - bounding_box: None, - row_index: None, 
- column_index: None, - image_id: None, - }, + }), ) .with_parent(&doc.source); diff --git a/crates/nvisy-pipeline/src/actions/detect_tabular.rs b/crates/nvisy-pipeline/src/actions/detect_tabular.rs index c6f1e91..b68bff0 100644 --- a/crates/nvisy-pipeline/src/actions/detect_tabular.rs +++ b/crates/nvisy-pipeline/src/actions/detect_tabular.rs @@ -5,7 +5,9 @@ use serde::Deserialize; use nvisy_ingest::handler::FormatHandler; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation}; +use nvisy_ontology::entity::{ + DetectionMethod, Entity, EntityCategory, EntityLocation, TabularLocation, +}; use nvisy_core::error::{Error, ErrorKind}; use crate::action::Action; @@ -99,21 +101,17 @@ impl Action for DetectTabularAction { } let entity = Entity::new( - rule.category, + rule.category.clone(), &rule.entity_type, cell.as_str(), DetectionMethod::Composite, 0.9, - EntityLocation { - start_offset: 0, - end_offset: cell.len(), - element_id: None, - page_number: None, - bounding_box: None, - row_index: Some(row_idx), - column_index: Some(col_idx), - image_id: None, - }, + EntityLocation::Tabular(TabularLocation { + row_index: row_idx, + column_index: col_idx, + start_offset: Some(0), + end_offset: Some(cell.len()), + }), ) .with_parent(&doc.source); diff --git a/crates/nvisy-pipeline/src/actions/emit_audit.rs b/crates/nvisy-pipeline/src/actions/emit_audit.rs index 924ad72..c04e91c 100644 --- a/crates/nvisy-pipeline/src/actions/emit_audit.rs +++ b/crates/nvisy-pipeline/src/actions/emit_audit.rs @@ -3,8 +3,8 @@ use serde::Deserialize; use uuid::Uuid; -use nvisy_ontology::ontology::audit::{Audit, AuditAction}; -use nvisy_ontology::ontology::redaction::Redaction; +use nvisy_ontology::audit::{Audit, AuditAction}; +use nvisy_ontology::redaction::Redaction; use nvisy_core::error::Error; use crate::action::Action; @@ -66,17 +66,13 @@ impl Action for EmitAuditAction { let mut details = serde_json::Map::new(); details.insert( - "method".to_string(), - serde_json::to_value(redaction.method).unwrap_or_default(), + "output".to_string(), + serde_json::to_value(&redaction.output).unwrap_or_default(), ); - details.insert( - "replacementValue".to_string(), - serde_json::Value::String(redaction.replacement_value.clone()), - ); - if let Some(ref rule_id) = redaction.policy_rule_id { + if let Some(rule_id) = redaction.policy_rule_id { details.insert( "policyRuleId".to_string(), - serde_json::Value::String(rule_id.clone()), + serde_json::Value::String(rule_id.to_string()), ); } audit = audit.with_details(details); diff --git a/crates/nvisy-pipeline/src/actions/evaluate_policy.rs b/crates/nvisy-pipeline/src/actions/evaluate_policy.rs index bbba900..7a68383 100644 --- a/crates/nvisy-pipeline/src/actions/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/actions/evaluate_policy.rs @@ -2,9 +2,12 @@ use serde::Deserialize; -use nvisy_ontology::ontology::entity::Entity; -use nvisy_ontology::redaction::policy::PolicyRule; -use nvisy_ontology::ontology::redaction::{Redaction, RedactionMethod}; +use nvisy_ontology::entity::Entity; +use nvisy_ontology::policy::PolicyRule; +use nvisy_ontology::redaction::{ + AudioRedactionOutput, AudioRedactionSpec, ImageRedactionOutput, ImageRedactionSpec, Redaction, + RedactionOutput, RedactionSpec, TextRedactionOutput, TextRedactionSpec, +}; use nvisy_core::error::Error; use crate::action::Action; @@ -16,16 +19,16 @@ pub struct EvaluatePolicyParams { /// Ordered policy rules to evaluate. 
#[serde(default)] pub rules: Vec<PolicyRule>, - /// Fallback redaction method when no rule matches. - #[serde(default = "default_method")] - pub default_method: RedactionMethod, + /// Fallback redaction specification when no rule matches. + #[serde(default = "default_spec")] + pub default_spec: RedactionSpec, /// Fallback confidence threshold. #[serde(default = "default_threshold")] pub default_confidence_threshold: f64, } -fn default_method() -> RedactionMethod { - RedactionMethod::Mask +fn default_spec() -> RedactionSpec { + RedactionSpec::Text(TextRedactionSpec::Mask { mask_char: '*' }) } fn default_threshold() -> f64 { 0.5 @@ -34,7 +37,7 @@ fn default_threshold() -> f64 { /// Evaluates policy rules against detected entities and produces [`Redaction`] instructions. /// /// For each entity the action finds the first matching rule (sorted by priority), -/// applies its redaction method and replacement template, and creates a +/// applies its redaction spec and replacement template, and creates a /// [`Redaction`]. Entities that fall below the confidence threshold are skipped. pub struct EvaluatePolicyAction { params: EvaluatePolicyParams, @@ -58,7 +61,7 @@ impl Action for EvaluatePolicyAction { &self, entities: Self::Input, ) -> Result<Vec<Redaction>, Error> { - let default_method = self.params.default_method; + let default_spec = &self.params.default_spec; let default_threshold = self.params.default_confidence_threshold; let mut sorted_rules = self.params.rules.clone(); @@ -68,26 +71,22 @@ impl Action for EvaluatePolicyAction { for entity in &entities { let rule = find_matching_rule(entity, &sorted_rules); - let method = rule.map(|r| r.method).unwrap_or(default_method); - let threshold = rule - .map(|r| r.confidence_threshold) - .unwrap_or(default_threshold); + let spec = rule.map(|r| &r.spec).unwrap_or(default_spec); - if entity.confidence < threshold { + if rule.is_none() && entity.confidence < default_threshold { continue; } - let replacement_value = if let Some(r) = rule { - apply_template(&r.replacement_template, entity) + let output = if let Some(r) = rule { + build_output_from_template(spec, &r.replacement_template, entity) } else { - apply_default_mask(entity, default_method) + build_default_output(entity, spec) }; - let mut redaction = - Redaction::new(entity.source.as_uuid(), method, replacement_value); + let mut redaction = Redaction::new(entity.source.as_uuid(), output); redaction = redaction.with_original_value(&entity.value); if let Some(r) = rule { - redaction = redaction.with_policy_rule_id(&r.id); + redaction = redaction.with_policy_rule_id(r.id); } redaction.source.set_parent_id(Some(entity.source.as_uuid())); @@ -98,25 +97,16 @@ impl Action for EvaluatePolicyAction { } } -/// Returns the first enabled rule whose category/entity-type filters and confidence -/// threshold match the given entity, or `None` if no rule applies. +/// Returns the first enabled rule whose [`EntitySelector`] matches the given entity, +/// or `None` if no rule applies. 
fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&'a PolicyRule> { for rule in rules { if !rule.enabled { continue; } - if entity.confidence < rule.confidence_threshold { - continue; - } - if !rule.categories.is_empty() && !rule.categories.contains(&entity.category) { - continue; + if rule.selector.matches(&entity.category, &entity.entity_type, entity.confidence) { + return Some(rule); } - if !rule.entity_types.is_empty() - && !rule.entity_types.iter().any(|t| t == &entity.entity_type) - { - continue; - } - return Some(rule); } None } @@ -134,16 +124,106 @@ fn apply_template(template: &str, entity: &Entity) -> String { .replace("{value}", &entity.value) } -/// Generates a replacement string for an entity using the given default redaction method. -fn apply_default_mask(entity: &Entity, method: RedactionMethod) -> String { - match method { - RedactionMethod::Mask => "*".repeat(entity.value.len()), - RedactionMethod::Replace => format!("[{}]", entity.entity_type.to_uppercase()), - RedactionMethod::Remove => String::new(), - RedactionMethod::Hash => format!("[HASH:{}]", entity.entity_type), - RedactionMethod::Encrypt => format!("[ENC:{}]", entity.entity_type), - RedactionMethod::Blur => format!("[BLURRED:{}]", entity.entity_type), - RedactionMethod::Block => "\u{2588}".repeat(entity.value.len()), - RedactionMethod::Synthesize => format!("[SYNTH:{}]", entity.entity_type), +/// Builds a [`RedactionOutput`] from a spec and a policy rule's replacement template. +fn build_output_from_template( + spec: &RedactionSpec, + template: &str, + entity: &Entity, +) -> RedactionOutput { + let replacement = apply_template(template, entity); + build_output_with_replacement(spec, replacement) +} + +/// Generates a [`RedactionOutput`] for an entity using the given default redaction spec. +fn build_default_output(entity: &Entity, spec: &RedactionSpec) -> RedactionOutput { + match spec { + RedactionSpec::Text(text) => { + let replacement = match text { + TextRedactionSpec::Mask { mask_char } => { + mask_char.to_string().repeat(entity.value.len()) + } + TextRedactionSpec::Replace { placeholder } => { + if placeholder.is_empty() { + format!("[{}]", entity.entity_type.to_uppercase()) + } else { + apply_template(placeholder, entity) + } + } + TextRedactionSpec::Remove => String::new(), + TextRedactionSpec::Hash => format!("[HASH:{}]", entity.entity_type), + TextRedactionSpec::Encrypt { .. } => format!("[ENC:{}]", entity.entity_type), + TextRedactionSpec::Synthesize => format!("[SYNTH:{}]", entity.entity_type), + TextRedactionSpec::Pseudonymize => format!("[PSEUDO:{}]", entity.entity_type), + TextRedactionSpec::Tokenize { .. } => format!("[TOKEN:{}]", entity.entity_type), + TextRedactionSpec::Aggregate => format!("[AGG:{}]", entity.entity_type), + TextRedactionSpec::Generalize { .. } => format!("[GEN:{}]", entity.entity_type), + TextRedactionSpec::DateShift { .. 
} => format!("[SHIFTED:{}]", entity.entity_type), + }; + build_output_with_replacement(spec, replacement) + } + RedactionSpec::Image(img) => RedactionOutput::Image(match img { + ImageRedactionSpec::Blur { sigma } => ImageRedactionOutput::Blur { sigma: *sigma }, + ImageRedactionSpec::Block { color } => ImageRedactionOutput::Block { color: *color }, + ImageRedactionSpec::Pixelate { block_size } => { + ImageRedactionOutput::Pixelate { block_size: *block_size } + } + ImageRedactionSpec::Synthesize => ImageRedactionOutput::Synthesize, + }), + RedactionSpec::Audio(audio) => RedactionOutput::Audio(match audio { + AudioRedactionSpec::Silence => AudioRedactionOutput::Silence, + AudioRedactionSpec::Remove => AudioRedactionOutput::Remove, + AudioRedactionSpec::Synthesize => AudioRedactionOutput::Synthesize, + }), + } +} + +/// Builds a [`RedactionOutput`] from a spec and a replacement string. +fn build_output_with_replacement(spec: &RedactionSpec, replacement: String) -> RedactionOutput { + match spec { + RedactionSpec::Text(text) => RedactionOutput::Text(match text { + TextRedactionSpec::Mask { mask_char } => TextRedactionOutput::Mask { + replacement, + mask_char: *mask_char, + }, + TextRedactionSpec::Replace { .. } => TextRedactionOutput::Replace { replacement }, + TextRedactionSpec::Hash => TextRedactionOutput::Hash { + hash_value: replacement, + }, + TextRedactionSpec::Encrypt { key_id } => TextRedactionOutput::Encrypt { + ciphertext: replacement, + key_id: key_id.clone(), + }, + TextRedactionSpec::Remove => TextRedactionOutput::Remove, + TextRedactionSpec::Synthesize => TextRedactionOutput::Synthesize { replacement }, + TextRedactionSpec::Pseudonymize => TextRedactionOutput::Pseudonymize { + pseudonym: replacement, + }, + TextRedactionSpec::Tokenize { vault_id } => TextRedactionOutput::Tokenize { + token: replacement, + vault_id: vault_id.clone(), + }, + TextRedactionSpec::Aggregate => TextRedactionOutput::Aggregate { replacement }, + TextRedactionSpec::Generalize { level } => TextRedactionOutput::Generalize { + replacement, + level: *level, + }, + TextRedactionSpec::DateShift { offset_days } => TextRedactionOutput::DateShift { + replacement, + offset_days: *offset_days, + }, + }), + RedactionSpec::Image(img) => RedactionOutput::Image(match img { + ImageRedactionSpec::Blur { sigma } => ImageRedactionOutput::Blur { sigma: *sigma }, + ImageRedactionSpec::Block { color } => ImageRedactionOutput::Block { color: *color }, + ImageRedactionSpec::Pixelate { block_size } => { + ImageRedactionOutput::Pixelate { block_size: *block_size } + } + ImageRedactionSpec::Synthesize => ImageRedactionOutput::Synthesize, + }), + RedactionSpec::Audio(audio) => RedactionOutput::Audio(match audio { + AudioRedactionSpec::Silence => AudioRedactionOutput::Silence, + AudioRedactionSpec::Remove => AudioRedactionOutput::Remove, + AudioRedactionSpec::Synthesize => AudioRedactionOutput::Synthesize, + }), } } diff --git a/crates/nvisy-pipeline/src/render/block.rs b/crates/nvisy-pipeline/src/render/block.rs index a707914..05d69a3 100644 --- a/crates/nvisy-pipeline/src/render/block.rs +++ b/crates/nvisy-pipeline/src/render/block.rs @@ -1,7 +1,7 @@ //! Solid color block overlay for image regions. use image::{DynamicImage, Rgba, RgbaImage}; -use nvisy_ontology::ontology::entity::BoundingBox; +use nvisy_ontology::entity::BoundingBox; /// Apply a solid color block overlay to the specified regions of an image. 
/// diff --git a/crates/nvisy-pipeline/src/render/blur.rs b/crates/nvisy-pipeline/src/render/blur.rs index 4c7e56d..468c49e 100644 --- a/crates/nvisy-pipeline/src/render/blur.rs +++ b/crates/nvisy-pipeline/src/render/blur.rs @@ -2,7 +2,7 @@ use image::DynamicImage; use imageproc::filter::gaussian_blur_f32; -use nvisy_ontology::ontology::entity::BoundingBox; +use nvisy_ontology::entity::BoundingBox; /// Apply gaussian blur to the specified regions of an image. /// diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index 089c47e..4986b46 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -12,7 +12,7 @@ use serde::Deserialize; use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; use nvisy_pipeline::action::Action; diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs index 763d700..34d84ef 100644 --- a/crates/nvisy-python/src/actions/ocr.rs +++ b/crates/nvisy-python/src/actions/ocr.rs @@ -4,7 +4,7 @@ use serde::Deserialize; use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; use nvisy_ingest::document::Document; -use nvisy_ontology::ontology::entity::Entity; +use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; use nvisy_pipeline::action::Action; diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index 8259e04..a3639e3 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -6,9 +6,8 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; -use nvisy_ontology::ontology::entity::{Entity, EntityLocation}; +use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation, TextLocation}; use nvisy_core::error::Error; -use nvisy_ontology::ontology::entity::{DetectionMethod, EntityCategory}; use crate::bridge::PythonBridge; use crate::error::from_pyerr; @@ -124,7 +123,7 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve "phi" => EntityCategory::Phi, "financial" => EntityCategory::Financial, "credentials" => EntityCategory::Credentials, - _ => EntityCategory::Custom, + other => EntityCategory::Custom(other.to_string()), }; let entity_type: String = dict @@ -164,18 +163,16 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve category, entity_type, value, - DetectionMethod::AiNer, + DetectionMethod::Ner, confidence, - EntityLocation { + EntityLocation::Text(TextLocation { start_offset, end_offset, + context_start_offset: None, + context_end_offset: None, element_id: None, page_number: None, - bounding_box: None, - row_index: None, - column_index: None, - image_id: None, - }, + }), ); entities.push(entity); diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs index 643c831..7ad9b54 100644 --- a/crates/nvisy-python/src/ocr/mod.rs +++ b/crates/nvisy-python/src/ocr/mod.rs @@ -7,9 +7,10 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; -use nvisy_ontology::ontology::entity::{BoundingBox, Entity, EntityLocation}; +use nvisy_ontology::entity::{ + BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, ImageLocation, +}; use nvisy_core::error::Error; -use nvisy_ontology::ontology::entity::{DetectionMethod, EntityCategory}; use 
crate::bridge::PythonBridge; use crate::error::from_pyerr; @@ -132,16 +133,11 @@ fn parse_ocr_results(result: Bound<'_, PyAny>) -> Result<Vec<Entity>, Error> { &text, DetectionMethod::Ocr, confidence, - EntityLocation { - start_offset: 0, - end_offset: text.len(), - element_id: None, - page_number: None, - bounding_box: Some(BoundingBox { x, y, width, height }), - row_index: None, - column_index: None, + EntityLocation::Image(ImageLocation { + bounding_box: BoundingBox { x, y, width, height }, image_id: None, - }, + page_number: None, + }), ); entities.push(entity); diff --git a/docs/DETECTION.md b/docs/DETECTION.md index 59d6d51..9a297d8 100644 --- a/docs/DETECTION.md +++ b/docs/DETECTION.md @@ -43,18 +43,45 @@ Audio content introduces temporal and speaker-level dimensions to detection: - **Direct waveform redaction**: Replacement of sensitive audio segments with silence, tones, or noise at the waveform level. - **Speaker-specific redaction**: Selective redaction of content from identified speakers while preserving contributions from others, enabled by speaker diarization. -## 7. Detection Orchestration +## 7. Policies + +Detection is governed by policies — declarative rule sets that define what to detect and how to handle it. A policy is the primary configuration surface through which administrators, compliance officers, and integrators express redaction intent without modifying detection code. + +### 7.1 Policy Structure + +A policy is a named, versioned collection of rules. Each rule specifies: + +- **What to detect**: An entity type, pattern, or semantic category (e.g., "SSN", "face", "medical diagnosis"). +- **Detection parameters**: Confidence thresholds, locale constraints, and any modality-specific settings. +- **Redaction action**: The redaction method to apply when the rule matches (mask, blur, replace, or suppress). +- **Additional context**: Free-form or structured metadata attached to a rule that provides guidance to downstream stages — for example, a justification string, a regulatory citation, or instructions for human reviewers. + +### 7.2 Policy Composition + +Policies may extend other policies. A child policy inherits all rules from its parent and may add new rules, override inherited rules, or narrow inherited thresholds. This composition model enables organizations to maintain a base compliance policy (e.g., "HIPAA") and extend it with organization-specific additions without duplicating the base rule set. + +### 7.3 Annotations + +Files submitted for processing may carry annotations — either provided by the user at submission time or attached as part of a broader context (e.g., a case management system that tags documents with classification labels). Annotations can include: + +- **Pre-identified regions**: Bounding boxes, text spans, or time ranges that the submitter has already marked as sensitive, bypassing or supplementing automated detection. +- **Classification labels**: Document-level or region-level labels (e.g., "contains PHI", "attorney-client privileged") that influence which policy rules apply. +- **Exclusion markers**: Regions or entities explicitly marked as non-sensitive, instructing the detection engine to skip them. + +The detection engine must consume annotations as first-class inputs alongside its own detection results, merging user-provided and machine-generated findings into a unified annotation set before redaction. + +## 8. 
Detection Orchestration Individual detection strategies — deterministic, ML-based, vision, and audio — must be composed into a coherent pipeline rather than operating in isolation. -### 7.1 Tiered Execution +### 8.1 Tiered Execution Detection should proceed in tiers ordered by cost and specificity. Deterministic patterns (regex, checksums) execute first, providing high-precision results at minimal computational cost. ML and vision models execute subsequently, targeting content that deterministic methods cannot address. This tiered architecture avoids unnecessary GPU inference for content that can be resolved through pattern matching alone. -### 7.2 Result Merging +### 8.2 Result Merging When multiple detection strategies identify overlapping or adjacent sensitive regions within the same content, the platform must merge results into a unified set of detection annotations. Overlapping detections should be consolidated rather than duplicated. Each merged annotation must retain provenance — which strategies contributed to the detection and at what confidence level. -### 7.3 Conflict Resolution +### 8.3 Conflict Resolution When detection strategies disagree — for example, a regex match identifies a number as a credit card while an NER model classifies the surrounding context as non-sensitive — the platform must apply configurable conflict resolution rules. Default behavior should favor the higher-confidence or higher-sensitivity classification, but administrators must be able to override this through policy. diff --git a/docs/INGESTION.md b/docs/INGESTION.md index f7fb4f9..3d2a208 100644 --- a/docs/INGESTION.md +++ b/docs/INGESTION.md @@ -8,14 +8,34 @@ The quality of the ingestion layer is a critical success factor. Redaction platf ## 2. Supported Input Formats -The platform must support ingestion across the following modalities: +The platform must support ingestion across multiple modalities. Formats are organized into tiers reflecting implementation priority and expected coverage at each stage of the product lifecycle. -- **Documents**: PDF (native and scanned), DOCX, HTML, plain text -- **Images**: JPG, PNG, TIFF, and other common raster formats -- **Video**: Standard container formats with frame-level extraction -- **Audio**: WAV, MP3, and other common audio formats -- **Structured data**: CSV, JSON, and database connectors -- **Communications**: Email (with attachments), chat logs (Slack, Teams, WhatsApp exports) +### Tier 1 — Core (launch requirement) + +These formats represent the most common inputs in regulated enterprise environments and must be supported at general availability: + +- **PDF**: Native (digitally authored) and scanned, including multi-page documents with mixed content (text, images, tables, forms). +- **Images**: JPG, PNG, TIFF — the dominant formats for scanned documents, photographs, and screenshots. +- **Plain text and markup**: TXT, HTML, and Markdown. +- **Structured data**: CSV and JSON. + +### Tier 2 — Extended (near-term) + +These formats are frequently encountered in enterprise workflows and should be supported shortly after launch: + +- **Office documents**: DOCX, XLSX, PPTX. +- **Audio**: WAV, MP3, and other common audio formats. +- **Video**: Standard container formats (MP4, MOV, AVI) with frame-level extraction. +- **Email**: EML and MSG formats, including inline content and attachments (recursively ingested). 
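As an illustrative aside (not part of the platform's API; the enum and function names here are assumptions made for this sketch), the tier split above can be expressed as a small lookup that ingestion code could consult before dispatching a file to a handler:

```rust
/// Coverage tier for an input format (illustrative sketch only; the real
/// platform may model this differently).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FormatTier {
    /// Tier 1: required at general availability.
    Core,
    /// Tier 2: expected shortly after launch.
    Extended,
    /// Tier 3: roadmap / vertical-specific.
    Specialized,
}

/// Maps a lowercase file extension to its coverage tier, mirroring the
/// lists in this section. Unknown extensions yield `None` so callers can
/// reject or quarantine unsupported inputs.
pub fn tier_for_extension(ext: &str) -> Option<FormatTier> {
    match ext {
        "pdf" | "jpg" | "jpeg" | "png" | "tiff" | "txt" | "html" | "md" | "csv" | "json" => {
            Some(FormatTier::Core)
        }
        "docx" | "xlsx" | "pptx" | "wav" | "mp3" | "mp4" | "mov" | "avi" | "eml" | "msg" => {
            Some(FormatTier::Extended)
        }
        "zip" | "tar" | "dcm" => Some(FormatTier::Specialized),
        _ => None,
    }
}
```

A router built on such a lookup could, for example, refuse Tier 3 formats until their handlers ship while still acknowledging them in error messages, keeping the tier roadmap and the runtime behavior in one place.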
+ +### Tier 3 — Specialized (roadmap) + +These formats address long-tail use cases in specific verticals or operational contexts: + +- **Communications**: Chat log exports from Slack, Teams, and WhatsApp. +- **Database connectors**: Direct ingestion from relational databases and message queues. +- **Archival and compound formats**: ZIP, TAR, and other container formats with recursive extraction of enclosed files. +- **Domain-specific**: DICOM (medical imaging), GeoTIFF (geospatial), and other vertical-specific formats as demand dictates. ## 3. Extraction Capabilities @@ -24,6 +44,8 @@ Each modality requires specialized extraction techniques: - **Optical character recognition (OCR)**: Layout-aware OCR that preserves spatial relationships between text regions, table cells, headers, and form fields. - **Speech-to-text**: Transcription with speaker diarization, enabling attribution of spoken content to individual speakers. - **Video frame extraction**: Decomposition of video streams into individual frames for visual analysis, with temporal alignment to audio tracks. +- **Entity identification in images**: Detection and localization of entities within images — faces, persons, objects, text regions, documents, and other identifiable elements — producing bounding boxes or segmentation masks that downstream detection and redaction stages can operate on. +- **Entity tracking in video**: Persistent tracking of identified entities across video frames. When a face, person, or object is detected in one frame, the platform must maintain identity continuity across subsequent frames to enable consistent redaction without requiring independent detection on every frame. - **Document structure parsing**: Identification of semantic document elements — headings, paragraphs, tables, lists, and form fields — beyond raw text extraction. - **Metadata extraction**: Capture of authorship, timestamps, geolocation, and other embedded metadata that may itself constitute sensitive information. @@ -35,14 +57,15 @@ Following redaction, the transformation layer must produce output that meets dow Redacted output should preserve the structural characteristics of the source document. Tables must remain aligned, page layouts must be maintained, and non-redacted content must remain unaltered. -### 4.2 Export Formats +### 4.2 Output Formats + +The primary output of the transformation layer is a redacted file in the same format as the input — a PDF produces a redacted PDF, an image produces a redacted image, and so on. The platform must not alter the source format unless explicitly requested. -The platform should support export as: +In addition to the format-preserving primary output, the platform should produce supplementary outputs that serve downstream workflows: -- Redacted PDF with visual redaction markers -- Structured JSON with redaction metadata -- Masked CSV for tabular data -- Anonymized datasets for analytics consumption +- **Redaction metadata (JSON)**: A structured manifest describing every redaction applied — entity type, location, triggering rule, confidence score, and reviewer disposition. This metadata enables programmatic consumption of redaction results by audit systems, analytics pipelines, and downstream integrations. +- **Masked structured data (CSV/JSON)**: For tabular or structured inputs, a masked variant in which sensitive cell values are replaced according to the active masking strategy, suitable for analytics or data science consumption. 
+- **Anonymized datasets**: Fully de-identified exports intended for secondary use (model training, statistical analysis) where no re-identification pathway should exist. ### 4.3 Masking Strategies From 68d1f480cbbb83d1ffefab32dcdad7072bc8be2e Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sat, 14 Feb 2026 04:24:14 +0100 Subject: [PATCH 14/17] refactor(ontology): add Sensitivity, DocumentType, privatize sub-modules; remove Parquet/WebP/BMP support Consolidate sensitivity assessment into a reusable Sensitivity struct combining SensitivityLevel and risk_score. Add DocumentType enum in entity with concrete image (Png, Jpeg, Tiff) and audio (Wav, Mp3) variants. Make all ontology sub-modules private with public re-exports. Reformat all Cargo.toml deps to consistent table syntax. Remove Parquet loader/handler/deps and WebP/BMP from ImageHandler as unsupported. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 393 +----------------- Cargo.toml | 25 +- crates/nvisy-core/Cargo.toml | 14 +- crates/nvisy-engine/Cargo.toml | 4 +- crates/nvisy-ingest/Cargo.toml | 26 +- crates/nvisy-ingest/README.md | 4 +- crates/nvisy-ingest/src/document.rs | 2 +- crates/nvisy-ingest/src/handler.rs | 35 +- crates/nvisy-ingest/src/prelude.rs | 5 - crates/nvisy-ingest/src/tabular/mod.rs | 5 +- crates/nvisy-ingest/src/tabular/parquet.rs | 128 ------ crates/nvisy-object/Cargo.toml | 2 +- crates/nvisy-ontology/Cargo.toml | 6 +- crates/nvisy-ontology/src/audit/mod.rs | 4 +- .../src/detection/classification.rs | 9 +- crates/nvisy-ontology/src/detection/mod.rs | 49 +-- .../src/detection/sensitivity.rs | 38 ++ crates/nvisy-ontology/src/entity/document.rs | 34 ++ crates/nvisy-ontology/src/entity/mod.rs | 8 +- crates/nvisy-ontology/src/policy/mod.rs | 6 +- crates/nvisy-ontology/src/policy/rule.rs | 6 +- crates/nvisy-ontology/src/prelude.rs | 10 +- crates/nvisy-ontology/src/redaction/mod.rs | 10 +- crates/nvisy-pattern/Cargo.toml | 2 +- crates/nvisy-pipeline/Cargo.toml | 16 +- crates/nvisy-pipeline/src/actions/classify.rs | 14 +- crates/nvisy-python/Cargo.toml | 6 +- docs/TODO.md | 11 + packages/nvisy-exif/src/nvisy_exif/exif.py | 6 +- 29 files changed, 192 insertions(+), 686 deletions(-) delete mode 100644 crates/nvisy-ingest/src/tabular/parquet.rs create mode 100644 crates/nvisy-ontology/src/detection/sensitivity.rs create mode 100644 crates/nvisy-ontology/src/entity/document.rs create mode 100644 docs/TODO.md diff --git a/Cargo.lock b/Cargo.lock index 951d3b7..4eedef9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,20 +44,6 @@ dependencies = [ "cpufeatures", ] -[[package]] -name = "ahash" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" -dependencies = [ - "cfg-if", - "const-random", - "getrandom 0.3.4", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "aho-corasick" version = "1.1.4" @@ -185,173 +171,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "arrow" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5ec52ba94edeed950e4a41f75d35376df196e8cb04437f7280a5aa49f20f796" -dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", -] 
- -[[package]] -name = "arrow-arith" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc766fdacaf804cb10c7c70580254fcdb5d55cdfda2bc57b02baf5223a3af9e" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "num", -] - -[[package]] -name = "arrow-array" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12fcdb3f1d03f69d3ec26ac67645a8fe3f878d77b5ebb0b15d64a116c212985" -dependencies = [ - "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "chrono", - "half", - "hashbrown 0.15.5", - "num", -] - -[[package]] -name = "arrow-buffer" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "263f4801ff1839ef53ebd06f99a56cecd1dbaf314ec893d93168e2e860e0291c" -dependencies = [ - "bytes", - "half", - "num", -] - -[[package]] -name = "arrow-cast" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede6175fbc039dfc946a61c1b6d42fd682fcecf5ab5d148fbe7667705798cac9" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "atoi", - "base64", - "chrono", - "half", - "lexical-core", - "num", - "ryu", -] - -[[package]] -name = "arrow-data" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61cfdd7d99b4ff618f167e548b2411e5dd2c98c0ddebedd7df433d34c20a4429" -dependencies = [ - "arrow-buffer", - "arrow-schema", - "half", - "num", -] - -[[package]] -name = "arrow-ipc" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62ff528658b521e33905334723b795ee56b393dbe9cf76c8b1f64b648c65a60c" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "flatbuffers", -] - -[[package]] -name = "arrow-ord" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0a3334a743bd2a1479dbc635540617a3923b4b2f6870f37357339e6b5363c21" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", -] - -[[package]] -name = "arrow-row" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d1d7a7291d2c5107e92140f75257a99343956871f3d3ab33a7b41532f79cb68" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "half", -] - -[[package]] -name = "arrow-schema" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39cfaf5e440be44db5413b75b72c2a87c1f8f0627117d110264048f2969b99e9" - -[[package]] -name = "arrow-select" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69efcd706420e52cd44f5c4358d279801993846d1c2a8e52111853d61d55a619" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "num", -] - -[[package]] -name = "arrow-string" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21546b337ab304a32cfc0770f671db7411787586b45b78b4593ae78e64e2b03" -dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", - "memchr", - "num", - "regex", - "regex-syntax", -] - [[package]] name = "as-slice" version = "0.2.1" @@ -383,15 +202,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "atoi" -version = "2.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" -dependencies = [ - "num-traits", -] - [[package]] name = "atoi_simd" version = "0.17.0" @@ -468,12 +278,6 @@ version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.10.0" @@ -646,26 +450,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" -[[package]] -name = "const-random" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" -dependencies = [ - "const-random-macro", -] - -[[package]] -name = "const-random-macro" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" -dependencies = [ - "getrandom 0.2.17", - "once_cell", - "tiny-keccak", -] - [[package]] name = "constant_time_eq" version = "0.3.1" @@ -1083,16 +867,6 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" -[[package]] -name = "flatbuffers" -version = "24.12.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" -dependencies = [ - "bitflags 1.3.2", - "rustc_version", -] - [[package]] name = "flate2" version = "1.1.9" @@ -1331,7 +1105,6 @@ checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", - "num-traits", "zerocopy", ] @@ -1728,12 +1501,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "integer-encoding" -version = "3.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" - [[package]] name = "interpolate_name" version = "0.2.4" @@ -1864,63 +1631,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" -[[package]] -name = "lexical-core" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - -[[package]] -name = "lexical-parse-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" -dependencies = [ - "lexical-parse-integer", - "lexical-util", -] - -[[package]] -name = "lexical-parse-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" -dependencies = [ - "lexical-util", -] - -[[package]] -name = "lexical-util" -version = "1.0.7" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" - -[[package]] -name = "lexical-write-float" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" -dependencies = [ - "lexical-util", - "lexical-write-integer", -] - -[[package]] -name = "lexical-write-integer" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" -dependencies = [ - "lexical-util", -] - [[package]] name = "libc" version = "0.2.181" @@ -2389,7 +2099,6 @@ dependencies = [ name = "nvisy-ingest" version = "0.1.0" dependencies = [ - "arrow", "async-trait", "bytes", "calamine", @@ -2397,7 +2106,6 @@ dependencies = [ "infer", "lopdf", "nvisy-core", - "parquet", "pdf-extract", "quick-xml 0.37.5", "schemars", @@ -2509,7 +2217,7 @@ version = "0.10.75" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" dependencies = [ - "bitflags 2.10.0", + "bitflags", "cfg-if", "foreign-types", "libc", @@ -2547,15 +2255,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "ordered-float" -version = "2.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" -dependencies = [ - "num-traits", -] - [[package]] name = "owned_ttf_parser" version = "0.25.1" @@ -2588,33 +2287,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "54.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb15796ac6f56b429fd99e33ba133783ad75b27c36b4b5ce06f1f82cc97754e" -dependencies = [ - "ahash", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", - "base64", - "bytes", - "chrono", - "half", - "hashbrown 0.15.5", - "num", - "num-bigint", - "paste", - "seq-macro", - "thrift", - "twox-hash", -] - [[package]] name = "paste" version = "1.0.15" @@ -2746,7 +2418,7 @@ version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97baced388464909d42d89643fe4361939af9b7ce7a31ee32a168f832a70f2a0" dependencies = [ - "bitflags 2.10.0", + "bitflags", "crc32fast", "fdeflate", "flate2", @@ -3117,7 +2789,7 @@ version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.10.0", + "bitflags", ] [[package]] @@ -3214,22 +2886,13 @@ version = "0.8.52" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c6a884d2998352bb4daf0183589aec883f16a6da1f4dde84d8e2e9a5409a1ce" -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - [[package]] name = "rustix" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ - "bitflags 2.10.0", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -3329,7 +2992,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.10.0", + "bitflags", "core-foundation", "core-foundation-sys", "libc", @@ -3352,7 +3015,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" dependencies = [ - "bitflags 2.10.0", + "bitflags", "cssparser", "derive_more 0.99.20", "fxhash", @@ -3375,12 +3038,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "seq-macro" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" - [[package]] name = "serde" version = "1.0.228" @@ -3572,12 +3229,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "string_cache" version = "0.8.9" @@ -3723,17 +3374,6 @@ dependencies = [ "syn 2.0.114", ] -[[package]] -name = "thrift" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" -dependencies = [ - "byteorder", - "integer-encoding", - "ordered-float", -] - [[package]] name = "tiff" version = "0.10.3" @@ -3779,15 +3419,6 @@ dependencies = [ "time-core", ] -[[package]] -name = "tiny-keccak" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" -dependencies = [ - "crunchy", -] - [[package]] name = "tinystr" version = "0.8.2" @@ -3896,7 +3527,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" dependencies = [ - "bitflags 2.10.0", + "bitflags", "bytes", "futures-util", "http", @@ -3963,16 +3594,6 @@ version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" -[[package]] -name = "twox-hash" -version = "1.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" -dependencies = [ - "cfg-if", - "static_assertions", -] - [[package]] name = "type1-encoding-parser" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 07ca39e..8d9ff02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -84,27 +84,26 @@ minio = { version = "0.3", features = [] } # Image processing image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } -imageproc = "0.25" +imageproc = { version = "0.25", features = [] } # Document parsing -pdf-extract = "0.7" -lopdf = "0.34" -scraper = "0.22" -calamine = "0.33" -zip = "2" -quick-xml = "0.37" -arrow = { version = "54", default-features = false } -parquet = { version = "54", default-features = false, features = ["arrow"] } +pdf-extract = { version = "0.7", features = [] } +lopdf = { version = "0.34", features = [] } +scraper = { version = "0.22", features = [] } +calamine = { version = "0.33", features = [] } +zip = { version = "2", features = [] } +quick-xml = { version = "0.37", features = [] 
} + # Time jiff = { version = "0.2", features = ["serde"] } # Interned strings -hipstr = "0.6" +hipstr = { version = "0.6", features = [] } # Hashing -sha2 = "0.10" -hex = "0.4" +sha2 = { version = "0.10", features = [] } +hex = { version = "0.4", features = [] } # Semantic versioning semver = { version = "1", features = ["serde"] } @@ -113,7 +112,7 @@ semver = { version = "1", features = ["serde"] } strum = { version = "0.26", features = ["derive"] } # Testing -tempfile = "3" +tempfile = { version = "3", features = [] } # Randomness rand = { version = "0.9", features = [] } diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 7cd9a1a..8172dd7 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -19,7 +19,7 @@ documentation = { workspace = true } [dependencies] # JSON Schema generation -schemars = { workspace = true } +schemars = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -41,20 +41,20 @@ anyhow = { workspace = true, features = [] } derive_more = { workspace = true, features = ["display", "deref", "as_ref"] } # Time -jiff = { workspace = true } +jiff = { workspace = true, features = [] } # Interned strings -hipstr = { workspace = true } +hipstr = { workspace = true, features = [] } # Hashing -sha2 = { workspace = true } -hex = { workspace = true } +sha2 = { workspace = true, features = [] } +hex = { workspace = true, features = [] } # Enum derives -strum = { workspace = true } +strum = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } [dev-dependencies] -tempfile = { workspace = true } +tempfile = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index e30a2d4..095deba 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -23,7 +23,7 @@ nvisy-core = { workspace = true, features = [] } nvisy-ontology = { workspace = true, features = [] } # JSON Schema generation -schemars = { workspace = true } +schemars = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -35,7 +35,7 @@ tokio-util = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } -jiff = { workspace = true } +jiff = { workspace = true, features = [] } # Graph data structures petgraph = { workspace = true, features = [] } diff --git a/crates/nvisy-ingest/Cargo.toml b/crates/nvisy-ingest/Cargo.toml index 7b04966..be70fcb 100644 --- a/crates/nvisy-ingest/Cargo.toml +++ b/crates/nvisy-ingest/Cargo.toml @@ -22,7 +22,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = ["pdf", "docx", "html", "xlsx", "parquet", "image"] +default = ["pdf", "docx", "html", "xlsx", "image"] # PDF parsing and text extraction via pdf-extract + lopdf pdf = ["dep:pdf-extract", "dep:lopdf"] # Microsoft Word (.docx) parsing via zip + quick-xml @@ -31,8 +31,6 @@ docx = ["dep:zip", "dep:quick-xml"] html = ["dep:scraper"] # Excel (.xlsx) spreadsheet parsing via calamine xlsx = ["dep:calamine"] -# Apache Parquet columnar data via arrow + parquet -parquet = ["dep:parquet", "dep:arrow"] # Image decoding (PNG, JPEG, TIFF) via the image crate image = ["dep:image"] @@ -41,7 +39,7 @@ image = ["dep:image"] nvisy-core = { workspace = true, features = [] } # JSON Schema generation -schemars = { workspace = true } +schemars = { workspace = true, features = [] } # (De)serialization 
serde = { workspace = true, features = ["derive"] } @@ -53,21 +51,19 @@ async-trait = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } -bytes = { workspace = true } +bytes = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } # File type detection -infer = { workspace = true } +infer = { workspace = true, features = [] } # Document parsing (feature-gated) -pdf-extract = { workspace = true, optional = true } -lopdf = { workspace = true, optional = true } -zip = { workspace = true, optional = true } -quick-xml = { workspace = true, optional = true } -scraper = { workspace = true, optional = true } -calamine = { workspace = true, optional = true } -arrow = { workspace = true, optional = true } -parquet = { workspace = true, optional = true } -image = { workspace = true, optional = true } +pdf-extract = { workspace = true, optional = true, features = [] } +lopdf = { workspace = true, optional = true, features = [] } +zip = { workspace = true, optional = true, features = [] } +quick-xml = { workspace = true, optional = true, features = [] } +scraper = { workspace = true, optional = true, features = [] } +calamine = { workspace = true, optional = true, features = [] } +image = { workspace = true, optional = true, features = [] } diff --git a/crates/nvisy-ingest/README.md b/crates/nvisy-ingest/README.md index 0cbc41b..378ce7b 100644 --- a/crates/nvisy-ingest/README.md +++ b/crates/nvisy-ingest/README.md @@ -2,8 +2,8 @@ File-format loaders for the Nvisy multimodal redaction platform. -This crate provides loaders for PDF, DOCX, HTML, Image, Parquet, XLSX, -Audio, CSV, JSON, and plain-text files. Each loader implements the +This crate provides loaders for PDF, DOCX, HTML, Image, XLSX, Audio, +CSV, JSON, and plain-text files. Each loader implements the [`Loader`](crate::loaders::Loader) trait and converts raw blob bytes into structured `Document`, `ImageData`, or `TabularData` artifacts. diff --git a/crates/nvisy-ingest/src/document.rs b/crates/nvisy-ingest/src/document.rs index c7a0cee..6aa19a1 100644 --- a/crates/nvisy-ingest/src/document.rs +++ b/crates/nvisy-ingest/src/document.rs @@ -15,7 +15,7 @@ use crate::handler::{FormatHandler, Handler}; /// Fields are grouped by content modality: /// - **Text** (`content`, `title`, `elements`, `page_count`) — for PDF, DOCX, HTML, etc. /// - **Binary/image** (`data`, `mime_type`, `width`, `height`, etc.) — for images and raw bytes. -/// - **Tabular** (`columns`, `rows`, `sheet_name`) — for CSV, XLSX, Parquet. +/// - **Tabular** (`columns`, `rows`, `sheet_name`) — for CSV, XLSX. #[derive(Debug, Clone)] pub struct Document<H: Handler> { /// Content source identity and lineage. diff --git a/crates/nvisy-ingest/src/handler.rs b/crates/nvisy-ingest/src/handler.rs index eaf1648..80886ae 100644 --- a/crates/nvisy-ingest/src/handler.rs +++ b/crates/nvisy-ingest/src/handler.rs @@ -101,7 +101,7 @@ impl Handler for DocxHandler { fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] } } -/// Handles image files (PNG, JPEG, TIFF, etc.). +/// Handles image files (PNG, JPEG, TIFF). 
#[cfg(feature = "image")] #[derive(Debug, Clone)] pub struct ImageHandler; @@ -109,8 +109,8 @@ pub struct ImageHandler; #[cfg(feature = "image")] impl Handler for ImageHandler { fn id(&self) -> &str { "image" } - fn extensions(&self) -> &[&str] { &["jpg", "jpeg", "png", "tiff", "bmp", "webp"] } - fn content_types(&self) -> &[&str] { &["image/jpeg", "image/png", "image/tiff", "image/bmp", "image/webp"] } + fn extensions(&self) -> &[&str] { &["jpg", "jpeg", "png", "tiff"] } + fn content_types(&self) -> &[&str] { &["image/jpeg", "image/png", "image/tiff"] } } /// Handles XLSX/XLS spreadsheet files. @@ -125,18 +125,6 @@ impl Handler for XlsxHandler { fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"] } } -/// Handles Apache Parquet files. -#[cfg(feature = "parquet")] -#[derive(Debug, Clone)] -pub struct ParquetHandler; - -#[cfg(feature = "parquet")] -impl Handler for ParquetHandler { - fn id(&self) -> &str { "parquet" } - fn extensions(&self) -> &[&str] { &["parquet"] } - fn content_types(&self) -> &[&str] { &["application/x-parquet"] } -} - /// Handles WAV audio files. #[derive(Debug, Clone)] pub struct WavHandler; @@ -180,8 +168,6 @@ pub enum FormatHandler { Image(ImageHandler), #[cfg(feature = "xlsx")] Xlsx(XlsxHandler), - #[cfg(feature = "parquet")] - Parquet(ParquetHandler), Wav(WavHandler), Mp3(Mp3Handler), } @@ -202,8 +188,7 @@ impl Handler for FormatHandler { Self::Image(h) => h.id(), #[cfg(feature = "xlsx")] Self::Xlsx(h) => h.id(), - #[cfg(feature = "parquet")] - Self::Parquet(h) => h.id(), + Self::Wav(h) => h.id(), Self::Mp3(h) => h.id(), } @@ -224,8 +209,7 @@ impl Handler for FormatHandler { Self::Image(h) => h.extensions(), #[cfg(feature = "xlsx")] Self::Xlsx(h) => h.extensions(), - #[cfg(feature = "parquet")] - Self::Parquet(h) => h.extensions(), + Self::Wav(h) => h.extensions(), Self::Mp3(h) => h.extensions(), } @@ -246,8 +230,7 @@ impl Handler for FormatHandler { Self::Image(h) => h.content_types(), #[cfg(feature = "xlsx")] Self::Xlsx(h) => h.content_types(), - #[cfg(feature = "parquet")] - Self::Parquet(h) => h.content_types(), + Self::Wav(h) => h.content_types(), Self::Mp3(h) => h.content_types(), } @@ -285,10 +268,6 @@ impl From<ImageHandler> for FormatHandler { impl From<XlsxHandler> for FormatHandler { fn from(h: XlsxHandler) -> Self { Self::Xlsx(h) } } -#[cfg(feature = "parquet")] -impl From<ParquetHandler> for FormatHandler { - fn from(h: ParquetHandler) -> Self { Self::Parquet(h) } -} impl From<WavHandler> for FormatHandler { fn from(h: WavHandler) -> Self { Self::Wav(h) } } @@ -343,7 +322,7 @@ pub trait ImageLoader: Handler { ) -> Result<Vec<Document<FormatHandler>>, Error>; } -/// Loader for spreadsheet/tabular formats (XLSX, Parquet). +/// Loader for spreadsheet/tabular formats (XLSX). #[async_trait::async_trait] pub trait SpreadsheetLoader: Handler { /// Strongly-typed parameters for this loader. 
diff --git a/crates/nvisy-ingest/src/prelude.rs b/crates/nvisy-ingest/src/prelude.rs index 0ad5b84..516a27a 100644 --- a/crates/nvisy-ingest/src/prelude.rs +++ b/crates/nvisy-ingest/src/prelude.rs @@ -17,9 +17,6 @@ pub use crate::handler::DocxHandler; pub use crate::handler::ImageHandler; #[cfg(feature = "xlsx")] pub use crate::handler::XlsxHandler; -#[cfg(feature = "parquet")] -pub use crate::handler::ParquetHandler; - pub use crate::document::Document; pub use crate::element::{Element, ElementCategory, ElementType}; @@ -37,7 +34,5 @@ pub use crate::binary::docx::DocxLoader; pub use crate::image::image::ImageFileLoader; #[cfg(feature = "xlsx")] pub use crate::tabular::xlsx::XlsxLoader; -#[cfg(feature = "parquet")] -pub use crate::tabular::parquet::ParquetLoader; pub use crate::audio::wav::WavLoader; pub use crate::audio::mp3::Mp3Loader; diff --git a/crates/nvisy-ingest/src/tabular/mod.rs b/crates/nvisy-ingest/src/tabular/mod.rs index d66c51b..2c8189b 100644 --- a/crates/nvisy-ingest/src/tabular/mod.rs +++ b/crates/nvisy-ingest/src/tabular/mod.rs @@ -1,7 +1,4 @@ -//! Tabular/spreadsheet file loaders (XLSX, Parquet). +//! Tabular/spreadsheet file loaders (XLSX). #[cfg(feature = "xlsx")] pub mod xlsx; - -#[cfg(feature = "parquet")] -pub mod parquet; diff --git a/crates/nvisy-ingest/src/tabular/parquet.rs b/crates/nvisy-ingest/src/tabular/parquet.rs deleted file mode 100644 index d977241..0000000 --- a/crates/nvisy-ingest/src/tabular/parquet.rs +++ /dev/null @@ -1,128 +0,0 @@ -//! Apache Parquet file loader. - -use serde::Deserialize; -use std::sync::Arc; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::handler::{ParquetHandler, PlaintextHandler, FormatHandler, SpreadsheetLoader}; - -use arrow::array::{Array, RecordBatchReader}; -use arrow::record_batch::RecordBatch; -use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; - -/// Typed parameters for [`ParquetLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ParquetLoaderParams { - /// Maximum number of rows to read. `None` means all rows. - #[serde(default)] - pub max_rows: Option<usize>, -} - -/// Extracts tabular data from Parquet files plus a flattened text document -/// for regex/dictionary scanning. 
-pub struct ParquetLoader; - -impl Clone for ParquetLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl SpreadsheetLoader for ParquetLoader { - type Params = ParquetLoaderParams; - - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let data = bytes::Bytes::copy_from_slice(&content.to_bytes()); - let builder = ParquetRecordBatchReaderBuilder::try_new(data) - .map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Parquet open failed: {e}")) - })?; - - let reader = builder.build().map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Parquet reader build failed: {e}")) - })?; - - let schema = reader.schema(); - let columns: Vec<String> = schema - .fields() - .iter() - .map(|f: &arrow::datatypes::FieldRef| f.name().clone()) - .collect(); - - let mut all_rows: Vec<Vec<String>> = Vec::new(); - - for batch_result in reader { - let batch: RecordBatch = batch_result.map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Parquet batch read failed: {e}")) - })?; - - for row_idx in 0..batch.num_rows() { - if let Some(max) = params.max_rows { - if all_rows.len() >= max { - break; - } - } - - let mut row = Vec::with_capacity(batch.num_columns()); - for col_idx in 0..batch.num_columns() { - let col: &Arc<dyn Array> = batch.column(col_idx); - let val = array_value_to_string(col.as_ref(), row_idx); - row.push(val); - } - all_rows.push(row); - } - - if let Some(max) = params.max_rows { - if all_rows.len() >= max { - break; - } - } - } - - let mut tabular_doc = Document::new(ParquetHandler) - .with_tabular(columns, all_rows.clone()); - tabular_doc.source.set_parent_id(Some(content.content_source.as_uuid())); - - // Flatten to text for regex/dictionary scanning - let mut text_parts = Vec::new(); - for row in &all_rows { - text_parts.push(row.join("\t")); - } - let flat_text = text_parts.join("\n"); - let mut text_doc = Document::new(PlaintextHandler) - .with_text(flat_text); - text_doc.source.set_parent_id(Some(content.content_source.as_uuid())); - - Ok(vec![tabular_doc.into_format(), text_doc.into_format()]) - } -} - -fn array_value_to_string(array: &dyn Array, index: usize) -> String { - if array.is_null(index) { - return String::new(); - } - - use std::fmt::Write; - let mut buf = String::new(); - let formatter = arrow::util::display::ArrayFormatter::try_new(array, &Default::default()); - match formatter { - Ok(f) => { - let _ = write!(buf, "{}", f.value(index)); - buf - } - Err(_) => String::new(), - } -} - -impl crate::handler::Handler for ParquetLoader { - fn id(&self) -> &str { ParquetHandler.id() } - fn extensions(&self) -> &[&str] { ParquetHandler.extensions() } - fn content_types(&self) -> &[&str] { ParquetHandler.content_types() } -} diff --git a/crates/nvisy-object/Cargo.toml b/crates/nvisy-object/Cargo.toml index b8682c1..ab392ea 100644 --- a/crates/nvisy-object/Cargo.toml +++ b/crates/nvisy-object/Cargo.toml @@ -24,7 +24,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-pipeline = { workspace = true } +nvisy-pipeline = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-ontology/Cargo.toml b/crates/nvisy-ontology/Cargo.toml index ffae204..a5bb2e8 100644 --- a/crates/nvisy-ontology/Cargo.toml +++ b/crates/nvisy-ontology/Cargo.toml @@ -26,7 +26,7 @@ jsonschema = ["dep:schemars"] nvisy-core = { workspace = true, features = [] 
} # JSON Schema generation (optional) -schemars = { workspace = true, optional = true } +schemars = { workspace = true, optional = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -34,10 +34,10 @@ serde_json = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["serde", "v4"] } -jiff = { workspace = true } +jiff = { workspace = true, features = [] } # Semantic versioning -semver = { workspace = true } +semver = { workspace = true, features = [] } # Error handling derive_more = { workspace = true, features = ["display", "from"] } diff --git a/crates/nvisy-ontology/src/audit/mod.rs b/crates/nvisy-ontology/src/audit/mod.rs index 9fe2a83..4e95a0d 100644 --- a/crates/nvisy-ontology/src/audit/mod.rs +++ b/crates/nvisy-ontology/src/audit/mod.rs @@ -3,8 +3,8 @@ //! An [`Audit`] entry records an immutable event in the data protection //! pipeline, carrying structured [`Explanation`] metadata for compliance. -pub mod explanation; -pub mod retention; +mod explanation; +mod retention; pub use explanation::{Explainable, Explanation}; pub use retention::{RetentionPolicy, RetentionScope}; diff --git a/crates/nvisy-ontology/src/detection/classification.rs b/crates/nvisy-ontology/src/detection/classification.rs index 779a121..f107961 100644 --- a/crates/nvisy-ontology/src/detection/classification.rs +++ b/crates/nvisy-ontology/src/detection/classification.rs @@ -2,17 +2,14 @@ use serde::{Deserialize, Serialize}; -use super::SensitivityLevel; +use super::Sensitivity; /// Result of sensitivity classification over a set of detected entities. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct ClassificationResult { - /// The computed sensitivity level. - pub sensitivity_level: SensitivityLevel, + /// Sensitivity assessment (level + risk score). + pub sensitivity: Sensitivity, /// Total number of entities considered. pub total_entities: usize, - /// Re-identification risk score in the range `[0.0, 1.0]`, if computed. - #[serde(skip_serializing_if = "Option::is_none")] - pub risk_score: Option<f64>, } diff --git a/crates/nvisy-ontology/src/detection/mod.rs b/crates/nvisy-ontology/src/detection/mod.rs index 01cc06a..9af8a56 100644 --- a/crates/nvisy-ontology/src/detection/mod.rs +++ b/crates/nvisy-ontology/src/detection/mod.rs @@ -4,11 +4,13 @@ //! first-class type, carrying the detected entities alongside pipeline //! and policy metadata. -pub mod annotation; -pub mod classification; +mod annotation; +mod classification; +mod sensitivity; pub use annotation::{Annotation, AnnotationKind, AnnotationLabel}; pub use classification::ClassificationResult; +pub use sensitivity::{Sensitivity, SensitivityLevel}; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -17,36 +19,6 @@ use nvisy_core::path::ContentSource; use crate::entity::Entity; -/// Sensitivity classification assigned to a document or content region. -/// -/// Drives downstream policy: rules can be scoped to specific sensitivity -/// levels via [`RuleCondition`](crate::policy::RuleCondition). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] -#[serde(rename_all = "snake_case")] -pub enum SensitivityLevel { - /// No sensitive data detected or all data is publicly available. - Public, - /// Internal use only — not intended for external distribution. 
- Internal, - /// Contains sensitive data requiring access controls. - Confidential, - /// Highly sensitive — regulated data requiring strict controls. - Restricted, -} - -/// Types that can be submitted for sensitive data detection. -pub trait Detectable: Send + Sync { - /// Content as text for text-based detection. - fn text_content(&self) -> Option<&str>; - /// Binary content for image/audio/video detection. - fn binary_content(&self) -> Option<&[u8]>; - /// MIME type of the content. - fn mime_type(&self) -> Option<&str>; - /// Source identity for lineage. - fn source(&self) -> &ContentSource; -} - /// The output of a detection pass over a single content source. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] @@ -59,19 +31,10 @@ pub struct DetectionResult { /// Identifier of the policy that governed detection. #[serde(skip_serializing_if = "Option::is_none")] pub policy_id: Option<Uuid>, - /// Identifier of the pipeline run that produced this result. - #[serde(skip_serializing_if = "Option::is_none")] - pub run_id: Option<Uuid>, /// Processing time in milliseconds. #[serde(skip_serializing_if = "Option::is_none")] pub duration_ms: Option<u64>, - /// Overall sensitivity classification derived from the detected entities. - #[serde(skip_serializing_if = "Option::is_none")] - pub sensitivity_level: Option<SensitivityLevel>, - /// Re-identification risk score in the range `[0.0, 1.0]`. - /// - /// Estimates the likelihood that a data subject could be re-identified - /// from the entities remaining after redaction. Computed post-transform. + /// Overall sensitivity assessment derived from the detected entities. #[serde(skip_serializing_if = "Option::is_none")] - pub risk_score: Option<f64>, + pub sensitivity: Option<Sensitivity>, } diff --git a/crates/nvisy-ontology/src/detection/sensitivity.rs b/crates/nvisy-ontology/src/detection/sensitivity.rs new file mode 100644 index 0000000..783f2a7 --- /dev/null +++ b/crates/nvisy-ontology/src/detection/sensitivity.rs @@ -0,0 +1,38 @@ +//! Sensitivity level and assessment types. + +use serde::{Deserialize, Serialize}; + +/// Sensitivity classification assigned to a document or content region. +/// +/// Drives downstream policy: rules can be scoped to specific sensitivity +/// levels via [`RuleCondition`](crate::policy::RuleCondition). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum SensitivityLevel { + /// No sensitive data detected or all data is publicly available. + Public, + /// Internal use only — not intended for external distribution. + Internal, + /// Contains sensitive data requiring access controls. + Confidential, + /// Highly sensitive — regulated data requiring strict controls. + Restricted, +} + +/// Combined sensitivity assessment for a content source. +/// +/// Pairs a discrete [`SensitivityLevel`] with an optional continuous +/// re-identification risk score in `[0.0, 1.0]`. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Sensitivity { + /// Discrete sensitivity classification. + pub level: SensitivityLevel, + /// Re-identification risk score in the range `[0.0, 1.0]`. + /// + /// Estimates the likelihood that a data subject could be re-identified + /// from the entities remaining after redaction. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub risk_score: Option<f64>, +} diff --git a/crates/nvisy-ontology/src/entity/document.rs b/crates/nvisy-ontology/src/entity/document.rs new file mode 100644 index 0000000..7499808 --- /dev/null +++ b/crates/nvisy-ontology/src/entity/document.rs @@ -0,0 +1,34 @@ +//! Document format classification. + +use serde::{Deserialize, Serialize}; + +/// Document format that content can be classified as. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +#[serde(rename_all = "snake_case")] +pub enum DocumentType { + /// Plain text (`.txt`, `.log`, etc.). + Plaintext, + /// Comma-separated values. + Csv, + /// JSON data. + Json, + /// HTML pages. + Html, + /// PDF documents. + Pdf, + /// Microsoft Word (`.docx`). + Docx, + /// Microsoft Excel (`.xlsx`). + Xlsx, + /// PNG image. + Png, + /// JPEG image. + Jpeg, + /// TIFF image. + Tiff, + /// WAV audio. + Wav, + /// MP3 audio. + Mp3, +} diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index f98d45b..3748bfc 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -4,10 +4,12 @@ //! within a document. Entities are produced by detection actions and consumed //! by redaction and audit stages of the pipeline. -pub mod location; -pub mod model; -pub mod selector; +mod document; +mod location; +mod model; +mod selector; +pub use document::DocumentType; pub use location::{ AudioLocation, BoundingBox, EntityLocation, ImageLocation, TabularLocation, TextLocation, TimeSpan, VideoLocation, diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-ontology/src/policy/mod.rs index 834de59..08820cd 100644 --- a/crates/nvisy-ontology/src/policy/mod.rs +++ b/crates/nvisy-ontology/src/policy/mod.rs @@ -4,9 +4,9 @@ //! how detected entities are redacted. Policies may be associated with a //! [`RegulationKind`] and support inheritance via the `extends` field. -pub mod evaluation; -pub mod regulation; -pub mod rule; +mod evaluation; +mod regulation; +mod rule; pub use evaluation::PolicyEvaluation; pub use regulation::RegulationKind; diff --git a/crates/nvisy-ontology/src/policy/rule.rs b/crates/nvisy-ontology/src/policy/rule.rs index b68bfcf..12b65e2 100644 --- a/crates/nvisy-ontology/src/policy/rule.rs +++ b/crates/nvisy-ontology/src/policy/rule.rs @@ -8,16 +8,16 @@ use serde_json::{Map, Value}; use uuid::Uuid; use crate::detection::SensitivityLevel; -use crate::entity::EntitySelector; +use crate::entity::{DocumentType, EntitySelector}; use crate::redaction::RedactionSpec; /// Conditions that must be met for a [`PolicyRule`] to apply. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct RuleCondition { - /// MIME types of documents this rule applies to. + /// Document formats this rule applies to. Empty means all formats. #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub document_types: Vec<String>, + pub document_types: Vec<DocumentType>, /// User roles this rule applies to. 
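
The new `Sensitivity`, `SensitivityLevel`, and `DocumentType` types introduced above can be exercised as in the following minimal sketch. It assumes the `prelude` re-exports shown later in this patch; the construction site, the concrete values, and the `main` wrapper are illustrative only, not part of the crate.

```rust
use nvisy_ontology::prelude::{
    ClassificationResult, DocumentType, Sensitivity, SensitivityLevel,
};

fn main() {
    // Combined assessment: discrete level plus optional risk score,
    // matching the struct added in detection/sensitivity.rs.
    let sensitivity = Sensitivity {
        level: SensitivityLevel::Confidential,
        risk_score: Some(0.42),
    };

    // ClassificationResult now carries the assessment as one field.
    let classification = ClassificationResult {
        sensitivity,
        total_entities: 7,
    };

    // Per the serde attributes in the diff, the level serializes in
    // snake_case, e.g. {"sensitivity":{"level":"confidential","risk_score":0.42},...}.
    println!("{}", serde_json::to_string(&classification).unwrap());

    // Policy rules can now be scoped to concrete formats rather than MIME strings.
    let scoped_formats = vec![DocumentType::Pdf, DocumentType::Png];
    assert_eq!(scoped_formats.len(), 2);
}
```
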
#[serde(default, skip_serializing_if = "Vec::is_empty")] pub roles: Vec<String>, diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index 5968662..159ff5b 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -4,13 +4,13 @@ pub use crate::audit::{ Audit, AuditAction, Auditable, Explainable, Explanation, RetentionPolicy, RetentionScope, }; pub use crate::detection::{ - Annotation, AnnotationKind, AnnotationLabel, ClassificationResult, Detectable, - DetectionResult, SensitivityLevel, + Annotation, AnnotationKind, AnnotationLabel, ClassificationResult, DetectionResult, + Sensitivity, SensitivityLevel, }; pub use crate::entity::{ - AudioLocation, BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, - EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, TextLocation, TimeSpan, - VideoLocation, + AudioLocation, BoundingBox, DetectionMethod, DocumentType, Entity, EntityCategory, + EntityLocation, EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, + TextLocation, TimeSpan, VideoLocation, }; pub use crate::policy::{ Policy, PolicyEvaluation, PolicyRule, RegulationKind, RuleCondition, RuleKind, diff --git a/crates/nvisy-ontology/src/redaction/mod.rs b/crates/nvisy-ontology/src/redaction/mod.rs index 5b735d1..568a419 100644 --- a/crates/nvisy-ontology/src/redaction/mod.rs +++ b/crates/nvisy-ontology/src/redaction/mod.rs @@ -21,11 +21,11 @@ //! - Image / video: [`ImageRedactionMethod`], [`ImageRedactionSpec`], [`ImageRedactionOutput`] //! - Audio: [`AudioRedactionMethod`], [`AudioRedactionSpec`], [`AudioRedactionOutput`] -pub mod method; -pub mod output; -pub mod review; -pub mod spec; -pub mod summary; +mod method; +mod output; +mod review; +mod spec; +mod summary; pub use method::{ AudioRedactionMethod, ImageRedactionMethod, RedactionMethod, TextRedactionMethod, diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 6e08fa8..f51b7c9 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -23,7 +23,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-ontology = { workspace = true } +nvisy-ontology = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 11fb7d8..c7c1905 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -31,9 +31,9 @@ pdf-redaction = ["dep:lopdf"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true } -nvisy-ingest = { workspace = true } -nvisy-pattern = { workspace = true } +nvisy-ontology = { workspace = true, features = [] } +nvisy-ingest = { workspace = true, features = [] } +nvisy-pattern = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } @@ -45,18 +45,18 @@ async-trait = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } -bytes = { workspace = true } +bytes = { workspace = true, features = [] } # Text processing regex = { workspace = true, features = [] } -aho-corasick = { workspace = true } +aho-corasick = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } # Image processing (feature-gated) -image = { workspace = true, optional = true } -imageproc = { workspace = 
true, optional = true } +image = { workspace = true, optional = true, features = [] } +imageproc = { workspace = true, optional = true, features = [] } # PDF manipulation (feature-gated) -lopdf = { workspace = true, optional = true } +lopdf = { workspace = true, optional = true, features = [] } diff --git a/crates/nvisy-pipeline/src/actions/classify.rs b/crates/nvisy-pipeline/src/actions/classify.rs index 4b6cc3b..0e4b0d4 100644 --- a/crates/nvisy-pipeline/src/actions/classify.rs +++ b/crates/nvisy-pipeline/src/actions/classify.rs @@ -1,7 +1,7 @@ //! Sensitivity classification action. pub use nvisy_ontology::detection::ClassificationResult; -use nvisy_ontology::detection::SensitivityLevel; +use nvisy_ontology::detection::{Sensitivity, SensitivityLevel}; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; @@ -9,8 +9,8 @@ use crate::action::Action; /// Assigns a sensitivity level based on detected entities. /// -/// The action inspects the entities, computes a [`SensitivityLevel`], and -/// returns a [`ClassificationResult`]. +/// The action inspects the entities, computes a [`Sensitivity`] assessment, +/// and returns a [`ClassificationResult`]. pub struct ClassifyAction; #[async_trait::async_trait] @@ -32,12 +32,14 @@ impl Action for ClassifyAction { entities: Self::Input, ) -> Result<ClassificationResult, Error> { let total_entities = entities.len(); - let sensitivity_level = compute_sensitivity_level(&entities); + let level = compute_sensitivity_level(&entities); Ok(ClassificationResult { - sensitivity_level, + sensitivity: Sensitivity { + level, + risk_score: None, + }, total_entities, - risk_score: None, }) } } diff --git a/crates/nvisy-python/Cargo.toml b/crates/nvisy-python/Cargo.toml index 7e999e1..089355f 100644 --- a/crates/nvisy-python/Cargo.toml +++ b/crates/nvisy-python/Cargo.toml @@ -24,9 +24,9 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } -nvisy-ontology = { workspace = true } -nvisy-pipeline = { workspace = true } -nvisy-ingest = { workspace = true } +nvisy-ontology = { workspace = true, features = [] } +nvisy-pipeline = { workspace = true, features = [] } +nvisy-ingest = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } diff --git a/docs/TODO.md b/docs/TODO.md new file mode 100644 index 0000000..83f408d --- /dev/null +++ b/docs/TODO.md @@ -0,0 +1,11 @@ +# TODO + +## Engine + +- [ ] Implement `Engine` trait for the DAG runner +- [ ] Wire `EngineInput`/`EngineOutput` through the pipeline + +## Ontology + +- [ ] Add video document types (MP4, WebM, AVI) +- [ ] Add archive document types (ZIP, TAR, GZIP) diff --git a/packages/nvisy-exif/src/nvisy_exif/exif.py b/packages/nvisy-exif/src/nvisy_exif/exif.py index b2bd16c..952d804 100644 --- a/packages/nvisy-exif/src/nvisy_exif/exif.py +++ b/packages/nvisy-exif/src/nvisy_exif/exif.py @@ -1,6 +1,6 @@ """EXIF metadata reading and stripping for images. -Uses Pillow for EXIF handling. Supports JPEG, PNG, TIFF, and WebP formats. +Uses Pillow for EXIF handling. Supports JPEG, PNG, and TIFF formats. These functions are designed to be callable from Rust via PyO3. """ @@ -16,7 +16,7 @@ def read_exif(image_bytes: bytes) -> dict: """Read EXIF metadata from image bytes. Args: - image_bytes: Raw image bytes (JPEG, PNG, TIFF, or WebP). + image_bytes: Raw image bytes (JPEG, PNG, or TIFF). Returns: Dictionary mapping human-readable tag names to their values. 
@@ -40,7 +40,7 @@ def strip_exif(image_bytes: bytes) -> bytes: """Remove all EXIF metadata from image bytes. Args: - image_bytes: Raw image bytes (JPEG, PNG, TIFF, or WebP). + image_bytes: Raw image bytes (JPEG, PNG, or TIFF). Returns: Image bytes with all EXIF metadata removed, preserving the From 3d2cbe9e00b9aefb590493c7230d922d53a3cb70 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sat, 14 Feb 2026 20:05:27 +0100 Subject: [PATCH 15/17] refactor(ingest): replace Content GAT with async SpanStream/SpanEditStream, merge into Handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete the Content trait (with its GAT Iter<'a>) and merge SpanId, SpanData, view_spans, and edit_spans directly into the Handler trait as async methods via #[async_trait]. Span iteration is now exposed through SpanStream (view_stream.rs) and SpanEditStream (edit_stream.rs), thin wrappers around Pin<Box<dyn Stream + Send>>. - Add futures dependency to nvisy-ingest - Create document/view_stream.rs (SpanStream) and document/edit_stream.rs (SpanEditStream) - Rename parse_spans → view_spans; make view_spans/edit_spans async - Privatize TxtSpanIter, CsvSpanIter, JsonSpanIter - Add stub SpanId=()/SpanData=() impls to all 8 format-stub handlers - Convert all 44 handler/loader tests to async Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 25 +- Cargo.toml | 26 +- crates/nvisy-ingest/Cargo.toml | 31 +- crates/nvisy-ingest/src/audio/mod.rs | 4 - crates/nvisy-ingest/src/audio/mp3.rs | 51 -- crates/nvisy-ingest/src/audio/wav.rs | 51 -- crates/nvisy-ingest/src/binary/docx.rs | 168 ----- crates/nvisy-ingest/src/binary/pdf.rs | 166 ----- crates/nvisy-ingest/src/document.rs | 299 --------- .../nvisy-ingest/src/document/edit_stream.rs | 43 ++ crates/nvisy-ingest/src/document/mod.rs | 73 +++ .../nvisy-ingest/src/document/view_stream.rs | 43 ++ crates/nvisy-ingest/src/element.rs | 375 ------------ crates/nvisy-ingest/src/handler.rs | 351 ----------- crates/nvisy-ingest/src/handler/audio/mod.rs | 6 + crates/nvisy-ingest/src/handler/audio/mp3.rs | 32 + crates/nvisy-ingest/src/handler/audio/wav.rs | 32 + .../nvisy-ingest/src/handler/document/docx.rs | 32 + .../src/{binary => handler/document}/mod.rs | 3 +- .../nvisy-ingest/src/handler/document/pdf.rs | 32 + crates/nvisy-ingest/src/handler/encoding.rs | 25 + crates/nvisy-ingest/src/handler/image/jpeg.rs | 32 + crates/nvisy-ingest/src/handler/image/mod.rs | 6 + crates/nvisy-ingest/src/handler/image/png.rs | 32 + crates/nvisy-ingest/src/handler/mod.rs | 87 +++ crates/nvisy-ingest/src/handler/span.rs | 19 + .../nvisy-ingest/src/handler/tabular/mod.rs | 4 + .../nvisy-ingest/src/handler/tabular/xlsx.rs | 32 + .../src/handler/text/csv_handler.rs | 403 ++++++++++++ .../src/handler/text/csv_loader.rs | 234 +++++++ crates/nvisy-ingest/src/handler/text/html.rs | 32 + .../src/handler/text/json_handler.rs | 578 ++++++++++++++++++ .../src/handler/text/json_loader.rs | 186 ++++++ crates/nvisy-ingest/src/handler/text/mod.rs | 10 + .../src/handler/text/txt_handler.rs | 214 +++++++ .../src/handler/text/txt_loader.rs | 136 +++++ crates/nvisy-ingest/src/image/image.rs | 58 -- crates/nvisy-ingest/src/image/mod.rs | 4 - crates/nvisy-ingest/src/lib.rs | 6 - crates/nvisy-ingest/src/prelude.rs | 43 +- crates/nvisy-ingest/src/tabular/mod.rs | 4 - crates/nvisy-ingest/src/tabular/xlsx.rs | 113 ---- crates/nvisy-ingest/src/text/csv.rs | 38 -- crates/nvisy-ingest/src/text/html.rs | 106 ---- 
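
The new `SpanStream`/`SpanEditStream` files created by this commit are not reproduced in this excerpt. A minimal sketch of the "thin wrapper around `Pin<Box<dyn Stream + Send>>`" pattern the message describes might look like the following; the `Span` item type and all names here are illustrative stand-ins, not the crate's actual API.

```rust
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::Stream;

/// Illustrative stand-in for the crate's span type.
pub struct Span {
    pub start: usize,
    pub end: usize,
}

/// Thin wrapper around a boxed, pinned stream of spans, mirroring the
/// SpanStream pattern described in the commit message.
pub struct SpanStream {
    inner: Pin<Box<dyn Stream<Item = Span> + Send>>,
}

impl SpanStream {
    /// Wrap any `Send` stream of spans.
    pub fn new(stream: impl Stream<Item = Span> + Send + 'static) -> Self {
        Self { inner: Box::pin(stream) }
    }
}

impl Stream for SpanStream {
    type Item = Span;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Span>> {
        // Delegate directly to the boxed inner stream.
        self.get_mut().inner.as_mut().poll_next(cx)
    }
}
```

A consumer inside an async handler method would then drain it with `futures::StreamExt::next` in a loop, which is what makes the `view_spans`/`edit_spans` methods usable behind `#[async_trait]`.
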
crates/nvisy-ingest/src/text/json.rs | 42 -- crates/nvisy-ingest/src/text/mod.rs | 8 - crates/nvisy-ingest/src/text/plaintext.rs | 41 -- crates/nvisy-ontology/src/entity/document.rs | 4 +- crates/nvisy-pipeline/Cargo.toml | 4 +- .../src/actions/apply_audio_redaction.rs | 47 -- .../src/actions/apply_image_redaction.rs | 133 ---- .../src/actions/apply_pdf_redaction.rs | 6 +- .../src/actions/apply_redaction.rs | 341 +++++++++-- .../src/actions/apply_tabular_redaction.rs | 103 ---- .../src/actions/detect_dictionary.rs | 6 +- .../src/actions/detect_regex.rs | 2 +- .../src/actions/detect_tabular.rs | 12 +- crates/nvisy-pipeline/src/actions/mod.rs | 9 +- crates/nvisy-pipeline/src/prelude.rs | 6 +- crates/nvisy-python/src/actions/mod.rs | 16 +- crates/nvisy-python/src/actions/ocr.rs | 12 +- 61 files changed, 2696 insertions(+), 2341 deletions(-) delete mode 100644 crates/nvisy-ingest/src/audio/mod.rs delete mode 100644 crates/nvisy-ingest/src/audio/mp3.rs delete mode 100644 crates/nvisy-ingest/src/audio/wav.rs delete mode 100644 crates/nvisy-ingest/src/binary/docx.rs delete mode 100644 crates/nvisy-ingest/src/binary/pdf.rs delete mode 100644 crates/nvisy-ingest/src/document.rs create mode 100644 crates/nvisy-ingest/src/document/edit_stream.rs create mode 100644 crates/nvisy-ingest/src/document/mod.rs create mode 100644 crates/nvisy-ingest/src/document/view_stream.rs delete mode 100644 crates/nvisy-ingest/src/element.rs delete mode 100644 crates/nvisy-ingest/src/handler.rs create mode 100644 crates/nvisy-ingest/src/handler/audio/mod.rs create mode 100644 crates/nvisy-ingest/src/handler/audio/mp3.rs create mode 100644 crates/nvisy-ingest/src/handler/audio/wav.rs create mode 100644 crates/nvisy-ingest/src/handler/document/docx.rs rename crates/nvisy-ingest/src/{binary => handler/document}/mod.rs (64%) create mode 100644 crates/nvisy-ingest/src/handler/document/pdf.rs create mode 100644 crates/nvisy-ingest/src/handler/encoding.rs create mode 100644 crates/nvisy-ingest/src/handler/image/jpeg.rs create mode 100644 crates/nvisy-ingest/src/handler/image/mod.rs create mode 100644 crates/nvisy-ingest/src/handler/image/png.rs create mode 100644 crates/nvisy-ingest/src/handler/mod.rs create mode 100644 crates/nvisy-ingest/src/handler/span.rs create mode 100644 crates/nvisy-ingest/src/handler/tabular/mod.rs create mode 100644 crates/nvisy-ingest/src/handler/tabular/xlsx.rs create mode 100644 crates/nvisy-ingest/src/handler/text/csv_handler.rs create mode 100644 crates/nvisy-ingest/src/handler/text/csv_loader.rs create mode 100644 crates/nvisy-ingest/src/handler/text/html.rs create mode 100644 crates/nvisy-ingest/src/handler/text/json_handler.rs create mode 100644 crates/nvisy-ingest/src/handler/text/json_loader.rs create mode 100644 crates/nvisy-ingest/src/handler/text/mod.rs create mode 100644 crates/nvisy-ingest/src/handler/text/txt_handler.rs create mode 100644 crates/nvisy-ingest/src/handler/text/txt_loader.rs delete mode 100644 crates/nvisy-ingest/src/image/image.rs delete mode 100644 crates/nvisy-ingest/src/image/mod.rs delete mode 100644 crates/nvisy-ingest/src/tabular/mod.rs delete mode 100644 crates/nvisy-ingest/src/tabular/xlsx.rs delete mode 100644 crates/nvisy-ingest/src/text/csv.rs delete mode 100644 crates/nvisy-ingest/src/text/html.rs delete mode 100644 crates/nvisy-ingest/src/text/json.rs delete mode 100644 crates/nvisy-ingest/src/text/mod.rs delete mode 100644 crates/nvisy-ingest/src/text/plaintext.rs delete mode 100644 crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs delete mode 
100644 crates/nvisy-pipeline/src/actions/apply_image_redaction.rs delete mode 100644 crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs diff --git a/Cargo.lock b/Cargo.lock index 4eedef9..b67959a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -578,6 +578,27 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = "dashmap" version = "6.1.0" @@ -2102,13 +2123,15 @@ dependencies = [ "async-trait", "bytes", "calamine", + "csv", + "futures", "image", "infer", "lopdf", "nvisy-core", + "nvisy-ontology", "pdf-extract", "quick-xml 0.37.5", - "schemars", "scraper", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 8d9ff02..f62acce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,19 +53,26 @@ tracing = { version = "0.1", features = [] } # (De)serialization serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", features = [] } +schemars = { version = "1", features = ["uuid1", "bytes1"] } -# Error handling +# Derive macros and error handling thiserror = { version = "2.0", features = [] } anyhow = { version = "1.0", features = [] } derive_more = { version = "1", features = ["display", "from"] } +strum = { version = "0.26", features = ["derive"] } # Primitive datatypes uuid = { version = "1", features = ["serde", "v4", "v7"] } bytes = { version = "1", features = ["serde"] } +jiff = { version = "0.2", features = ["serde"] } +sha2 = { version = "0.10", features = [] } +hex = { version = "0.4", features = [] } # Text processing +hipstr = { version = "0.6", features = [] } regex = { version = "1.0", features = [] } aho-corasick = { version = "1", features = [] } +csv = { version = "1", features = [] } # Graph data structures petgraph = { version = "0.8", features = [] } @@ -73,9 +80,6 @@ petgraph = { version = "0.8", features = [] } # File type detection infer = { version = "0.19", features = [] } -# JSON Schema generation -schemars = { version = "1", features = ["uuid1", "bytes1"] } - # Python interop pyo3 = { version = "0.23", features = [] } @@ -94,23 +98,9 @@ calamine = { version = "0.33", features = [] } zip = { version = "2", features = [] } quick-xml = { version = "0.37", features = [] } - -# Time -jiff = { version = "0.2", features = ["serde"] } - -# Interned strings -hipstr = { version = "0.6", features = [] } - -# Hashing -sha2 = { version = "0.10", features = [] } -hex = { version = "0.4", features = [] } - # Semantic versioning semver = { version = "1", features = ["serde"] } -# Enum derives -strum = { version = "0.26", features = ["derive"] } - # Testing tempfile = { version = "3", features = [] } diff --git a/crates/nvisy-ingest/Cargo.toml b/crates/nvisy-ingest/Cargo.toml index be70fcb..d367962 100644 --- a/crates/nvisy-ingest/Cargo.toml +++ b/crates/nvisy-ingest/Cargo.toml @@ -22,32 +22,40 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = ["pdf", "docx", "html", "xlsx", "image"] -# PDF parsing and text extraction via pdf-extract + lopdf -pdf = ["dep:pdf-extract", "dep:lopdf"] -# Microsoft Word (.docx) parsing via zip + quick-xml 
-docx = ["dep:zip", "dep:quick-xml"] +default = ["pdf", "docx", "html", "xlsx", "image", "wav", "mp3"] +# PDF parsing and text extraction via pdf-extract + lopdf; enables png for extracted images +pdf = ["dep:pdf-extract", "dep:lopdf", "png"] +# Microsoft Word (.docx) parsing via zip + quick-xml; enables image formats for extracted images +docx = ["dep:zip", "dep:quick-xml", "jpeg", "png"] # HTML parsing and text extraction via scraper html = ["dep:scraper"] # Excel (.xlsx) spreadsheet parsing via calamine xlsx = ["dep:calamine"] -# Image decoding (PNG, JPEG, TIFF) via the image crate -image = ["dep:image"] +# Convenience alias: all image formats +image = ["jpeg", "png"] +# Individual image format handlers (each requires dep:image) +jpeg = ["dep:image"] +png = ["dep:image"] +# Audio format handlers (no additional dependencies) +wav = [] +mp3 = [] [dependencies] # Internal crates nvisy-core = { workspace = true, features = [] } - -# JSON Schema generation -schemars = { workspace = true, features = [] } +nvisy-ontology = { workspace = true, features = [] } # (De)serialization serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = [] } +# Text parsing +csv = { workspace = true, features = [] } + # Async runtime tokio = { workspace = true, features = ["sync"] } async-trait = { workspace = true, features = [] } +futures = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = ["v4"] } @@ -67,3 +75,6 @@ quick-xml = { workspace = true, optional = true, features = [] } scraper = { workspace = true, optional = true, features = [] } calamine = { workspace = true, optional = true, features = [] } image = { workspace = true, optional = true, features = [] } + +[dev-dependencies] +tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-ingest/src/audio/mod.rs b/crates/nvisy-ingest/src/audio/mod.rs deleted file mode 100644 index 1849018..0000000 --- a/crates/nvisy-ingest/src/audio/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! Audio file loaders. - -pub mod wav; -pub mod mp3; diff --git a/crates/nvisy-ingest/src/audio/mp3.rs b/crates/nvisy-ingest/src/audio/mp3.rs deleted file mode 100644 index 445003c..0000000 --- a/crates/nvisy-ingest/src/audio/mp3.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! MP3 audio file loader. -//! -//! Returns a document with metadata only -- audio redaction is not yet implemented. - -use serde::Deserialize; - -use nvisy_core::io::ContentData; -use nvisy_core::error::Error; - -use crate::document::Document; -use crate::handler::{Mp3Handler, FormatHandler, AudioLoader}; - -/// Typed parameters for [`Mp3Loader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct Mp3LoaderParams {} - -/// Placeholder loader for MP3 audio files. Returns a metadata-only document. -pub struct Mp3Loader; - -impl Clone for Mp3Loader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl AudioLoader for Mp3Loader { - type Params = Mp3LoaderParams; - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let content_type = content.content_type().unwrap_or("audio/mpeg").to_string(); - let size = content.to_bytes().len(); - - let mut doc = Document::new(Mp3Handler) - .with_text(format!( - "[Audio file: type={}, size={} bytes. 
Audio redaction not yet implemented.]", - content_type, size - )); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for Mp3Loader { - fn id(&self) -> &str { Mp3Handler.id() } - fn extensions(&self) -> &[&str] { Mp3Handler.extensions() } - fn content_types(&self) -> &[&str] { Mp3Handler.content_types() } -} diff --git a/crates/nvisy-ingest/src/audio/wav.rs b/crates/nvisy-ingest/src/audio/wav.rs deleted file mode 100644 index fa9feab..0000000 --- a/crates/nvisy-ingest/src/audio/wav.rs +++ /dev/null @@ -1,51 +0,0 @@ -//! WAV audio file loader. -//! -//! Returns a document with metadata only -- audio redaction is not yet implemented. - -use serde::Deserialize; - -use nvisy_core::io::ContentData; -use nvisy_core::error::Error; - -use crate::document::Document; -use crate::handler::{WavHandler, FormatHandler, AudioLoader}; - -/// Typed parameters for [`WavLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct WavLoaderParams {} - -/// Placeholder loader for WAV audio files. Returns a metadata-only document. -pub struct WavLoader; - -impl Clone for WavLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl AudioLoader for WavLoader { - type Params = WavLoaderParams; - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let content_type = content.content_type().unwrap_or("audio/wav").to_string(); - let size = content.to_bytes().len(); - - let mut doc = Document::new(WavHandler) - .with_text(format!( - "[Audio file: type={}, size={} bytes. Audio redaction not yet implemented.]", - content_type, size - )); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for WavLoader { - fn id(&self) -> &str { WavHandler.id() } - fn extensions(&self) -> &[&str] { WavHandler.extensions() } - fn content_types(&self) -> &[&str] { WavHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/binary/docx.rs b/crates/nvisy-ingest/src/binary/docx.rs deleted file mode 100644 index 98d61b6..0000000 --- a/crates/nvisy-ingest/src/binary/docx.rs +++ /dev/null @@ -1,168 +0,0 @@ -//! DOCX (Office Open XML) file loader. - -use bytes::Bytes; -use serde::Deserialize; -use std::io::Cursor; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::element::{Element, ElementType}; -use crate::handler::{DocxHandler, ImageHandler, FormatHandler, BinaryLoader}; - -/// Typed parameters for [`DocxLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct DocxLoaderParams { - /// Whether to extract embedded images. - #[serde(default = "default_true")] - pub extract_images: bool, -} - -fn default_true() -> bool { - true -} - -/// Extracts text and optionally images from DOCX files. 
-pub struct DocxLoader; - -impl Clone for DocxLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl BinaryLoader for DocxLoader { - type Params = DocxLoaderParams; - - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let cursor = Cursor::new(content.to_bytes().to_vec()); - let mut archive = zip::ZipArchive::new(cursor).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Failed to open DOCX ZIP: {e}")) - })?; - - let mut documents = Vec::new(); - let mut elements = Vec::new(); - let mut full_text = String::new(); - - // Parse word/document.xml - if let Ok(mut entry) = archive.by_name("word/document.xml") { - let mut xml_content = String::new(); - std::io::Read::read_to_string(&mut entry, &mut xml_content).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Failed to read document.xml: {e}")) - })?; - - let mut reader = quick_xml::Reader::from_str(&xml_content); - let mut in_text = false; - let mut in_heading = false; - let mut current_text = String::new(); - let mut buf = Vec::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(quick_xml::events::Event::Start(ref e)) => { - match e.name().as_ref() { - b"w:t" => in_text = true, - b"w:pStyle" => { - for attr in e.attributes().flatten() { - if attr.key.as_ref() == b"w:val" { - let val = String::from_utf8_lossy(&attr.value); - if val.starts_with("Heading") { - in_heading = true; - } - } - } - } - _ => {} - } - } - Ok(quick_xml::events::Event::End(ref e)) => { - match e.name().as_ref() { - b"w:t" => in_text = false, - b"w:p" => { - if !current_text.is_empty() { - let element_type = if in_heading { - ElementType::Title - } else { - ElementType::NarrativeText - }; - elements.push(Element::new(element_type, ¤t_text)); - if !full_text.is_empty() { - full_text.push('\n'); - } - full_text.push_str(¤t_text); - current_text.clear(); - in_heading = false; - } - } - _ => {} - } - } - Ok(quick_xml::events::Event::Text(ref e)) => { - if in_text { - let text = e.unescape().unwrap_or_default(); - current_text.push_str(&text); - } - } - Ok(quick_xml::events::Event::Eof) => break, - Err(e) => { - tracing::warn!("DOCX XML parse error: {e}"); - break; - } - _ => {} - } - buf.clear(); - } - } - - let mut doc = Document::new(DocxHandler) - .with_text(full_text) - .with_elements(elements); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(doc.into_format()); - - // Extract images from word/media/ - if params.extract_images { - let media_names: Vec<String> = (0..archive.len()) - .filter_map(|i| { - let entry = archive.by_index(i).ok()?; - let name = entry.name().to_string(); - if name.starts_with("word/media/") { - Some(name) - } else { - None - } - }) - .collect(); - - for name in media_names { - if let Ok(mut entry) = archive.by_name(&name) { - let mut buf = Vec::new(); - std::io::Read::read_to_end(&mut entry, &mut buf).ok(); - if !buf.is_empty() { - let mime = infer::get(&buf) - .map(|t| t.mime_type().to_string()) - .unwrap_or_else(|| "image/png".to_string()); - let mut img = Document::new(ImageHandler) - .with_data(Bytes::from(buf), mime) - .with_source_path(&name); - img.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(img.into_format()); - } - } - } - } - - Ok(documents) - } -} - -impl crate::handler::Handler for DocxLoader { - fn id(&self) -> &str { DocxHandler.id() } - fn extensions(&self) -> &[&str] { DocxHandler.extensions() } - fn content_types(&self) -> &[&str] 
{ DocxHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/binary/pdf.rs b/crates/nvisy-ingest/src/binary/pdf.rs deleted file mode 100644 index 982f70a..0000000 --- a/crates/nvisy-ingest/src/binary/pdf.rs +++ /dev/null @@ -1,166 +0,0 @@ -//! PDF file loader using `pdf-extract` and `lopdf`. - -use bytes::Bytes; -use serde::Deserialize; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::handler::{PdfHandler, ImageHandler, FormatHandler, BinaryLoader}; - -/// Typed parameters for [`PdfLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct PdfLoaderParams { - /// Whether to extract embedded images from the PDF. - #[serde(default = "default_true")] - pub extract_images: bool, - /// Maximum number of pages to process. `None` means all pages. - #[serde(default)] - pub max_pages: Option<u32>, -} - -fn default_true() -> bool { - true -} - -/// Extracts text and optionally images from PDF files. -pub struct PdfLoader; - -impl Clone for PdfLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl BinaryLoader for PdfLoader { - type Params = PdfLoaderParams; - - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let bytes = content.to_bytes().to_vec(); - let mut documents = Vec::new(); - - // Extract text - let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF text extraction failed: {e}")) - })?; - - let lop_doc = lopdf::Document::load_mem(&bytes).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF parsing failed: {e}")) - })?; - - let page_count = lop_doc.get_pages().len() as u32; - - let mut doc = Document::new(PdfHandler) - .with_text(text) - .with_page_count(page_count); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(doc.into_format()); - - // Extract embedded images - if params.extract_images { - let max_pages = params.max_pages.unwrap_or(page_count); - for (page_num, page_id) in lop_doc.get_pages() { - if page_num > max_pages { - break; - } - - let (resources_opt, _) = match lop_doc.get_page_resources(page_id) { - Ok(r) => r, - Err(_) => continue, - }; - - let resources = match resources_opt { - Some(res) => res, - None => continue, - }; - - let xobject_obj = match resources.get(b"XObject") { - Ok(obj) => obj, - Err(_) => continue, - }; - - let xobjects = match lop_doc.dereference(xobject_obj) { - Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(), - _ => continue, - }; - - for (_name, obj_ref) in xobjects.iter() { - let stream = match lop_doc.dereference(obj_ref) { - Ok((_, lopdf::Object::Stream(s))) => s.clone(), - _ => continue, - }; - - let is_image = stream - .dict - .get(b"Subtype") - .ok() - .and_then(|s| { - if let lopdf::Object::Name(n) = s { - Some(n.as_slice() == b"Image") - } else { - None - } - }) - .unwrap_or(false); - - if !is_image { - continue; - } - - let image_bytes = stream.content.clone(); - if image_bytes.is_empty() { - continue; - } - - let width = stream - .dict - .get(b"Width") - .ok() - .and_then(|w| { - if let lopdf::Object::Integer(i) = w { - Some(*i as u32) - } else { - None - } - }); - - let height = stream - .dict - .get(b"Height") - .ok() - .and_then(|h| { - if let lopdf::Object::Integer(i) = h { - Some(*i as u32) - } else { - None - } - }); - - let mut img = Document::new(ImageHandler) - .with_data(Bytes::from(image_bytes), 
"image/png") - .with_page_number(page_num); - - if let (Some(w), Some(h)) = (width, height) { - img = img.with_dimensions(w, h); - } - - img.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(img.into_format()); - } - } - } - - Ok(documents) - } -} - -impl crate::handler::Handler for PdfLoader { - fn id(&self) -> &str { PdfHandler.id() } - fn extensions(&self) -> &[&str] { PdfHandler.extensions() } - fn content_types(&self) -> &[&str] { PdfHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/document.rs b/crates/nvisy-ingest/src/document.rs deleted file mode 100644 index 6aa19a1..0000000 --- a/crates/nvisy-ingest/src/document.rs +++ /dev/null @@ -1,299 +0,0 @@ -//! Unified document representation for any handleable content. - -use bytes::Bytes; -use nvisy_core::path::ContentSource; -use serde::Serialize; - -use crate::element::Element; -use crate::handler::{FormatHandler, Handler}; - -/// A unified representation of any content that can be handled by the pipeline. -/// -/// `Document` is generic over `H`, a [`Handler`] that describes the source -/// format. For heterogeneous collections, use `Document<FormatHandler>`. -/// -/// Fields are grouped by content modality: -/// - **Text** (`content`, `title`, `elements`, `page_count`) — for PDF, DOCX, HTML, etc. -/// - **Binary/image** (`data`, `mime_type`, `width`, `height`, etc.) — for images and raw bytes. -/// - **Tabular** (`columns`, `rows`, `sheet_name`) — for CSV, XLSX. -#[derive(Debug, Clone)] -pub struct Document<H: Handler> { - /// Content source identity and lineage. - pub source: ContentSource, - - // -- Text content (from text, PDF, DOCX, HTML, etc.) -- - - /// Full text content, if applicable. - pub content: Option<String>, - /// Document title, if extracted. - pub title: Option<String>, - /// Structural elements parsed from the document. - pub elements: Option<Vec<Element>>, - /// Total page count for paginated formats. - pub page_count: Option<u32>, - - // -- Binary/image content -- - - /// Raw binary data (image bytes, audio bytes, etc.). - pub data: Option<Bytes>, - /// MIME type of the data (e.g. `"image/png"`, `"audio/wav"`). - pub mime_type: Option<String>, - /// Width in pixels (images). - pub width: Option<u32>, - /// Height in pixels (images). - pub height: Option<u32>, - /// File path or URL the content was loaded from. - pub source_path: Option<String>, - /// 1-based page number this was extracted from. - pub page_number: Option<u32>, - - // -- Tabular content -- - - /// Column header names. - pub columns: Option<Vec<String>>, - /// Row data (each inner Vec same length as columns). - pub rows: Option<Vec<Vec<String>>>, - /// Sheet or tab name within a multi-sheet workbook. - pub sheet_name: Option<String>, - - /// Format handler (not serialized). - handler: H, -} - -impl<H: Handler> Document<H> { - /// Create a new empty document with the given handler. - pub fn new(handler: H) -> Self { - Self { - source: ContentSource::new(), - content: None, - title: None, - elements: None, - page_count: None, - data: None, - mime_type: None, - width: None, - height: None, - source_path: None, - page_number: None, - columns: None, - rows: None, - sheet_name: None, - handler, - } - } - - /// Get a reference to the format handler. - pub fn handler(&self) -> &H { - &self.handler - } - - /// Original file format identifier (delegates to `handler.id()`). - pub fn source_format(&self) -> &str { - self.handler.id() - } - - // -- Builder methods -- - - /// Set text content. 
- pub fn with_text(mut self, content: impl Into<String>) -> Self { - self.content = Some(content.into()); - self - } - - /// Set binary data and MIME type. - pub fn with_data(mut self, data: impl Into<Bytes>, mime: impl Into<String>) -> Self { - self.data = Some(data.into()); - self.mime_type = Some(mime.into()); - self - } - - /// Set tabular content (columns + rows). - pub fn with_tabular(mut self, columns: Vec<String>, rows: Vec<Vec<String>>) -> Self { - self.columns = Some(columns); - self.rows = Some(rows); - self - } - - /// Set the document title. - pub fn with_title(mut self, title: impl Into<String>) -> Self { - self.title = Some(title.into()); - self - } - - /// Attach parsed structural elements. - pub fn with_elements(mut self, elements: Vec<Element>) -> Self { - self.elements = Some(elements); - self - } - - /// Set the total page count. - pub fn with_page_count(mut self, count: u32) -> Self { - self.page_count = Some(count); - self - } - - /// Set pixel dimensions (images). - pub fn with_dimensions(mut self, width: u32, height: u32) -> Self { - self.width = Some(width); - self.height = Some(height); - self - } - - /// Set the source file path or URL. - pub fn with_source_path(mut self, path: impl Into<String>) -> Self { - self.source_path = Some(path.into()); - self - } - - /// Set the 1-based page number this was extracted from. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } - - /// Set the sheet/tab name for tabular data. - pub fn with_sheet_name(mut self, name: impl Into<String>) -> Self { - self.sheet_name = Some(name.into()); - self - } - - /// Convert into a `Document<FormatHandler>` by wrapping the handler. - pub fn into_format(self) -> Document<FormatHandler> - where - H: Into<FormatHandler>, - { - Document { - source: self.source, - content: self.content, - title: self.title, - elements: self.elements, - page_count: self.page_count, - data: self.data, - mime_type: self.mime_type, - width: self.width, - height: self.height, - source_path: self.source_path, - page_number: self.page_number, - columns: self.columns, - rows: self.rows, - sheet_name: self.sheet_name, - handler: self.handler.into(), - } - } - - /// Unique BCP-47 language tags collected from all elements. - pub fn languages(&self) -> Vec<String> { - let mut langs = Vec::new(); - if let Some(elements) = &self.elements { - for el in elements { - if let Some(ref element_langs) = el.languages { - for lang in element_langs { - if !langs.contains(lang) { - langs.push(lang.clone()); - } - } - } - } - } - langs - } - - /// Group elements by their 1-based page number. - /// Elements without a page_number are collected under key 0. - pub fn get_elements_by_page(&self) -> std::collections::HashMap<u32, Vec<&Element>> { - let mut map = std::collections::HashMap::new(); - if let Some(elements) = &self.elements { - for el in elements { - let page = el.page_number.unwrap_or(0); - map.entry(page).or_insert_with(Vec::new).push(el); - } - } - map - } - - /// Create a Document by deriving content from element texts joined with "\n\n". 
- pub fn from_elements(elements: Vec<Element>, handler: H) -> Self { - let content = elements - .iter() - .map(|e| e.text.as_str()) - .collect::<Vec<_>>() - .join("\n\n"); - let mut doc = Self::new(handler); - doc.content = Some(content); - doc.elements = Some(elements); - doc - } -} - -// --------------------------------------------------------------------------- -// Serialization -// --------------------------------------------------------------------------- - -impl<H: Handler> Serialize for Document<H> { - fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { - use serde::ser::SerializeStruct; - - // Count always-present fields - let mut count = 3; // id + parent_id + source_format - if self.content.is_some() { count += 1; } - if self.title.is_some() { count += 1; } - if self.elements.is_some() { count += 1; } - if self.page_count.is_some() { count += 1; } - if self.data.is_some() { count += 1; } - if self.mime_type.is_some() { count += 1; } - if self.width.is_some() { count += 1; } - if self.height.is_some() { count += 1; } - if self.source_path.is_some() { count += 1; } - if self.page_number.is_some() { count += 1; } - if self.columns.is_some() { count += 1; } - if self.rows.is_some() { count += 1; } - if self.sheet_name.is_some() { count += 1; } - - let mut state = serializer.serialize_struct("Document", count)?; - state.serialize_field("id", &self.source.as_uuid())?; - state.serialize_field("parent_id", &self.source.parent_id())?; - state.serialize_field("source_format", self.handler.id())?; - - if let Some(ref content) = self.content { - state.serialize_field("content", content)?; - } - if let Some(ref title) = self.title { - state.serialize_field("title", title)?; - } - if let Some(ref elements) = self.elements { - state.serialize_field("elements", elements)?; - } - if let Some(page_count) = self.page_count { - state.serialize_field("page_count", &page_count)?; - } - if let Some(ref data) = self.data { - state.serialize_field("data", data.as_ref())?; - } - if let Some(ref mime_type) = self.mime_type { - state.serialize_field("mime_type", mime_type)?; - } - if let Some(width) = self.width { - state.serialize_field("width", &width)?; - } - if let Some(height) = self.height { - state.serialize_field("height", &height)?; - } - if let Some(ref source_path) = self.source_path { - state.serialize_field("source_path", source_path)?; - } - if let Some(page_number) = self.page_number { - state.serialize_field("page_number", &page_number)?; - } - if let Some(ref columns) = self.columns { - state.serialize_field("columns", columns)?; - } - if let Some(ref rows) = self.rows { - state.serialize_field("rows", rows)?; - } - if let Some(ref sheet_name) = self.sheet_name { - state.serialize_field("sheet_name", sheet_name)?; - } - - state.end() - } -} diff --git a/crates/nvisy-ingest/src/document/edit_stream.rs b/crates/nvisy-ingest/src/document/edit_stream.rs new file mode 100644 index 0000000..c520da1 --- /dev/null +++ b/crates/nvisy-ingest/src/document/edit_stream.rs @@ -0,0 +1,43 @@ +//! Async span edit stream for [`Handler::edit_spans`]. +//! +//! [`Handler::edit_spans`]: crate::handler::Handler::edit_spans + +use std::pin::Pin; +use std::task::{Context, Poll}; + +use futures::Stream; + +use crate::handler::span::SpanEdit; + +/// Async stream of edits consumed by [`Handler::edit_spans`]. +/// +/// Wraps a `Pin<Box<dyn Stream>>` so that callers can pass any +/// iterator/stream of edits without exposing a concrete type. 
+/// +/// [`Handler::edit_spans`]: crate::handler::Handler::edit_spans +pub struct SpanEditStream<'a, Id, Data> { + inner: Pin<Box<dyn Stream<Item = SpanEdit<Id, Data>> + Send + 'a>>, +} + +impl<'a, Id, Data> SpanEditStream<'a, Id, Data> { + /// Wrap any `Send` stream of span edits. + pub fn new(stream: impl Stream<Item = SpanEdit<Id, Data>> + Send + 'a) -> Self { + Self { + inner: Box::pin(stream), + } + } +} + +impl<Id, Data> Unpin for SpanEditStream<'_, Id, Data> {} + +impl<Id, Data> Stream for SpanEditStream<'_, Id, Data> { + type Item = SpanEdit<Id, Data>; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { + self.inner.as_mut().poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.inner.size_hint() + } +} diff --git a/crates/nvisy-ingest/src/document/mod.rs b/crates/nvisy-ingest/src/document/mod.rs new file mode 100644 index 0000000..c05c411 --- /dev/null +++ b/crates/nvisy-ingest/src/document/mod.rs @@ -0,0 +1,73 @@ +//! Unified document representation. + +pub mod view_stream; +pub mod edit_stream; + +use nvisy_core::io::ContentData; +use nvisy_core::path::ContentSource; +use nvisy_ontology::entity::DocumentType; + +use crate::handler::Handler; + +/// A unified representation of any content that can be handled by the pipeline. +/// +/// `Document` is generic over `H`, a [`Handler`] that holds the loaded data +/// and provides methods to read and manipulate it. +#[derive(Debug)] +pub struct Document<H: Handler> { + /// Content source identity and lineage. + pub source: ContentSource, + /// 1-based page number this was extracted from. + pub page_number: Option<u32>, + + /// Format handler (holds the loaded data). + handler: H, +} + +impl<H: Handler + Clone> Clone for Document<H> { + fn clone(&self) -> Self { + Self { + source: self.source, + page_number: self.page_number, + handler: self.handler.clone(), + } + } +} + +impl<H: Handler> Document<H> { + /// Create a new document with the given handler. + pub fn new(handler: H) -> Self { + Self { + source: ContentSource::new(), + page_number: None, + handler, + } + } + + /// Get a reference to the format handler. + pub fn handler(&self) -> &H { + &self.handler + } + + /// Get a mutable reference to the format handler. + pub fn handler_mut(&mut self) -> &mut H { + &mut self.handler + } + + /// The document type of the loaded content. + pub fn document_type(&self) -> DocumentType { + self.handler.document_type() + } + + /// Set the 1-based page number this was extracted from. + pub fn with_page_number(mut self, page: u32) -> Self { + self.page_number = Some(page); + self + } + + /// Set this document's parent to the given content source. + pub fn with_parent(mut self, content: &ContentData) -> Self { + self.source.set_parent_id(Some(content.content_source.as_uuid())); + self + } +} diff --git a/crates/nvisy-ingest/src/document/view_stream.rs b/crates/nvisy-ingest/src/document/view_stream.rs new file mode 100644 index 0000000..c17e52d --- /dev/null +++ b/crates/nvisy-ingest/src/document/view_stream.rs @@ -0,0 +1,43 @@ +//! Async span stream for [`Handler::view_spans`]. +//! +//! [`Handler::view_spans`]: crate::handler::Handler::view_spans + +use std::pin::Pin; +use std::task::{Context, Poll}; + +use futures::Stream; + +use crate::handler::span::Span; + +/// Async stream of spans returned by [`Handler::view_spans`]. +/// +/// Wraps a `Pin<Box<dyn Stream>>` so that handler implementations +/// can return any iterator/stream without exposing a concrete type. 
+/// +/// [`Handler::view_spans`]: crate::handler::Handler::view_spans +pub struct SpanStream<'a, Id, Data> { + inner: Pin<Box<dyn Stream<Item = Span<Id, Data>> + Send + 'a>>, +} + +impl<'a, Id, Data> SpanStream<'a, Id, Data> { + /// Wrap any `Send` stream of spans. + pub fn new(stream: impl Stream<Item = Span<Id, Data>> + Send + 'a) -> Self { + Self { + inner: Box::pin(stream), + } + } +} + +impl<Id, Data> Unpin for SpanStream<'_, Id, Data> {} + +impl<Id, Data> Stream for SpanStream<'_, Id, Data> { + type Item = Span<Id, Data>; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> { + self.inner.as_mut().poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.inner.size_hint() + } +} diff --git a/crates/nvisy-ingest/src/element.rs b/crates/nvisy-ingest/src/element.rs deleted file mode 100644 index b942597..0000000 --- a/crates/nvisy-ingest/src/element.rs +++ /dev/null @@ -1,375 +0,0 @@ -//! Structural elements extracted from documents and their ontology. - -use serde::{Deserialize, Serialize}; -use uuid::Uuid; - -/// General-purpose metadata map. -pub type Metadata = serde_json::Map<String, serde_json::Value>; - -// --------------------------------------------------------------------------- -// Element ontology -// --------------------------------------------------------------------------- - -/// Broad grouping of element types. -/// -/// Every [`ElementType`] belongs to exactly one category, providing -/// a coarse filter for pipeline actions that only operate on certain -/// kinds of content. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "snake_case")] -pub enum ElementCategory { - /// Narrative text, headings, list items, captions, and addresses. - Text, - /// Tabular data. - Table, - /// Images and other media content. - Media, - /// Source code fragments. - Code, - /// Mathematical formulae. - Math, - /// Form elements such as checkboxes and key-value fields. - Form, - /// Layout markers like page breaks and page numbers. - Layout, - /// Email message content. - Email, -} - -/// Specific structural element type extracted from a document. -/// -/// Each variant maps to a single [`ElementCategory`] via -/// [`ElementType::category`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -#[serde(rename_all = "kebab-case")] -pub enum ElementType { - // -- Text -- - - /// A document title or section heading. - Title, - /// A block of narrative prose. - NarrativeText, - /// An item within a bulleted or numbered list. - ListItem, - /// A page or section header. - Header, - /// A page or section footer. - Footer, - /// Caption text associated with a figure. - FigureCaption, - /// A physical or mailing address. - Address, - /// Text that does not fit any other text category. - UncategorizedText, - - // -- Table -- - - /// A data table with rows and columns. - Table, - - // -- Media -- - - /// An embedded image. - Image, - - // -- Code -- - - /// A source code snippet or block. - CodeSnippet, - - // -- Math -- - - /// A mathematical formula or equation. - Formula, - - // -- Form -- - - /// A checkbox form control. - Checkbox, - /// A set of key-value pairs extracted from a form. - FormKeysValues, - - // -- Layout -- - - /// A page break marker. - PageBreak, - /// A page number indicator. - PageNumber, - - // -- Email -- - - /// An email message body and headers. 
- EmailMessage, -} - -impl ElementType { - /// Return the category this element type belongs to. - pub fn category(&self) -> ElementCategory { - match self { - Self::Title - | Self::NarrativeText - | Self::ListItem - | Self::Header - | Self::Footer - | Self::FigureCaption - | Self::Address - | Self::UncategorizedText => ElementCategory::Text, - Self::Table => ElementCategory::Table, - Self::Image => ElementCategory::Media, - Self::CodeSnippet => ElementCategory::Code, - Self::Formula => ElementCategory::Math, - Self::Checkbox | Self::FormKeysValues => ElementCategory::Form, - Self::PageBreak | Self::PageNumber => ElementCategory::Layout, - Self::EmailMessage => ElementCategory::Email, - } - } -} - -/// Parse an element type string and return its category. -/// -/// Returns `None` if the string does not match any known [`ElementType`]. -pub fn category_of(type_str: &str) -> Option<ElementCategory> { - let et: ElementType = - serde_json::from_value(serde_json::Value::String(type_str.to_string())).ok()?; - Some(et.category()) -} - -// --------------------------------------------------------------------------- -// Structural elements -// --------------------------------------------------------------------------- - -/// An inline hyperlink within element text. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct Link { - /// Display text of the hyperlink. - pub text: String, - /// Target URL of the hyperlink. - pub url: String, - /// Character offset where the link text begins in the parent element. - pub start_index: usize, -} - -/// An inline formatting span within element text. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct EmphasizedText { - /// The emphasized text content. - pub text: String, - /// HTML tag name describing the emphasis (e.g. `"b"`, `"i"`, `"em"`). - pub tag: String, -} - -/// A single cell within a table structure. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct TableCellData { - /// Zero-based row index. - pub row: usize, - /// Zero-based column index. - pub column: usize, - /// Text content of the cell. - pub text: String, - /// Whether this cell is a header cell. - #[serde(skip_serializing_if = "Option::is_none")] - pub is_header: Option<bool>, -} - -/// Extraction or OCR provenance data for an element. -/// -/// Records how an element was detected and any extraction -/// confidence metadata. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct ElementProvenance { - /// Confidence score of the extraction (0.0 to 1.0). - #[serde(skip_serializing_if = "Option::is_none")] - pub confidence: Option<f64>, - /// Name of the extraction engine or model that produced this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub detection_origin: Option<String>, - /// Whether this element continues from a previous element split across pages. - #[serde(skip_serializing_if = "Option::is_none")] - pub is_continuation: Option<bool>, - /// Type of header or footer (e.g. `"primary"`, `"footnote"`), if applicable. - #[serde(skip_serializing_if = "Option::is_none")] - pub header_footer_type: Option<String>, -} - -/// Structured key-value pair extracted from a form. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct FormKeyValuePair { - /// Form field label or key. - pub key: String, - /// Form field value, if one was extracted. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub value: Option<String>, - /// Extraction confidence for this key-value pair. - #[serde(skip_serializing_if = "Option::is_none")] - pub confidence: Option<f64>, -} - -/// A single structural element extracted from a document. -/// -/// Combines base element fields with optional type-specific fields -/// (image, table, form, email) in a flat struct rather than inheritance. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[derive(schemars::JsonSchema)] -pub struct Element { - /// Unique identifier for this element. - pub id: Uuid, - /// The structural type of this element. - #[serde(rename = "type")] - pub element_type: ElementType, - /// Plain-text content of the element. - pub text: String, - - /// Identifier of the parent element (for nested structures). - #[serde(skip_serializing_if = "Option::is_none")] - pub parent_id: Option<Uuid>, - /// 1-based page number where this element appears. - #[serde(skip_serializing_if = "Option::is_none")] - pub page_number: Option<u32>, - /// Named page or sheet label (e.g. worksheet name in a spreadsheet). - #[serde(skip_serializing_if = "Option::is_none")] - pub page_name: Option<String>, - /// Heading level (1-6) for title or header elements. - #[serde(skip_serializing_if = "Option::is_none")] - pub level: Option<u32>, - /// BCP-47 language tags detected in this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub languages: Option<Vec<String>>, - /// Arbitrary metadata associated with this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option<Metadata>, - /// Tag identifying the extraction source or pipeline stage. - #[serde(skip_serializing_if = "Option::is_none")] - pub source_tag: Option<String>, - /// HTML representation of the element's text with inline formatting. - #[serde(skip_serializing_if = "Option::is_none")] - pub text_as_html: Option<String>, - /// Inline hyperlinks found within this element's text. - #[serde(skip_serializing_if = "Option::is_none")] - pub links: Option<Vec<Link>>, - /// Inline formatting spans (bold, italic, etc.) within this element. - #[serde(skip_serializing_if = "Option::is_none")] - pub emphasized_texts: Option<Vec<EmphasizedText>>, - /// Extraction or OCR provenance information. - #[serde(skip_serializing_if = "Option::is_none")] - pub provenance: Option<ElementProvenance>, - - // -- Image-specific fields (when element_type is Image) -- - - /// Base64-encoded image data. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_base64: Option<String>, - /// MIME type of the embedded image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_mime_type: Option<String>, - /// Remote URL of the image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_url: Option<String>, - /// Local file path of the image. - #[serde(skip_serializing_if = "Option::is_none")] - pub image_path: Option<String>, - - // -- Table-specific fields (when element_type is Table) -- - - /// Individual table cells with row/column coordinates. - #[serde(skip_serializing_if = "Option::is_none")] - pub cells: Option<Vec<TableCellData>>, - - // -- Form-specific fields (when element_type is Checkbox/FormKeysValues) -- - - /// Whether a checkbox is checked. - #[serde(skip_serializing_if = "Option::is_none")] - pub checked: Option<bool>, - /// Value of a form field. - #[serde(skip_serializing_if = "Option::is_none")] - pub value: Option<String>, - /// Structured key-value pairs extracted from a form. 
- #[serde(skip_serializing_if = "Option::is_none")] - pub key_value_pairs: Option<Vec<FormKeyValuePair>>, - - // -- Email-specific fields (when element_type is EmailMessage) -- - - /// Sender addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub sent_from: Option<Vec<String>>, - /// Primary recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub sent_to: Option<Vec<String>>, - /// CC recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub cc_recipient: Option<Vec<String>>, - /// BCC recipient addresses. - #[serde(skip_serializing_if = "Option::is_none")] - pub bcc_recipient: Option<Vec<String>>, - /// Email subject line. - #[serde(skip_serializing_if = "Option::is_none")] - pub subject: Option<String>, - /// Email signature block. - #[serde(skip_serializing_if = "Option::is_none")] - pub signature: Option<String>, - /// RFC 2822 Message-ID of the email. - #[serde(skip_serializing_if = "Option::is_none")] - pub email_message_id: Option<String>, -} - -impl Element { - /// Create a new element with the given type and text content. - pub fn new(element_type: ElementType, text: impl Into<String>) -> Self { - Self { - id: Uuid::new_v4(), - element_type, - text: text.into(), - parent_id: None, - page_number: None, - page_name: None, - level: None, - languages: None, - metadata: None, - source_tag: None, - text_as_html: None, - links: None, - emphasized_texts: None, - provenance: None, - image_base64: None, - image_mime_type: None, - image_url: None, - image_path: None, - cells: None, - checked: None, - value: None, - key_value_pairs: None, - sent_from: None, - sent_to: None, - cc_recipient: None, - bcc_recipient: None, - subject: None, - signature: None, - email_message_id: None, - } - } - - /// Set the 1-based page number for this element. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } - - /// Set the heading level (1-6) for title or header elements. - pub fn with_level(mut self, level: u32) -> Self { - self.level = Some(level); - self - } - - /// Set BCP-47 language tags detected in this element. - pub fn with_languages(mut self, langs: Vec<String>) -> Self { - self.languages = Some(langs); - self - } -} diff --git a/crates/nvisy-ingest/src/handler.rs b/crates/nvisy-ingest/src/handler.rs deleted file mode 100644 index 80886ae..0000000 --- a/crates/nvisy-ingest/src/handler.rs +++ /dev/null @@ -1,351 +0,0 @@ -//! Handler trait, format handler enum, and loader traits. -//! -//! The [`Handler`] supertrait defines metadata shared by all format handlers. -//! The closed [`FormatHandler`] enum provides type erasure so that -//! `Document<FormatHandler>` can represent any supported format in -//! heterogeneous collections. -//! -//! Loader traits ([`TextLoader`], [`BinaryLoader`], [`ImageLoader`], -//! [`SpreadsheetLoader`], [`AudioLoader`]) extend `Handler` with a typed -//! `load()` method that returns `Vec<Document<Self>>`. - -use nvisy_core::error::Error; -use nvisy_core::io::ContentData; - -use crate::document::Document; - -// --------------------------------------------------------------------------- -// Handler supertrait -// --------------------------------------------------------------------------- - -/// Base trait for all format handlers. -/// -/// Every concrete handler (e.g. `CsvHandler`, `PdfHandler`) implements this -/// trait, providing an identifier, supported file extensions, and MIME types. -pub trait Handler: Send + Sync + Clone + 'static { - /// Unique identifier (e.g. 
`"csv"`, `"pdf"`, `"wav"`). - fn id(&self) -> &str; - /// File extensions this handler supports (e.g. `&["csv"]`). - fn extensions(&self) -> &[&str]; - /// MIME content types this handler supports (e.g. `&["text/csv"]`). - fn content_types(&self) -> &[&str]; -} - -// --------------------------------------------------------------------------- -// Concrete handler structs -// --------------------------------------------------------------------------- - -/// Handles plain-text files (`.txt`, `.text`). -#[derive(Debug, Clone)] -pub struct PlaintextHandler; - -impl Handler for PlaintextHandler { - fn id(&self) -> &str { "plaintext" } - fn extensions(&self) -> &[&str] { &["txt", "text"] } - fn content_types(&self) -> &[&str] { &["text/plain"] } -} - -/// Handles CSV files (`.csv`). -#[derive(Debug, Clone)] -pub struct CsvHandler; - -impl Handler for CsvHandler { - fn id(&self) -> &str { "csv" } - fn extensions(&self) -> &[&str] { &["csv"] } - fn content_types(&self) -> &[&str] { &["text/csv"] } -} - -/// Handles JSON files (`.json`). -#[derive(Debug, Clone)] -pub struct JsonHandler; - -impl Handler for JsonHandler { - fn id(&self) -> &str { "json" } - fn extensions(&self) -> &[&str] { &["json"] } - fn content_types(&self) -> &[&str] { &["application/json"] } -} - -/// Handles HTML files (`.html`, `.htm`). -#[cfg(feature = "html")] -#[derive(Debug, Clone)] -pub struct HtmlHandler; - -#[cfg(feature = "html")] -impl Handler for HtmlHandler { - fn id(&self) -> &str { "html" } - fn extensions(&self) -> &[&str] { &["html", "htm"] } - fn content_types(&self) -> &[&str] { &["text/html"] } -} - -/// Handles PDF files (`.pdf`). -#[cfg(feature = "pdf")] -#[derive(Debug, Clone)] -pub struct PdfHandler; - -#[cfg(feature = "pdf")] -impl Handler for PdfHandler { - fn id(&self) -> &str { "pdf" } - fn extensions(&self) -> &[&str] { &["pdf"] } - fn content_types(&self) -> &[&str] { &["application/pdf"] } -} - -/// Handles DOCX files (`.docx`). -#[cfg(feature = "docx")] -#[derive(Debug, Clone)] -pub struct DocxHandler; - -#[cfg(feature = "docx")] -impl Handler for DocxHandler { - fn id(&self) -> &str { "docx" } - fn extensions(&self) -> &[&str] { &["docx"] } - fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.wordprocessingml.document"] } -} - -/// Handles image files (PNG, JPEG, TIFF). -#[cfg(feature = "image")] -#[derive(Debug, Clone)] -pub struct ImageHandler; - -#[cfg(feature = "image")] -impl Handler for ImageHandler { - fn id(&self) -> &str { "image" } - fn extensions(&self) -> &[&str] { &["jpg", "jpeg", "png", "tiff"] } - fn content_types(&self) -> &[&str] { &["image/jpeg", "image/png", "image/tiff"] } -} - -/// Handles XLSX/XLS spreadsheet files. -#[cfg(feature = "xlsx")] -#[derive(Debug, Clone)] -pub struct XlsxHandler; - -#[cfg(feature = "xlsx")] -impl Handler for XlsxHandler { - fn id(&self) -> &str { "xlsx" } - fn extensions(&self) -> &[&str] { &["xlsx", "xls"] } - fn content_types(&self) -> &[&str] { &["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.ms-excel"] } -} - -/// Handles WAV audio files. -#[derive(Debug, Clone)] -pub struct WavHandler; - -impl Handler for WavHandler { - fn id(&self) -> &str { "wav" } - fn extensions(&self) -> &[&str] { &["wav"] } - fn content_types(&self) -> &[&str] { &["audio/wav", "audio/x-wav"] } -} - -/// Handles MP3 audio files. 
-#[derive(Debug, Clone)] -pub struct Mp3Handler; - -impl Handler for Mp3Handler { - fn id(&self) -> &str { "mp3" } - fn extensions(&self) -> &[&str] { &["mp3"] } - fn content_types(&self) -> &[&str] { &["audio/mpeg"] } -} - -// --------------------------------------------------------------------------- -// FormatHandler enum — closed type erasure -// --------------------------------------------------------------------------- - -/// Closed enum of all supported format handlers. -/// -/// Provides type erasure: `Document<FormatHandler>` can represent -/// content from any supported format in heterogeneous collections. -#[derive(Debug, Clone)] -pub enum FormatHandler { - Plaintext(PlaintextHandler), - Csv(CsvHandler), - Json(JsonHandler), - #[cfg(feature = "html")] - Html(HtmlHandler), - #[cfg(feature = "pdf")] - Pdf(PdfHandler), - #[cfg(feature = "docx")] - Docx(DocxHandler), - #[cfg(feature = "image")] - Image(ImageHandler), - #[cfg(feature = "xlsx")] - Xlsx(XlsxHandler), - Wav(WavHandler), - Mp3(Mp3Handler), -} - -impl Handler for FormatHandler { - fn id(&self) -> &str { - match self { - Self::Plaintext(h) => h.id(), - Self::Csv(h) => h.id(), - Self::Json(h) => h.id(), - #[cfg(feature = "html")] - Self::Html(h) => h.id(), - #[cfg(feature = "pdf")] - Self::Pdf(h) => h.id(), - #[cfg(feature = "docx")] - Self::Docx(h) => h.id(), - #[cfg(feature = "image")] - Self::Image(h) => h.id(), - #[cfg(feature = "xlsx")] - Self::Xlsx(h) => h.id(), - - Self::Wav(h) => h.id(), - Self::Mp3(h) => h.id(), - } - } - - fn extensions(&self) -> &[&str] { - match self { - Self::Plaintext(h) => h.extensions(), - Self::Csv(h) => h.extensions(), - Self::Json(h) => h.extensions(), - #[cfg(feature = "html")] - Self::Html(h) => h.extensions(), - #[cfg(feature = "pdf")] - Self::Pdf(h) => h.extensions(), - #[cfg(feature = "docx")] - Self::Docx(h) => h.extensions(), - #[cfg(feature = "image")] - Self::Image(h) => h.extensions(), - #[cfg(feature = "xlsx")] - Self::Xlsx(h) => h.extensions(), - - Self::Wav(h) => h.extensions(), - Self::Mp3(h) => h.extensions(), - } - } - - fn content_types(&self) -> &[&str] { - match self { - Self::Plaintext(h) => h.content_types(), - Self::Csv(h) => h.content_types(), - Self::Json(h) => h.content_types(), - #[cfg(feature = "html")] - Self::Html(h) => h.content_types(), - #[cfg(feature = "pdf")] - Self::Pdf(h) => h.content_types(), - #[cfg(feature = "docx")] - Self::Docx(h) => h.content_types(), - #[cfg(feature = "image")] - Self::Image(h) => h.content_types(), - #[cfg(feature = "xlsx")] - Self::Xlsx(h) => h.content_types(), - - Self::Wav(h) => h.content_types(), - Self::Mp3(h) => h.content_types(), - } - } -} - -// -- From impls for each concrete handler -> FormatHandler -- - -impl From<PlaintextHandler> for FormatHandler { - fn from(h: PlaintextHandler) -> Self { Self::Plaintext(h) } -} -impl From<CsvHandler> for FormatHandler { - fn from(h: CsvHandler) -> Self { Self::Csv(h) } -} -impl From<JsonHandler> for FormatHandler { - fn from(h: JsonHandler) -> Self { Self::Json(h) } -} -#[cfg(feature = "html")] -impl From<HtmlHandler> for FormatHandler { - fn from(h: HtmlHandler) -> Self { Self::Html(h) } -} -#[cfg(feature = "pdf")] -impl From<PdfHandler> for FormatHandler { - fn from(h: PdfHandler) -> Self { Self::Pdf(h) } -} -#[cfg(feature = "docx")] -impl From<DocxHandler> for FormatHandler { - fn from(h: DocxHandler) -> Self { Self::Docx(h) } -} -#[cfg(feature = "image")] -impl From<ImageHandler> for FormatHandler { - fn from(h: ImageHandler) -> Self { Self::Image(h) } -} -#[cfg(feature = 
"xlsx")] -impl From<XlsxHandler> for FormatHandler { - fn from(h: XlsxHandler) -> Self { Self::Xlsx(h) } -} -impl From<WavHandler> for FormatHandler { - fn from(h: WavHandler) -> Self { Self::Wav(h) } -} -impl From<Mp3Handler> for FormatHandler { - fn from(h: Mp3Handler) -> Self { Self::Mp3(h) } -} - -// --------------------------------------------------------------------------- -// Loader traits -// --------------------------------------------------------------------------- - -/// Loader for text-based formats (plain text, CSV, JSON, HTML). -#[async_trait::async_trait] -pub trait TextLoader: Handler { - /// Strongly-typed parameters for this loader. - type Params: Send; - - /// Parse the content into documents. - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error>; -} - -/// Loader for binary document formats (PDF, DOCX) that produce both -/// text documents and extracted images. -#[async_trait::async_trait] -pub trait BinaryLoader: Handler { - /// Strongly-typed parameters for this loader. - type Params: Send; - - /// Parse the content into documents (text pages and extracted images). - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error>; -} - -/// Loader for image formats (PNG, JPEG, TIFF, etc.). -#[async_trait::async_trait] -pub trait ImageLoader: Handler { - /// Strongly-typed parameters for this loader. - type Params: Send; - - /// Decode the content into image documents. - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error>; -} - -/// Loader for spreadsheet/tabular formats (XLSX). -#[async_trait::async_trait] -pub trait SpreadsheetLoader: Handler { - /// Strongly-typed parameters for this loader. - type Params: Send; - - /// Parse the content into tabular documents. - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error>; -} - -/// Loader for audio formats (WAV, MP3). -#[async_trait::async_trait] -pub trait AudioLoader: Handler { - /// Strongly-typed parameters for this loader. - type Params: Send; - - /// Process the audio content. - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error>; -} diff --git a/crates/nvisy-ingest/src/handler/audio/mod.rs b/crates/nvisy-ingest/src/handler/audio/mod.rs new file mode 100644 index 0000000..5a1fb22 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/audio/mod.rs @@ -0,0 +1,6 @@ +//! Audio format handlers. + +#[cfg(feature = "wav")] +pub mod wav; +#[cfg(feature = "mp3")] +pub mod mp3; diff --git a/crates/nvisy-ingest/src/handler/audio/mp3.rs b/crates/nvisy-ingest/src/handler/audio/mp3.rs new file mode 100644 index 0000000..fea96de --- /dev/null +++ b/crates/nvisy-ingest/src/handler/audio/mp3.rs @@ -0,0 +1,32 @@ +//! MP3 handler (stub — awaiting migration to Loader/Handler pattern). 
+ +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct Mp3Handler; + +#[async_trait::async_trait] +impl Handler for Mp3Handler { + fn document_type(&self) -> DocumentType { + DocumentType::Mp3 + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/audio/wav.rs b/crates/nvisy-ingest/src/handler/audio/wav.rs new file mode 100644 index 0000000..cedaf05 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/audio/wav.rs @@ -0,0 +1,32 @@ +//! WAV handler (stub — awaiting migration to Loader/Handler pattern). + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct WavHandler; + +#[async_trait::async_trait] +impl Handler for WavHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Wav + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/document/docx.rs b/crates/nvisy-ingest/src/handler/document/docx.rs new file mode 100644 index 0000000..90b7d9e --- /dev/null +++ b/crates/nvisy-ingest/src/handler/document/docx.rs @@ -0,0 +1,32 @@ +//! DOCX handler (stub — awaiting migration to Loader/Handler pattern). + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct DocxHandler; + +#[async_trait::async_trait] +impl Handler for DocxHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Docx + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/binary/mod.rs b/crates/nvisy-ingest/src/handler/document/mod.rs similarity index 64% rename from crates/nvisy-ingest/src/binary/mod.rs rename to crates/nvisy-ingest/src/handler/document/mod.rs index 02642a4..9c62d32 100644 --- a/crates/nvisy-ingest/src/binary/mod.rs +++ b/crates/nvisy-ingest/src/handler/document/mod.rs @@ -1,7 +1,6 @@ -//! Binary document loaders (PDF, DOCX). +//! Rich document format handlers. #[cfg(feature = "pdf")] pub mod pdf; - #[cfg(feature = "docx")] pub mod docx; diff --git a/crates/nvisy-ingest/src/handler/document/pdf.rs b/crates/nvisy-ingest/src/handler/document/pdf.rs new file mode 100644 index 0000000..4c8ac68 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/document/pdf.rs @@ -0,0 +1,32 @@ +//! PDF handler (stub — awaiting migration to Loader/Handler pattern). 
+ +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct PdfHandler; + +#[async_trait::async_trait] +impl Handler for PdfHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Pdf + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/encoding.rs b/crates/nvisy-ingest/src/handler/encoding.rs new file mode 100644 index 0000000..c90d92e --- /dev/null +++ b/crates/nvisy-ingest/src/handler/encoding.rs @@ -0,0 +1,25 @@ +//! Character encoding for text-based loaders. + +use nvisy_core::error::Error; + +/// Character encoding used to decode raw bytes before parsing. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum TextEncoding { + /// UTF-8 (the default and by far the most common encoding). + #[default] + Utf8, +} + +impl TextEncoding { + /// Decode raw bytes to a UTF-8 string. + /// + /// `origin` identifies the caller for error messages + /// (e.g. `"json-loader"`). + pub fn decode_bytes(self, bytes: &[u8], origin: &str) -> Result<String, Error> { + match self { + Self::Utf8 => String::from_utf8(bytes.to_vec()).map_err(|e| { + Error::validation(format!("Invalid UTF-8: {e}"), origin) + }), + } + } +} diff --git a/crates/nvisy-ingest/src/handler/image/jpeg.rs b/crates/nvisy-ingest/src/handler/image/jpeg.rs new file mode 100644 index 0000000..9b56f21 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/image/jpeg.rs @@ -0,0 +1,32 @@ +//! JPEG handler (stub — awaiting migration to Loader/Handler pattern). + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct JpegHandler; + +#[async_trait::async_trait] +impl Handler for JpegHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Jpeg + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/image/mod.rs b/crates/nvisy-ingest/src/handler/image/mod.rs new file mode 100644 index 0000000..46bcfdd --- /dev/null +++ b/crates/nvisy-ingest/src/handler/image/mod.rs @@ -0,0 +1,6 @@ +//! Image format handlers and shared decode helper. + +#[cfg(feature = "jpeg")] +pub mod jpeg; +#[cfg(feature = "png")] +pub mod png; diff --git a/crates/nvisy-ingest/src/handler/image/png.rs b/crates/nvisy-ingest/src/handler/image/png.rs new file mode 100644 index 0000000..a62e210 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/image/png.rs @@ -0,0 +1,32 @@ +//! PNG handler (stub — awaiting migration to Loader/Handler pattern). 
+ +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct PngHandler; + +#[async_trait::async_trait] +impl Handler for PngHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Png + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/mod.rs b/crates/nvisy-ingest/src/handler/mod.rs new file mode 100644 index 0000000..47a44d0 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/mod.rs @@ -0,0 +1,87 @@ +//! Loader and handler traits. +//! +//! A [`Loader`] validates and parses raw content, producing a +//! [`Document`] containing the corresponding [`Handler`]. The handler +//! holds the loaded data and provides methods to read and manipulate it. +//! +//! Each handler defines its own span types and exposes them as async +//! streams via [`Handler::view_spans`] and [`Handler::edit_spans`]. + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::document::Document; + +pub mod encoding; +pub mod span; + +pub mod text; +pub mod document; +pub mod image; +pub mod tabular; +pub mod audio; + +pub use encoding::TextEncoding; +pub use span::{Span, SpanEdit}; + +pub use text::txt_handler::{TxtData, TxtHandler, TxtSpan}; +pub use text::txt_loader::{TxtLoader, TxtParams}; +pub use text::csv_handler::{CsvData, CsvHandler, CsvSpan}; +pub use text::csv_loader::{CsvLoader, CsvParams}; +pub use text::json_handler::{ + JsonData, JsonHandler, JsonIndent, JsonPath, +}; +pub use text::json_loader::{JsonParams, JsonLoader}; + +/// Trait implemented by all format handlers. +/// +/// A handler holds loaded, validated content and provides methods to +/// read and manipulate it. Handlers are produced by their corresponding +/// [`Loader`]. +/// +/// Each handler defines its own span addressing scheme ([`SpanId`](Self::SpanId)) +/// and data type ([`SpanData`](Self::SpanData)). Pipeline actions +/// constrain `SpanData` to express what they need (e.g. `AsRef<str>` +/// for text scanning). +#[async_trait::async_trait] +pub trait Handler: Send + Sync + 'static { + /// The document type this handler represents. + fn document_type(&self) -> DocumentType; + + /// Strongly-typed identifier for a span within this handler. + type SpanId: Send + Sync + Clone + 'static; + /// The data type carried by each span. + type SpanData: Send + 'static; + + /// Return the loaded content as an async stream of spans. + async fn view_spans(&self) -> SpanStream<'_, Self::SpanId, Self::SpanData>; + + /// Apply edits from an async stream back to the source structure. + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, Self::SpanId, Self::SpanData>, + ) -> Result<(), Error>; +} + +/// Trait implemented by format loaders. +/// +/// A loader validates and parses raw content, producing a +/// [`Document`] with the corresponding handler. +#[async_trait::async_trait] +pub trait Loader: Send + Sync + 'static { + /// The handler type this loader produces. + type Handler: Handler; + /// Strongly-typed parameters for loading. 
+ type Params: Send; + + /// Validate and parse the content, returning a document with + /// the loaded handler. + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<Self::Handler>>, Error>; +} diff --git a/crates/nvisy-ingest/src/handler/span.rs b/crates/nvisy-ingest/src/handler/span.rs new file mode 100644 index 0000000..4bbd276 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/span.rs @@ -0,0 +1,19 @@ +//! Span types for content traversal and editing. + +/// A span of content tagged with its origin in the source structure. +#[derive(Debug, Clone)] +pub struct Span<Id, Data> { + /// Identifier locating this span within the handler's data model. + pub id: Id, + /// The content of this span. + pub data: Data, +} + +/// An edit to apply to a specific span. +#[derive(Debug, Clone)] +pub struct SpanEdit<Id, Data> { + /// Which span to edit (must match a `Span::id`). + pub id: Id, + /// Replacement data for this span. + pub data: Data, +} diff --git a/crates/nvisy-ingest/src/handler/tabular/mod.rs b/crates/nvisy-ingest/src/handler/tabular/mod.rs new file mode 100644 index 0000000..bb7cea7 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/tabular/mod.rs @@ -0,0 +1,4 @@ +//! Tabular/spreadsheet format handlers. + +#[cfg(feature = "xlsx")] +pub mod xlsx; diff --git a/crates/nvisy-ingest/src/handler/tabular/xlsx.rs b/crates/nvisy-ingest/src/handler/tabular/xlsx.rs new file mode 100644 index 0000000..c4c3bad --- /dev/null +++ b/crates/nvisy-ingest/src/handler/tabular/xlsx.rs @@ -0,0 +1,32 @@ +//! XLSX handler (stub — awaiting migration to Loader/Handler pattern). + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct XlsxHandler; + +#[async_trait::async_trait] +impl Handler for XlsxHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Xlsx + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/text/csv_handler.rs b/crates/nvisy-ingest/src/handler/text/csv_handler.rs new file mode 100644 index 0000000..c270ea2 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/csv_handler.rs @@ -0,0 +1,403 @@ +//! CSV handler — holds parsed CSV content and provides span-based +//! access via [`Handler`]. +//! +//! The handler stores the parsed rows (and optional headers) together +//! with the detected delimiter so the file can be reconstructed after +//! edits. +//! +//! # Span model +//! +//! [`Handler::view_spans`] yields one [`Span`] per cell. If headers +//! are present, header cells are emitted first (with +//! [`CsvSpan::header`] set to `true`), followed by data cells in +//! row-major order. +//! +//! [`Handler::edit_spans`] replaces cell content at the given +//! (row, col) position. Header cells can also be edited. + +use futures::StreamExt; + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::{Handler, Span}; + +/// Cell address within a CSV document. 
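+///
+/// For example, with headers `name,age`, the value `"30"` in the first
+/// data row is addressed as `CsvSpan::cell(0, 1, "age")`, while the
+/// header text itself is addressed as `CsvSpan::header_cell(1, "age")`.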
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct CsvSpan { + /// 0-based row index (within data rows, not counting the header). + pub row: usize, + /// 0-based column index. + pub col: usize, + /// `true` when this span addresses a header cell rather than a + /// data cell. + pub header: bool, + /// Column name (from the header row) or column index as a string + /// when no headers are present. + pub key: String, +} + +impl CsvSpan { + /// Address a data cell with a column key. + pub fn cell(row: usize, col: usize, key: impl Into<String>) -> Self { + Self { + row, + col, + header: false, + key: key.into(), + } + } + + /// Address a header cell. + pub fn header_cell(col: usize, key: impl Into<String>) -> Self { + Self { + row: 0, + col, + header: true, + key: key.into(), + } + } +} + +/// Parsed CSV content. +#[derive(Debug, Clone)] +pub struct CsvData { + /// Column headers, if present. + pub headers: Option<Vec<String>>, + /// Data rows (excluding the header row). + pub rows: Vec<Vec<String>>, + /// Field delimiter byte (e.g. `b','`, `b'\t'`, `b';'`). + pub delimiter: u8, + /// Whether the original source had a trailing newline. + pub trailing_newline: bool, +} + +/// Handler for loaded CSV content. +#[derive(Debug)] +pub struct CsvHandler { + pub(crate) data: CsvData, +} + +#[async_trait::async_trait] +impl Handler for CsvHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Csv + } + + type SpanId = CsvSpan; + type SpanData = String; + + async fn view_spans(&self) -> SpanStream<'_, CsvSpan, String> { + SpanStream::new(futures::stream::iter(CsvSpanIter::new(&self.data))) + } + + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, CsvSpan, String>, + ) -> Result<(), Error> { + let edits: Vec<_> = edits.collect().await; + for edit in edits { + if edit.id.header { + let headers = self.data.headers.as_mut().ok_or_else(|| { + Error::validation("no headers to edit", "csv-handler") + })?; + let cell = headers.get_mut(edit.id.col).ok_or_else(|| { + Error::validation( + format!("header column {} out of bounds", edit.id.col), + "csv-handler", + ) + })?; + *cell = edit.data; + } else { + let row = self.data.rows.get_mut(edit.id.row).ok_or_else(|| { + Error::validation( + format!("row {} out of bounds", edit.id.row), + "csv-handler", + ) + })?; + let cell = row.get_mut(edit.id.col).ok_or_else(|| { + Error::validation( + format!( + "column {} out of bounds in row {}", + edit.id.col, edit.id.row, + ), + "csv-handler", + ) + })?; + *cell = edit.data; + } + } + Ok(()) + } +} + +impl CsvHandler { + /// Column headers, if present. + pub fn headers(&self) -> Option<&[String]> { + self.data.headers.as_deref() + } + + /// All data rows. + pub fn rows(&self) -> &[Vec<String>] { + &self.data.rows + } + + /// A specific cell by (row, col). + pub fn cell(&self, row: usize, col: usize) -> Option<&str> { + self.data + .rows + .get(row) + .and_then(|r| r.get(col)) + .map(|s| s.as_str()) + } + + /// Number of data rows (excluding the header). + pub fn row_count(&self) -> usize { + self.data.rows.len() + } + + /// Detected field delimiter. + pub fn delimiter(&self) -> u8 { + self.data.delimiter + } + + /// Whether the original source had a trailing newline. + pub fn trailing_newline(&self) -> bool { + self.data.trailing_newline + } + + /// Consume the handler and return the inner [`CsvData`]. + pub fn into_data(self) -> CsvData { + self.data + } +} + +/// Iterator over cells of a CSV document. +/// +/// Yields header cells first (if present), then data cells in +/// row-major order. 
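+///
+/// For a two-column file with headers and two data rows this yields
+/// six spans: both header cells, then the cells of row 0, then the
+/// cells of row 1 (see `view_spans_with_headers` below).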
+struct CsvSpanIter<'a> { + headers: Option<&'a [String]>, + rows: &'a [Vec<String>], + /// Current position: `None` = in headers, `Some(row)` = in data. + phase: CsvIterPhase, + col: usize, +} + +enum CsvIterPhase { + Headers, + Data(usize), +} + +impl<'a> CsvSpanIter<'a> { + fn new(data: &'a CsvData) -> Self { + let phase = if data.headers.is_some() { + CsvIterPhase::Headers + } else { + CsvIterPhase::Data(0) + }; + Self { + headers: data.headers.as_deref(), + rows: &data.rows, + phase, + col: 0, + } + } +} + +impl<'a> Iterator for CsvSpanIter<'a> { + type Item = Span<CsvSpan, String>; + + fn next(&mut self) -> Option<Self::Item> { + loop { + match &self.phase { + CsvIterPhase::Headers => { + let headers = self.headers?; + if let Some(value) = headers.get(self.col) { + let col = self.col; + self.col += 1; + return Some(Span { + id: CsvSpan::header_cell(col, value.clone()), + data: value.clone(), + }); + } + self.phase = CsvIterPhase::Data(0); + self.col = 0; + } + CsvIterPhase::Data(row) => { + let row_idx = *row; + let row_data = self.rows.get(row_idx)?; + if let Some(value) = row_data.get(self.col) { + let col = self.col; + self.col += 1; + let key = self + .headers + .and_then(|h| h.get(col)) + .cloned() + .unwrap_or_else(|| col.to_string()); + return Some(Span { + id: CsvSpan::cell(row_idx, col, key), + data: value.clone(), + }); + } + self.phase = CsvIterPhase::Data(row_idx + 1); + self.col = 0; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::SpanEdit; + use futures::StreamExt; + + fn handler_with_headers( + headers: Vec<&str>, + rows: Vec<Vec<&str>>, + ) -> CsvHandler { + CsvHandler { + data: CsvData { + headers: Some(headers.into_iter().map(String::from).collect()), + rows: rows + .into_iter() + .map(|r| r.into_iter().map(String::from).collect()) + .collect(), + delimiter: b',', + trailing_newline: true, + }, + } + } + + fn handler_no_headers(rows: Vec<Vec<&str>>) -> CsvHandler { + CsvHandler { + data: CsvData { + headers: None, + rows: rows + .into_iter() + .map(|r| r.into_iter().map(String::from).collect()) + .collect(), + delimiter: b',', + trailing_newline: true, + }, + } + } + + #[tokio::test] + async fn view_spans_with_headers() { + let h = handler_with_headers( + vec!["name", "age"], + vec![vec!["Alice", "30"], vec!["Bob", "25"]], + ); + let spans: Vec<_> = h.view_spans().await.collect().await; + + // 2 header cells + 4 data cells + assert_eq!(spans.len(), 6); + + // Headers + assert_eq!(spans[0].id, CsvSpan::header_cell(0, "name")); + assert_eq!(spans[0].data, "name"); + assert_eq!(spans[1].id, CsvSpan::header_cell(1, "age")); + assert_eq!(spans[1].data, "age"); + + // Row 0 + assert_eq!(spans[2].id, CsvSpan::cell(0, 0, "name")); + assert_eq!(spans[2].id.key, "name"); + assert_eq!(spans[2].data, "Alice"); + assert_eq!(spans[3].id, CsvSpan::cell(0, 1, "age")); + assert_eq!(spans[3].id.key, "age"); + assert_eq!(spans[3].data, "30"); + + // Row 1 + assert_eq!(spans[4].id, CsvSpan::cell(1, 0, "name")); + assert_eq!(spans[4].data, "Bob"); + assert_eq!(spans[5].id, CsvSpan::cell(1, 1, "age")); + assert_eq!(spans[5].data, "25"); + } + + #[tokio::test] + async fn view_spans_no_headers() { + let h = handler_no_headers(vec![vec!["x", "y"], vec!["1", "2"]]); + let spans: Vec<_> = h.view_spans().await.collect().await; + + assert_eq!(spans.len(), 4); + assert_eq!(spans[0].id, CsvSpan::cell(0, 0, "0")); + assert_eq!(spans[0].id.key, "0"); + assert_eq!(spans[0].data, "x"); + } + + #[tokio::test] + async fn view_spans_empty() { + let h = 
handler_no_headers(vec![]); + let spans: Vec<_> = h.view_spans().await.collect().await; + assert!(spans.is_empty()); + } + + #[tokio::test] + async fn edit_spans_data_cell() { + let mut h = handler_with_headers( + vec!["ssn"], + vec![vec!["123-45-6789"]], + ); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: CsvSpan::cell(0, 0, "ssn"), + data: "[REDACTED]".into(), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.cell(0, 0), Some("[REDACTED]")); + } + + #[tokio::test] + async fn edit_spans_header_cell() { + let mut h = handler_with_headers( + vec!["secret_field"], + vec![vec!["value"]], + ); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: CsvSpan::header_cell(0, "secret_field"), + data: "redacted".into(), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.headers(), Some(["redacted".to_string()].as_slice())); + } + + #[tokio::test] + async fn edit_spans_row_out_of_bounds() { + let mut h = handler_no_headers(vec![vec!["a"]]); + let err = h + .edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: CsvSpan::cell(5, 0, "0"), + data: "x".into(), + }, + ]))) + .await + .unwrap_err(); + assert!(err.to_string().contains("out of bounds")); + } + + #[tokio::test] + async fn edit_spans_col_out_of_bounds() { + let mut h = handler_no_headers(vec![vec!["a"]]); + let err = h + .edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: CsvSpan::cell(0, 5, "5"), + data: "x".into(), + }, + ]))) + .await + .unwrap_err(); + assert!(err.to_string().contains("out of bounds")); + } +} diff --git a/crates/nvisy-ingest/src/handler/text/csv_loader.rs b/crates/nvisy-ingest/src/handler/text/csv_loader.rs new file mode 100644 index 0000000..40c7d45 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/csv_loader.rs @@ -0,0 +1,234 @@ +//! CSV loader — validates and parses raw CSV content into a +//! [`Document<CsvHandler>`]. +//! +//! The loader auto-detects the field delimiter (comma, tab, semicolon, +//! pipe) by inspecting the first line. + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{CsvData, CsvHandler, Loader, TextEncoding}; + +/// Parameters for [`CsvLoader`]. +#[derive(Debug)] +pub struct CsvParams { + /// Character encoding of the input bytes. + pub encoding: TextEncoding, + /// Whether the first row contains column headers. + /// Defaults to `true`. + pub has_headers: bool, + /// Override the field delimiter. If `None`, the loader will + /// auto-detect from the first line. + pub delimiter: Option<u8>, +} + +impl Default for CsvParams { + fn default() -> Self { + Self { + encoding: TextEncoding::Utf8, + has_headers: true, + delimiter: None, + } + } +} + +/// Loader that validates and parses CSV files. +/// +/// Produces a single [`Document<CsvHandler>`] per input. 
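+///
+/// # Examples
+///
+/// A minimal sketch of loading and inspecting a CSV file, mirroring
+/// the tests below (marked `ignore`, so it is not compiled as a
+/// doctest):
+///
+/// ```ignore
+/// use bytes::Bytes;
+/// use nvisy_core::io::ContentData;
+/// use nvisy_core::path::ContentSource;
+///
+/// let content = ContentData::new(
+///     ContentSource::new(),
+///     Bytes::from("name,age\nAlice,30\n".to_owned()),
+/// );
+/// let docs = CsvLoader.load(&content, &CsvParams::default()).await?;
+/// let handler = docs[0].handler();
+/// assert_eq!(handler.cell(0, 1), Some("30"));
+/// ```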
+#[derive(Debug)] +pub struct CsvLoader; + +#[async_trait::async_trait] +impl Loader for CsvLoader { + type Handler = CsvHandler; + type Params = CsvParams; + + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<CsvHandler>>, Error> { + let raw = content.to_bytes(); + let text = params.encoding.decode_bytes(&raw, "csv-loader")?; + let trailing_newline = text.ends_with('\n'); + let delimiter = params + .delimiter + .unwrap_or_else(|| detect_delimiter(&text)); + + let mut reader = csv::ReaderBuilder::new() + .has_headers(params.has_headers) + .delimiter(delimiter) + .flexible(true) + .from_reader(text.as_bytes()); + + let headers = if params.has_headers { + let hdr = reader.headers().map_err(|e| { + Error::validation(format!("CSV header error: {e}"), "csv-loader") + })?; + Some(hdr.iter().map(String::from).collect()) + } else { + None + }; + + let mut rows = Vec::new(); + for result in reader.records() { + let record = result.map_err(|e| { + Error::validation(format!("CSV parse error: {e}"), "csv-loader") + })?; + rows.push(record.iter().map(String::from).collect()); + } + + let handler = CsvHandler { + data: CsvData { + headers, + rows, + delimiter, + trailing_newline, + }, + }; + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} + +/// Auto-detect the CSV delimiter by counting candidate characters +/// in the first line. +fn detect_delimiter(text: &str) -> u8 { + let first_line = text.lines().next().unwrap_or(""); + let candidates: &[(u8, char)] = &[ + (b',', ','), + (b'\t', '\t'), + (b';', ';'), + (b'|', '|'), + ]; + candidates + .iter() + .max_by_key(|(_, ch)| first_line.matches(*ch).count()) + .map(|(b, _)| *b) + .unwrap_or(b',') +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::Handler; + use bytes::Bytes; + use futures::StreamExt; + use nvisy_core::path::ContentSource; + use nvisy_ontology::entity::DocumentType; + + fn content_from_str(s: &str) -> ContentData { + ContentData::new(ContentSource::new(), Bytes::from(s.to_owned())) + } + + #[tokio::test] + async fn load_with_headers() { + let content = content_from_str("name,age\nAlice,30\nBob,25\n"); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].document_type(), DocumentType::Csv); + + let h = docs[0].handler(); + assert_eq!(h.headers(), Some(["name", "age"].map(String::from).as_slice())); + assert_eq!(h.row_count(), 2); + assert_eq!(h.cell(0, 0), Some("Alice")); + assert_eq!(h.cell(1, 1), Some("25")); + assert!(h.trailing_newline()); + } + + #[tokio::test] + async fn load_without_headers() { + let params = CsvParams { + has_headers: false, + ..CsvParams::default() + }; + let content = content_from_str("x,y\n1,2\n"); + let docs = CsvLoader.load(&content, ¶ms).await.unwrap(); + + let h = docs[0].handler(); + assert!(h.headers().is_none()); + assert_eq!(h.row_count(), 2); + assert_eq!(h.cell(0, 0), Some("x")); + } + + #[tokio::test] + async fn load_tab_delimited() { + let content = content_from_str("a\tb\n1\t2\n"); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + let h = docs[0].handler(); + assert_eq!(h.delimiter(), b'\t'); + assert_eq!(h.headers(), Some(["a", "b"].map(String::from).as_slice())); + } + + #[tokio::test] + async fn load_semicolon_delimited() { + let content = content_from_str("a;b\n1;2\n"); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + 
assert_eq!(docs[0].handler().delimiter(), b';'); + } + + #[tokio::test] + async fn load_quoted_fields() { + let content = content_from_str("name,bio\n\"Alice\",\"Has a, comma\"\n"); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + let h = docs[0].handler(); + assert_eq!(h.cell(0, 1), Some("Has a, comma")); + } + + #[tokio::test] + async fn load_empty() { + let content = content_from_str(""); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + let h = docs[0].handler(); + assert_eq!(h.row_count(), 0); + } + + #[tokio::test] + async fn load_spans_round_trip() { + let content = content_from_str("name,age\nAlice,30\n"); + let docs = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap(); + let spans: Vec<_> = docs[0].handler().view_spans().await.collect().await; + + // 2 header + 2 data + assert_eq!(spans.len(), 4); + assert_eq!(spans[0].data, "name"); + assert_eq!(spans[1].data, "age"); + assert_eq!(spans[2].data, "Alice"); + assert_eq!(spans[2].id.key, "name"); + assert_eq!(spans[3].data, "30"); + assert_eq!(spans[3].id.key, "age"); + } + + #[tokio::test] + async fn load_invalid_utf8() { + let content = ContentData::new( + ContentSource::new(), + Bytes::from_static(&[0xFF, 0xFE, 0x00]), + ); + let err = CsvLoader + .load(&content, &CsvParams::default()) + .await + .unwrap_err(); + assert!(err.to_string().contains("UTF-8")); + } +} diff --git a/crates/nvisy-ingest/src/handler/text/html.rs b/crates/nvisy-ingest/src/handler/text/html.rs new file mode 100644 index 0000000..318bd46 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/html.rs @@ -0,0 +1,32 @@ +//! HTML handler (stub — awaiting migration to Loader/Handler pattern). + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::Handler; + +#[derive(Debug)] +pub struct HtmlHandler; + +#[async_trait::async_trait] +impl Handler for HtmlHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Html + } + + type SpanId = (); + type SpanData = (); + + async fn view_spans(&self) -> SpanStream<'_, (), ()> { + SpanStream::new(futures::stream::empty()) + } + + async fn edit_spans( + &mut self, + _edits: SpanEditStream<'_, (), ()>, + ) -> Result<(), Error> { + Ok(()) + } +} diff --git a/crates/nvisy-ingest/src/handler/text/json_handler.rs b/crates/nvisy-ingest/src/handler/text/json_handler.rs new file mode 100644 index 0000000..74598cb --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/json_handler.rs @@ -0,0 +1,578 @@ +//! JSON handler — holds parsed JSON content and provides span-based +//! access via [`Handler`]. +//! +//! The handler stores the parsed [`serde_json::Value`] tree together +//! with formatting metadata captured during loading, so the original +//! file can be reconstructed with identical whitespace after edits. +//! +//! # Span model +//! +//! [`Handler::view_spans`] yields one [`Span`] per node in the JSON +//! tree. **Every** value is emitted — leaf scalars, and object keys +//! (as string-valued spans). Each span is addressed by a [`JsonPath`]: +//! an [RFC 6901] JSON Pointer such as `/address/city` plus a flag +//! indicating whether the span targets the key name or the value. +//! +//! [`Handler::edit_spans`] accepts [`SpanEdit`]s. For value spans the +//! value at the pointer is replaced; for key spans the object key is +//! renamed. +//! +//! 
[RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 + +use std::num::NonZeroU32; + +use futures::StreamExt; +use serde::{Deserialize, Serialize}; + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::{Handler, Span}; + +const DEFAULT_INDENT: NonZeroU32 = NonZeroU32::new(2).unwrap(); + +/// [RFC 6901] JSON Pointer identifying a span within a JSON document. +/// +/// `pointer` follows JSON Pointer syntax: `""` for the root, +/// `"/foo/0/bar"` for nested paths. Object keys containing `~` or `/` +/// are escaped as `~0` and `~1` respectively. +/// +/// When `key_of` is `true` the span addresses the **key name** of the +/// object entry at `pointer`, rather than its value. Editing a key +/// span renames the key; editing a value span replaces the value. +/// +/// [RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct JsonPath { + pub pointer: String, + pub key_of: bool, +} + +impl JsonPath { + /// Create a value-addressing path. + pub fn value(pointer: impl Into<String>) -> Self { + Self { + pointer: pointer.into(), + key_of: false, + } + } + + /// Create a key-addressing path. + pub fn key(pointer: impl Into<String>) -> Self { + Self { + pointer: pointer.into(), + key_of: true, + } + } +} + +/// Indentation style detected in the original JSON source. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum JsonIndent { + /// No whitespace between tokens (`{"a":1}`). + Compact, + /// N-space indentation. + Spaces(NonZeroU32), + /// Tab indentation. + Tab, +} + +impl JsonIndent { + /// Two-space indentation. + pub fn two_spaces() -> Self { + Self::Spaces(NonZeroU32::new(2).unwrap()) + } + + /// Four-space indentation. + pub fn four_spaces() -> Self { + Self::Spaces(NonZeroU32::new(4).unwrap()) + } +} + +/// Parsed JSON content together with its original formatting. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JsonData { + pub value: serde_json::Value, + pub indent: JsonIndent, + pub trailing_newline: bool, +} + +impl Default for JsonData { + fn default() -> Self { + Self { + value: serde_json::Value::Null, + indent: JsonIndent::Spaces(DEFAULT_INDENT), + trailing_newline: true, + } + } +} + +/// Handler for loaded JSON content. +/// +/// Provides direct access to the parsed [`serde_json::Value`] tree +/// for reading and mutation, plus [`Handler`] implementation for +/// pipeline-driven span-based editing. +#[derive(Debug)] +pub struct JsonHandler { + pub(crate) data: JsonData, +} + +#[async_trait::async_trait] +impl Handler for JsonHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Json + } + + type SpanId = JsonPath; + type SpanData = serde_json::Value; + + async fn view_spans(&self) -> SpanStream<'_, JsonPath, serde_json::Value> { + SpanStream::new(futures::stream::iter(JsonSpanIter::new(&self.data.value))) + } + + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, JsonPath, serde_json::Value>, + ) -> Result<(), Error> { + let edits: Vec<_> = edits.collect().await; + // Apply value edits first so that pointers remain valid when + // key renames change the path structure. 
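+        // For example, a stream may contain both a value edit at
+        // `/name` and a rename of the `name` key; applying the value
+        // edit first lets the pointer resolve before the key changes
+        // (see the `edit_spans_value_before_key_rename` test below).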
+ for edit in edits.iter().filter(|e| !e.id.key_of) { + let target = + self.data.value.pointer_mut(&edit.id.pointer).ok_or_else(|| { + Error::validation( + format!("JSON pointer not found: {}", edit.id.pointer), + "json-handler", + ) + })?; + *target = edit.data.clone(); + } + for edit in edits.iter().filter(|e| e.id.key_of) { + rename_key(&mut self.data.value, &edit.id.pointer, &edit.data)?; + } + Ok(()) + } +} + +impl JsonHandler { + /// Reference to the root JSON value. + pub fn value(&self) -> &serde_json::Value { + &self.data.value + } + + /// Mutable reference to the root JSON value. + pub fn value_mut(&mut self) -> &mut serde_json::Value { + &mut self.data.value + } + + /// Look up a value by [RFC 6901] JSON Pointer (e.g. `"/a/0/b"`). + /// + /// [RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 + pub fn pointer(&self, pointer: &str) -> Option<&serde_json::Value> { + self.data.value.pointer(pointer) + } + + /// Mutably look up a value by [RFC 6901] JSON Pointer. + /// + /// [RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 + pub fn pointer_mut(&mut self, pointer: &str) -> Option<&mut serde_json::Value> { + self.data.value.pointer_mut(pointer) + } + + /// Replace the entire root value. + pub fn set_value(&mut self, value: serde_json::Value) { + self.data.value = value; + } + + /// Indentation style detected in the original source. + pub fn indent(&self) -> JsonIndent { + self.data.indent + } + + /// Whether the original source had a trailing newline. + pub fn trailing_newline(&self) -> bool { + self.data.trailing_newline + } + + /// Consume the handler and return the inner [`JsonData`]. + pub fn into_data(self) -> JsonData { + self.data + } +} + +/// Rename an object key addressed by `pointer`. +/// +/// `new_name` must be a `Value::String`; the pointer must resolve to +/// an entry inside an object. +fn rename_key( + root: &mut serde_json::Value, + pointer: &str, + new_name: &serde_json::Value, +) -> Result<(), Error> { + let new_key = new_name.as_str().ok_or_else(|| { + Error::validation("key rename requires a string value", "json-handler") + })?; + + let (parent_ptr, old_key) = split_pointer(pointer)?; + + let parent = if parent_ptr.is_empty() { + root as &mut serde_json::Value + } else { + root.pointer_mut(parent_ptr).ok_or_else(|| { + Error::validation( + format!("JSON pointer not found: {parent_ptr}"), + "json-handler", + ) + })? + }; + + let obj = parent.as_object_mut().ok_or_else(|| { + Error::validation( + format!("parent at {parent_ptr} is not an object"), + "json-handler", + ) + })?; + + let value = obj.remove(&old_key).ok_or_else(|| { + Error::validation( + format!("key {old_key:?} not found in object at {parent_ptr}"), + "json-handler", + ) + })?; + + obj.insert(new_key.to_owned(), value); + Ok(()) +} + +/// Split a JSON Pointer into parent pointer and last segment (unescaped). +fn split_pointer(pointer: &str) -> Result<(&str, String), Error> { + let last_slash = pointer.rfind('/').ok_or_else(|| { + Error::validation( + format!("invalid JSON pointer for key rename: {pointer}"), + "json-handler", + ) + })?; + let parent = &pointer[..last_slash]; + let segment = unescape_json_pointer(&pointer[last_slash + 1..]); + Ok((parent, segment)) +} + +/// Unescape a JSON Pointer segment ([RFC 6901]): `~1` → `/`, `~0` → `~`. 
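+///
+/// For example, the segment `"a~1b"` unescapes to the object key
+/// `"a/b"`. The replacement order matters: `~1` is rewritten before
+/// `~0` so that `"~01"` correctly becomes `"~1"` rather than `"/"`.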
+/// +/// [RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 +fn unescape_json_pointer(segment: &str) -> String { + if segment.contains('~') { + segment.replace("~1", "/").replace("~0", "~") + } else { + segment.to_owned() + } +} + +/// Stack frame for iterative JSON tree traversal. +enum IterFrame<'a> { + /// A leaf or unexpanded node to process. + Pending { + value: &'a serde_json::Value, + pointer: String, + }, + /// A key span to yield before descending into its value. + KeySpan { + value: &'a serde_json::Value, + pointer: String, + key: String, + }, + /// An object whose entries are being yielded. + Object(String, serde_json::map::Iter<'a>), + /// An array whose elements are being yielded. + Array(String, std::iter::Enumerate<std::slice::Iter<'a, serde_json::Value>>), +} + +/// Stack-based depth-first iterator over a JSON tree. +/// +/// Yields one [`Span`] per leaf value **and** one per object key. +/// Key spans have [`JsonPath::key_of`] set to `true` and carry the +/// key name as `Value::String`. Objects and arrays are expanded in +/// place without recursion, so arbitrarily deep documents are safe +/// to iterate. +struct JsonSpanIter<'a> { + stack: Vec<IterFrame<'a>>, +} + +impl<'a> JsonSpanIter<'a> { + fn new(root: &'a serde_json::Value) -> Self { + Self { + stack: vec![IterFrame::Pending { + value: root, + pointer: String::new(), + }], + } + } +} + +impl<'a> Iterator for JsonSpanIter<'a> { + type Item = Span<JsonPath, serde_json::Value>; + + fn next(&mut self) -> Option<Self::Item> { + loop { + let frame = self.stack.last_mut()?; + + match frame { + IterFrame::Pending { .. } => { + let IterFrame::Pending { value, pointer } = + self.stack.pop().unwrap() + else { + unreachable!() + }; + match value { + serde_json::Value::Object(map) => { + self.stack.push(IterFrame::Object(pointer, map.iter())); + } + serde_json::Value::Array(arr) => { + self.stack + .push(IterFrame::Array(pointer, arr.iter().enumerate())); + } + leaf => { + return Some(Span { + id: JsonPath::value(pointer), + data: leaf.clone(), + }); + } + } + } + IterFrame::KeySpan { .. } => { + let IterFrame::KeySpan { value, pointer, key } = + self.stack.pop().unwrap() + else { + unreachable!() + }; + // Push the value traversal so it runs after we yield the key. + self.stack.push(IterFrame::Pending { + value, + pointer: pointer.clone(), + }); + return Some(Span { + id: JsonPath::key(&pointer), + data: serde_json::Value::String(key), + }); + } + IterFrame::Object(pointer, iter) => match iter.next() { + Some((key, child)) => { + let child_pointer = + format!("{}/{}", pointer, escape_json_pointer(key)); + self.stack.push(IterFrame::KeySpan { + value: child, + pointer: child_pointer, + key: key.clone(), + }); + } + None => { + self.stack.pop(); + } + }, + IterFrame::Array(pointer, iter) => match iter.next() { + Some((i, child)) => { + let child_pointer = format!("{}/{i}", pointer); + self.stack.push(IterFrame::Pending { + value: child, + pointer: child_pointer, + }); + } + None => { + self.stack.pop(); + } + }, + } + } + } +} + +/// Escape a JSON object key for use in a JSON Pointer ([RFC 6901]). 
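+///
+/// For example, the key `"a/b"` becomes the segment `"a~1b"` and
+/// `"c~d"` becomes `"c~0d"`. `~` is escaped before `/` so the `~`
+/// introduced by `~1` is not escaped a second time.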
+/// +/// [RFC 6901]: https://www.rfc-editor.org/rfc/rfc6901 +fn escape_json_pointer(key: &str) -> String { + if key.contains('~') || key.contains('/') { + key.replace('~', "~0").replace('/', "~1") + } else { + key.to_owned() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::SpanEdit; + use futures::StreamExt; + use serde_json::json; + + fn handler(value: serde_json::Value) -> JsonHandler { + JsonHandler { + data: JsonData { + value, + ..JsonData::default() + }, + } + } + + #[tokio::test] + async fn view_spans_flat_object() { + let h = handler(json!({"name": "Alice", "age": 30})); + let spans: Vec<_> = h.view_spans().await.collect().await; + + // BTreeMap (alphabetical): age before name. + // Each key emits a key span followed by a value span. + assert_eq!(spans.len(), 4); + assert_eq!(spans[0].id, JsonPath::key("/age")); + assert_eq!(spans[0].data, json!("age")); + assert_eq!(spans[1].id, JsonPath::value("/age")); + assert_eq!(spans[1].data, json!(30)); + assert_eq!(spans[2].id, JsonPath::key("/name")); + assert_eq!(spans[2].data, json!("name")); + assert_eq!(spans[3].id, JsonPath::value("/name")); + assert_eq!(spans[3].data, json!("Alice")); + } + + #[tokio::test] + async fn view_spans_nested() { + let h = handler(json!({"a": {"b": [1, "two", null]}})); + let spans: Vec<_> = h.view_spans().await.collect().await; + + // key "a", key "b", values 0/1/2 + assert_eq!(spans.len(), 5); + assert_eq!(spans[0].id, JsonPath::key("/a")); + assert_eq!(spans[1].id, JsonPath::key("/a/b")); + assert_eq!(spans[2].id, JsonPath::value("/a/b/0")); + assert_eq!(spans[2].data, json!(1)); + assert_eq!(spans[3].id, JsonPath::value("/a/b/1")); + assert_eq!(spans[3].data, json!("two")); + assert_eq!(spans[4].id, JsonPath::value("/a/b/2")); + assert_eq!(spans[4].data, json!(null)); + } + + #[tokio::test] + async fn view_spans_key_escaping() { + let h = handler(json!({"a/b": "x", "c~d": "y"})); + let spans: Vec<_> = h.view_spans().await.collect().await; + + // key span, value span, key span, value span + assert_eq!(spans.len(), 4); + assert_eq!(spans[0].id, JsonPath::key("/a~1b")); + assert_eq!(spans[0].data, json!("a/b")); + assert_eq!(spans[1].id, JsonPath::value("/a~1b")); + assert_eq!(spans[1].data, json!("x")); + assert_eq!(spans[2].id, JsonPath::key("/c~0d")); + assert_eq!(spans[2].data, json!("c~d")); + assert_eq!(spans[3].id, JsonPath::value("/c~0d")); + assert_eq!(spans[3].data, json!("y")); + } + + #[tokio::test] + async fn view_spans_empty_object() { + let h = handler(json!({})); + let spans: Vec<_> = h.view_spans().await.collect().await; + assert!(spans.is_empty()); + } + + #[tokio::test] + async fn view_spans_scalar_root() { + let h = handler(json!("hello")); + let spans: Vec<_> = h.view_spans().await.collect().await; + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].id, JsonPath::value("")); + assert_eq!(spans[0].data, json!("hello")); + } + + #[tokio::test] + async fn edit_spans_replace_value() { + let mut h = handler(json!({"ssn": "123-45-6789"})); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::value("/ssn"), + data: json!(null), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.value(), &json!({"ssn": null})); + } + + #[tokio::test] + async fn edit_spans_rename_key() { + let mut h = handler(json!({"John Smith": {"age": 30}})); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::key("/John Smith"), + data: json!("[REDACTED]"), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.value(), 
&json!({"[REDACTED]": {"age": 30}})); + } + + #[tokio::test] + async fn edit_spans_rename_nested_key() { + let mut h = handler(json!({"a": {"secret_field": 42}})); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::key("/a/secret_field"), + data: json!("redacted"), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.value(), &json!({"a": {"redacted": 42}})); + } + + #[tokio::test] + async fn edit_spans_rename_key_requires_string() { + let mut h = handler(json!({"a": 1})); + let err = h + .edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::key("/a"), + data: json!(42), + }, + ]))) + .await + .unwrap_err(); + assert!(err.to_string().contains("string")); + } + + #[tokio::test] + async fn edit_spans_bad_pointer() { + let mut h = handler(json!({"a": 1})); + let err = h + .edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::value("/nonexistent"), + data: json!(null), + }, + ]))) + .await + .unwrap_err(); + assert!(err.to_string().contains("not found")); + } + + #[tokio::test] + async fn edit_spans_value_before_key_rename() { + let mut h = handler(json!({"name": "Alice"})); + // Key rename listed first, but value edit must apply first + // (while /name still exists) before the key is renamed. + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: JsonPath::key("/name"), + data: json!("[REDACTED]"), + }, + SpanEdit { + id: JsonPath::value("/name"), + data: json!("***"), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.value(), &json!({"[REDACTED]": "***"})); + } +} diff --git a/crates/nvisy-ingest/src/handler/text/json_loader.rs b/crates/nvisy-ingest/src/handler/text/json_loader.rs new file mode 100644 index 0000000..5baf657 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/json_loader.rs @@ -0,0 +1,186 @@ +//! JSON loader — validates and parses raw JSON content into a +//! [`Document<JsonHandler>`]. +//! +//! The loader detects the indentation style and trailing-newline +//! convention of the source file so that [`JsonData`] preserves +//! whitespace for round-trip fidelity. + +use std::num::NonZeroU32; + +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{ + JsonData, JsonHandler, JsonIndent, Loader, TextEncoding, +}; + +/// Parameters for [`JsonLoader`]. +#[derive(Debug, Default)] +pub struct JsonParams { + /// Character encoding of the input bytes. + pub encoding: TextEncoding, +} + +/// Loader that validates and parses JSON files. +/// +/// Produces a single [`Document<JsonHandler>`] per input. The +/// loaded handler stores the parsed [`serde_json::Value`] tree +/// together with formatting metadata for round-trip fidelity. 
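+///
+/// # Examples
+///
+/// A minimal sketch of loading JSON and reading a value back by
+/// pointer, mirroring the tests below (marked `ignore`, so it is not
+/// compiled as a doctest):
+///
+/// ```ignore
+/// use bytes::Bytes;
+/// use nvisy_core::io::ContentData;
+/// use nvisy_core::path::ContentSource;
+///
+/// let content = ContentData::new(
+///     ContentSource::new(),
+///     Bytes::from(r#"{"name": "Alice", "age": 30}"#.to_owned()),
+/// );
+/// let docs = JsonLoader.load(&content, &JsonParams::default()).await?;
+/// let handler = docs[0].handler();
+/// assert_eq!(handler.pointer("/age"), Some(&serde_json::json!(30)));
+/// ```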
+#[derive(Debug)] +pub struct JsonLoader; + +#[async_trait::async_trait] +impl Loader for JsonLoader { + type Handler = JsonHandler; + type Params = JsonParams; + + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<JsonHandler>>, Error> { + let raw = content.to_bytes(); + let text = params.encoding.decode_bytes(&raw, "json-loader")?; + let (indent, trailing_newline) = detect_formatting(&text); + + let value: serde_json::Value = serde_json::from_str(&text).map_err(|e| { + Error::validation(format!("Invalid JSON: {e}"), "json-loader") + })?; + + let handler = JsonHandler { + data: JsonData { + value, + indent, + trailing_newline, + }, + }; + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} + +/// Detect indentation style and trailing newline from raw JSON source. +/// +/// Inspects the first indented line to determine the whitespace +/// convention. Falls back to [`JsonIndent::Compact`] when no +/// indentation is present (single-line JSON). +fn detect_formatting(source: &str) -> (JsonIndent, bool) { + let trailing_newline = source.ends_with('\n'); + + let indent = source + .lines() + .find_map(|line| { + let stripped = line.trim_start(); + if stripped.len() == line.len() { + return None; + } + let ws = &line[..line.len() - stripped.len()]; + if ws.starts_with('\t') { + Some(JsonIndent::Tab) + } else { + let n = u32::try_from(ws.len()).unwrap_or(u32::MAX); + Some(JsonIndent::Spaces( + NonZeroU32::new(n).unwrap_or(NonZeroU32::new(2).unwrap()), + )) + } + }) + .unwrap_or(JsonIndent::Compact); + + (indent, trailing_newline) +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + use nvisy_core::path::ContentSource; + use nvisy_ontology::entity::DocumentType; + use serde_json::json; + + fn content_from_str(s: &str) -> ContentData { + ContentData::new(ContentSource::new(), Bytes::from(s.to_owned())) + } + + #[tokio::test] + async fn load_simple_object() { + let content = content_from_str(r#"{"name": "Alice", "age": 30}"#); + let docs = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap(); + + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].document_type(), DocumentType::Json); + + let handler = docs[0].handler(); + assert_eq!(handler.value(), &json!({"name": "Alice", "age": 30})); + } + + #[tokio::test] + async fn load_detects_compact_formatting() { + let content = content_from_str(r#"{"a":1}"#); + let docs = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap(); + let h = docs[0].handler(); + assert_eq!(h.indent(), JsonIndent::Compact); + assert!(!h.trailing_newline()); + } + + #[tokio::test] + async fn load_detects_two_space_indent() { + let content = content_from_str("{\n \"a\": 1\n}\n"); + let docs = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap(); + let h = docs[0].handler(); + assert_eq!(h.indent(), JsonIndent::two_spaces()); + assert!(h.trailing_newline()); + } + + #[tokio::test] + async fn load_detects_four_space_indent() { + let content = content_from_str("{\n \"a\": 1\n}\n"); + let docs = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap(); + assert_eq!(docs[0].handler().indent(), JsonIndent::four_spaces()); + } + + #[tokio::test] + async fn load_detects_tab_indent() { + let content = content_from_str("{\n\t\"a\": 1\n}\n"); + let docs = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap(); + assert_eq!(docs[0].handler().indent(), JsonIndent::Tab); + } + + #[tokio::test] + async fn 
load_invalid_utf8() { + let content = ContentData::new( + ContentSource::new(), + Bytes::from_static(&[0xFF, 0xFE, 0x00]), + ); + let err = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap_err(); + assert!(err.to_string().contains("UTF-8")); + } + + #[tokio::test] + async fn load_invalid_json() { + let content = content_from_str("{not json}"); + let err = JsonLoader + .load(&content, &JsonParams::default()) + .await + .unwrap_err(); + assert!(err.to_string().contains("JSON")); + } +} diff --git a/crates/nvisy-ingest/src/handler/text/mod.rs b/crates/nvisy-ingest/src/handler/text/mod.rs new file mode 100644 index 0000000..22b6542 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/mod.rs @@ -0,0 +1,10 @@ +//! Text-based format handlers. + +pub mod txt_handler; +pub mod txt_loader; +pub mod csv_handler; +pub mod csv_loader; +pub mod json_handler; +pub mod json_loader; +#[cfg(feature = "html")] +pub mod html; diff --git a/crates/nvisy-ingest/src/handler/text/txt_handler.rs b/crates/nvisy-ingest/src/handler/text/txt_handler.rs new file mode 100644 index 0000000..3e67c8b --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/txt_handler.rs @@ -0,0 +1,214 @@ +//! Plain-text handler — holds loaded text content and provides +//! span-based access via [`Handler`]. +//! +//! The handler stores the text as a vector of lines together with a +//! trailing-newline flag so the original file can be reconstructed +//! byte-for-byte after edits. +//! +//! # Span model +//! +//! [`Handler::view_spans`] yields one [`Span`] per line. Each span +//! is addressed by a [`TxtSpan`] (0-based line index) and carries the +//! line content as a `String`. +//! +//! [`Handler::edit_spans`] replaces the content of lines at the given +//! indices. + +use futures::StreamExt; + +use nvisy_core::error::Error; +use nvisy_ontology::entity::DocumentType; + +use crate::document::edit_stream::SpanEditStream; +use crate::document::view_stream::SpanStream; +use crate::handler::{Handler, Span}; + +/// 0-based line index identifying a span within a plain-text document. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct TxtSpan(pub usize); + +/// Parsed plain-text content. +#[derive(Debug, Clone)] +pub struct TxtData { + pub lines: Vec<String>, + pub trailing_newline: bool, +} + +/// Handler for loaded plain-text content. +/// +/// Each line is independently addressable via [`TxtSpan`]. +#[derive(Debug)] +pub struct TxtHandler { + pub(crate) data: TxtData, +} + +#[async_trait::async_trait] +impl Handler for TxtHandler { + fn document_type(&self) -> DocumentType { + DocumentType::Txt + } + + type SpanId = TxtSpan; + type SpanData = String; + + async fn view_spans(&self) -> SpanStream<'_, TxtSpan, String> { + SpanStream::new(futures::stream::iter(TxtSpanIter { + lines: &self.data.lines, + index: 0, + })) + } + + async fn edit_spans( + &mut self, + edits: SpanEditStream<'_, TxtSpan, String>, + ) -> Result<(), Error> { + let edits: Vec<_> = edits.collect().await; + for edit in edits { + let line = self.data.lines.get_mut(edit.id.0).ok_or_else(|| { + Error::validation( + format!("line index out of bounds: {}", edit.id.0), + "txt-handler", + ) + })?; + *line = edit.data; + } + Ok(()) + } +} + +impl TxtHandler { + /// All lines in the document. + pub fn lines(&self) -> &[String] { + &self.data.lines + } + + /// A specific line by 0-based index. 
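+    ///
+    /// Returns `None` when the index is past the last line.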
+ pub fn line(&self, index: usize) -> Option<&str> { + self.data.lines.get(index).map(|s| s.as_str()) + } + + /// Whether the original source had a trailing newline. + pub fn trailing_newline(&self) -> bool { + self.data.trailing_newline + } + + /// Total number of lines. + pub fn line_count(&self) -> usize { + self.data.lines.len() + } + + /// Consume the handler and return the inner [`TxtData`]. + pub fn into_data(self) -> TxtData { + self.data + } +} + +/// Iterator over lines of a plain-text document. +struct TxtSpanIter<'a> { + lines: &'a [String], + index: usize, +} + +impl<'a> Iterator for TxtSpanIter<'a> { + type Item = Span<TxtSpan, String>; + + fn next(&mut self) -> Option<Self::Item> { + let line = self.lines.get(self.index)?; + let span = Span { + id: TxtSpan(self.index), + data: line.clone(), + }; + self.index += 1; + Some(span) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let remaining = self.lines.len() - self.index; + (remaining, Some(remaining)) + } +} + +impl<'a> ExactSizeIterator for TxtSpanIter<'a> {} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::SpanEdit; + use futures::{Stream, StreamExt}; + + fn handler(text: &str) -> TxtHandler { + let trailing_newline = text.ends_with('\n'); + let lines = text.lines().map(String::from).collect(); + TxtHandler { + data: TxtData { + lines, + trailing_newline, + }, + } + } + + #[tokio::test] + async fn view_spans_multiline() { + let h = handler("hello\nworld\n"); + let spans: Vec<_> = h.view_spans().await.collect().await; + + assert_eq!(spans.len(), 2); + assert_eq!(spans[0].id, TxtSpan(0)); + assert_eq!(spans[0].data, "hello"); + assert_eq!(spans[1].id, TxtSpan(1)); + assert_eq!(spans[1].data, "world"); + } + + #[tokio::test] + async fn view_spans_single_line_no_newline() { + let h = handler("no newline"); + let spans: Vec<_> = h.view_spans().await.collect().await; + + assert_eq!(spans.len(), 1); + assert_eq!(spans[0].data, "no newline"); + assert!(!h.trailing_newline()); + } + + #[tokio::test] + async fn view_spans_empty() { + let h = handler(""); + let spans: Vec<_> = h.view_spans().await.collect().await; + assert!(spans.is_empty()); + } + + #[tokio::test] + async fn edit_spans_replace_line() { + let mut h = handler("hello\nworld\n"); + h.edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: TxtSpan(1), + data: "[REDACTED]".into(), + }, + ]))) + .await + .unwrap(); + assert_eq!(h.lines(), &["hello", "[REDACTED]"]); + } + + #[tokio::test] + async fn edit_spans_out_of_bounds() { + let mut h = handler("one line"); + let err = h + .edit_spans(SpanEditStream::new(futures::stream::iter(vec![ + SpanEdit { + id: TxtSpan(5), + data: "nope".into(), + }, + ]))) + .await + .unwrap_err(); + assert!(err.to_string().contains("out of bounds")); + } + + #[tokio::test] + async fn view_spans_size_hint() { + let h = handler("a\nb\nc\n"); + let stream = h.view_spans().await; + assert_eq!(stream.size_hint(), (3, Some(3))); + } +} diff --git a/crates/nvisy-ingest/src/handler/text/txt_loader.rs b/crates/nvisy-ingest/src/handler/text/txt_loader.rs new file mode 100644 index 0000000..5347415 --- /dev/null +++ b/crates/nvisy-ingest/src/handler/text/txt_loader.rs @@ -0,0 +1,136 @@ +//! Plain-text loader — validates and parses raw text content into a +//! [`Document<TxtHandler>`]. +//! +//! The loader splits the input into lines and records whether the +//! source ended with a trailing newline so the file can be +//! reconstructed after edits. 
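+//!
+//! A minimal sketch, assuming `content` holds the bytes
+//! `"hello\nworld\n"` (mirrors the tests below; marked `ignore`, so it
+//! is not compiled as a doctest):
+//!
+//! ```ignore
+//! let docs = TxtLoader.load(&content, &TxtParams::default()).await?;
+//! let handler = docs[0].handler();
+//! assert_eq!(handler.lines(), &["hello", "world"]);
+//! assert!(handler.trailing_newline());
+//! ```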
+ +use nvisy_core::error::Error; +use nvisy_core::io::ContentData; + +use crate::document::Document; +use crate::handler::{Loader, TxtData, TxtHandler}; + +/// Parameters for [`TxtLoader`]. +#[derive(Debug, Default)] +pub struct TxtParams { + /// Character encoding of the input bytes. + pub encoding: crate::handler::TextEncoding, +} + +/// Loader that validates and parses plain-text files. +/// +/// Produces a single [`Document<TxtHandler>`] per input. +#[derive(Debug)] +pub struct TxtLoader; + +#[async_trait::async_trait] +impl Loader for TxtLoader { + type Handler = TxtHandler; + type Params = TxtParams; + + async fn load( + &self, + content: &ContentData, + params: &Self::Params, + ) -> Result<Vec<Document<TxtHandler>>, Error> { + let raw = content.to_bytes(); + let text = params.encoding.decode_bytes(&raw, "txt-loader")?; + let trailing_newline = text.ends_with('\n'); + let lines = text.lines().map(String::from).collect(); + + let handler = TxtHandler { + data: TxtData { + lines, + trailing_newline, + }, + }; + let doc = Document::new(handler).with_parent(content); + Ok(vec![doc]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::handler::Handler; + use bytes::Bytes; + use futures::StreamExt; + use nvisy_core::path::ContentSource; + use nvisy_ontology::entity::DocumentType; + + fn content_from_str(s: &str) -> ContentData { + ContentData::new(ContentSource::new(), Bytes::from(s.to_owned())) + } + + #[tokio::test] + async fn load_multiline() { + let content = content_from_str("hello\nworld\n"); + let docs = TxtLoader + .load(&content, &TxtParams::default()) + .await + .unwrap(); + + assert_eq!(docs.len(), 1); + assert_eq!(docs[0].document_type(), DocumentType::Txt); + + let h = docs[0].handler(); + assert_eq!(h.lines(), &["hello", "world"]); + assert!(h.trailing_newline()); + } + + #[tokio::test] + async fn load_no_trailing_newline() { + let content = content_from_str("single line"); + let docs = TxtLoader + .load(&content, &TxtParams::default()) + .await + .unwrap(); + + let h = docs[0].handler(); + assert_eq!(h.line_count(), 1); + assert_eq!(h.line(0), Some("single line")); + assert!(!h.trailing_newline()); + } + + #[tokio::test] + async fn load_empty() { + let content = content_from_str(""); + let docs = TxtLoader + .load(&content, &TxtParams::default()) + .await + .unwrap(); + + let h = docs[0].handler(); + assert_eq!(h.line_count(), 0); + assert!(!h.trailing_newline()); + } + + #[tokio::test] + async fn load_preserves_spans_through_round_trip() { + let content = content_from_str("Alice\nBob\nCharlie\n"); + let docs = TxtLoader + .load(&content, &TxtParams::default()) + .await + .unwrap(); + + let spans: Vec<_> = docs[0].handler().view_spans().await.collect().await; + assert_eq!(spans.len(), 3); + assert_eq!(spans[0].data, "Alice"); + assert_eq!(spans[1].data, "Bob"); + assert_eq!(spans[2].data, "Charlie"); + } + + #[tokio::test] + async fn load_invalid_utf8() { + let content = ContentData::new( + ContentSource::new(), + Bytes::from_static(&[0xFF, 0xFE, 0x00]), + ); + let err = TxtLoader + .load(&content, &TxtParams::default()) + .await + .unwrap_err(); + assert!(err.to_string().contains("UTF-8")); + } +} diff --git a/crates/nvisy-ingest/src/image/image.rs b/crates/nvisy-ingest/src/image/image.rs deleted file mode 100644 index 0fac9fc..0000000 --- a/crates/nvisy-ingest/src/image/image.rs +++ /dev/null @@ -1,58 +0,0 @@ -//! Image file loader using the `image` crate. 
- -use bytes::Bytes; -use serde::Deserialize; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::handler::{ImageHandler as ImageHandlerType, FormatHandler, ImageLoader}; - -/// Typed parameters for [`ImageFileLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ImageLoaderParams {} - -/// Decodes image files and returns a [`Document`] with binary data and dimensions. -pub struct ImageFileLoader; - -impl Clone for ImageFileLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl ImageLoader for ImageFileLoader { - type Params = ImageLoaderParams; - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let raw = content.to_bytes(); - let img = image::load_from_memory(&raw).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("Image decode failed: {e}")) - })?; - - let width = img.width(); - let height = img.height(); - - let mime_type = content - .content_type() - .unwrap_or("image/png") - .to_string(); - - let mut doc = Document::new(ImageHandlerType) - .with_data(Bytes::copy_from_slice(&raw), mime_type) - .with_dimensions(width, height); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for ImageFileLoader { - fn id(&self) -> &str { ImageHandlerType.id() } - fn extensions(&self) -> &[&str] { ImageHandlerType.extensions() } - fn content_types(&self) -> &[&str] { ImageHandlerType.content_types() } -} diff --git a/crates/nvisy-ingest/src/image/mod.rs b/crates/nvisy-ingest/src/image/mod.rs deleted file mode 100644 index ebcc729..0000000 --- a/crates/nvisy-ingest/src/image/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! Image file loaders. - -#[cfg(feature = "image")] -pub mod image; diff --git a/crates/nvisy-ingest/src/lib.rs b/crates/nvisy-ingest/src/lib.rs index cde5e1a..ed421f6 100644 --- a/crates/nvisy-ingest/src/lib.rs +++ b/crates/nvisy-ingest/src/lib.rs @@ -4,12 +4,6 @@ pub mod handler; pub mod document; -pub mod element; -pub mod text; -pub mod binary; -pub mod image; -pub mod tabular; -pub mod audio; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-ingest/src/prelude.rs b/crates/nvisy-ingest/src/prelude.rs index 516a27a..245b2be 100644 --- a/crates/nvisy-ingest/src/prelude.rs +++ b/crates/nvisy-ingest/src/prelude.rs @@ -1,38 +1,15 @@ //! Convenience re-exports. 
pub use crate::handler::{ - Handler, FormatHandler, - PlaintextHandler, CsvHandler, JsonHandler, - WavHandler, Mp3Handler, - TextLoader, BinaryLoader, ImageLoader, SpreadsheetLoader, AudioLoader, + Handler, Loader, TextEncoding, + Span, SpanEdit, + TxtData, TxtHandler, TxtSpan, + TxtLoader, TxtParams, + CsvData, CsvHandler, CsvSpan, + CsvLoader, CsvParams, + JsonData, JsonHandler, JsonIndent, + JsonParams, JsonLoader, JsonPath, }; - -#[cfg(feature = "html")] -pub use crate::handler::HtmlHandler; -#[cfg(feature = "pdf")] -pub use crate::handler::PdfHandler; -#[cfg(feature = "docx")] -pub use crate::handler::DocxHandler; -#[cfg(feature = "image")] -pub use crate::handler::ImageHandler; -#[cfg(feature = "xlsx")] -pub use crate::handler::XlsxHandler; +pub use crate::document::view_stream::SpanStream; +pub use crate::document::edit_stream::SpanEditStream; pub use crate::document::Document; -pub use crate::element::{Element, ElementCategory, ElementType}; - -pub use crate::text::csv::CsvLoader; -pub use crate::text::json::JsonLoader; -pub use crate::text::plaintext::PlaintextLoader; - -#[cfg(feature = "html")] -pub use crate::text::html::HtmlLoader; -#[cfg(feature = "pdf")] -pub use crate::binary::pdf::PdfLoader; -#[cfg(feature = "docx")] -pub use crate::binary::docx::DocxLoader; -#[cfg(feature = "image")] -pub use crate::image::image::ImageFileLoader; -#[cfg(feature = "xlsx")] -pub use crate::tabular::xlsx::XlsxLoader; -pub use crate::audio::wav::WavLoader; -pub use crate::audio::mp3::Mp3Loader; diff --git a/crates/nvisy-ingest/src/tabular/mod.rs b/crates/nvisy-ingest/src/tabular/mod.rs deleted file mode 100644 index 2c8189b..0000000 --- a/crates/nvisy-ingest/src/tabular/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -//! Tabular/spreadsheet file loaders (XLSX). - -#[cfg(feature = "xlsx")] -pub mod xlsx; diff --git a/crates/nvisy-ingest/src/tabular/xlsx.rs b/crates/nvisy-ingest/src/tabular/xlsx.rs deleted file mode 100644 index f446d4d..0000000 --- a/crates/nvisy-ingest/src/tabular/xlsx.rs +++ /dev/null @@ -1,113 +0,0 @@ -//! Excel XLSX/XLS file loader using `calamine`. - -use serde::Deserialize; -use std::io::Cursor; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::handler::{XlsxHandler, PlaintextHandler, FormatHandler, SpreadsheetLoader}; - -use calamine::{Reader, open_workbook_auto_from_rs}; - -/// Typed parameters for [`XlsxLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct XlsxLoaderParams { - /// Maximum number of rows per sheet. `None` means all rows. - #[serde(default)] - pub max_rows: Option<usize>, - /// Specific sheet names to load. Empty means all sheets. - #[serde(default)] - pub sheets: Vec<String>, -} - -/// Extracts tabular data per sheet from XLSX/XLS files, plus a flattened -/// text document for regex/dictionary scanning. 
-pub struct XlsxLoader; - -impl Clone for XlsxLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl SpreadsheetLoader for XlsxLoader { - type Params = XlsxLoaderParams; - - async fn load( - &self, - content: &ContentData, - params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let cursor = Cursor::new(content.to_bytes().to_vec()); - let mut workbook = open_workbook_auto_from_rs(cursor).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("XLSX open failed: {e}")) - })?; - - let sheet_names: Vec<String> = workbook.sheet_names().to_vec(); - let mut documents = Vec::new(); - let mut all_text_parts = Vec::new(); - - for sheet_name in &sheet_names { - if !params.sheets.is_empty() - && !params.sheets.iter().any(|s| s == sheet_name) - { - continue; - } - - let range = match workbook.worksheet_range(sheet_name) { - Ok(r) => r, - Err(e) => { - tracing::warn!("Skipping sheet '{}': {}", sheet_name, e); - continue; - } - }; - - let mut rows_iter = range.rows(); - - // First row as headers - let columns: Vec<String> = match rows_iter.next() { - Some(header_row) => header_row - .iter() - .map(|c| c.to_string()) - .collect(), - None => continue, - }; - - let mut rows = Vec::new(); - for row in rows_iter { - if let Some(max) = params.max_rows { - if rows.len() >= max { - break; - } - } - let row_data: Vec<String> = row.iter().map(|c| c.to_string()).collect(); - all_text_parts.push(row_data.join("\t")); - rows.push(row_data); - } - - let mut tabular_doc = Document::new(XlsxHandler) - .with_tabular(columns, rows) - .with_sheet_name(sheet_name); - tabular_doc.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(tabular_doc.into_format()); - } - - // Create a flattened document for regex/dictionary scanning - if !all_text_parts.is_empty() { - let mut doc = Document::new(PlaintextHandler) - .with_text(all_text_parts.join("\n")); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - documents.push(doc.into_format()); - } - - Ok(documents) - } -} - -impl crate::handler::Handler for XlsxLoader { - fn id(&self) -> &str { XlsxHandler.id() } - fn extensions(&self) -> &[&str] { XlsxHandler.extensions() } - fn content_types(&self) -> &[&str] { XlsxHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/text/csv.rs b/crates/nvisy-ingest/src/text/csv.rs deleted file mode 100644 index 17d9b01..0000000 --- a/crates/nvisy-ingest/src/text/csv.rs +++ /dev/null @@ -1,38 +0,0 @@ -//! CSV file loader. - -use nvisy_core::io::ContentData; -use nvisy_core::error::Error; - -use crate::document::Document; -use crate::handler::{CsvHandler, FormatHandler, TextLoader}; - -/// Loads CSV content into a single [`Document`] containing the raw CSV text. 
-pub struct CsvLoader; - -impl Clone for CsvLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl TextLoader for CsvLoader { - type Params = (); - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { - Error::validation(format!("Invalid UTF-8 in CSV: {}", e), "csv-loader") - })?; - let mut doc = Document::new(CsvHandler).with_text(text); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for CsvLoader { - fn id(&self) -> &str { CsvHandler.id() } - fn extensions(&self) -> &[&str] { CsvHandler.extensions() } - fn content_types(&self) -> &[&str] { CsvHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/text/html.rs b/crates/nvisy-ingest/src/text/html.rs deleted file mode 100644 index 574203b..0000000 --- a/crates/nvisy-ingest/src/text/html.rs +++ /dev/null @@ -1,106 +0,0 @@ -//! HTML file loader using the `scraper` crate. - -use serde::Deserialize; - -use nvisy_core::io::ContentData; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::document::Document; -use crate::element::{Element, ElementType}; -use crate::handler::{HtmlHandler, FormatHandler, TextLoader}; - -/// Typed parameters for [`HtmlLoader`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct HtmlLoaderParams {} - -/// Extracts text and structural elements from HTML documents. -pub struct HtmlLoader; - -impl Clone for HtmlLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl TextLoader for HtmlLoader { - type Params = HtmlLoaderParams; - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let html_str = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("HTML is not valid UTF-8: {e}")) - })?; - - let document = scraper::Html::parse_document(&html_str); - let mut elements = Vec::new(); - let mut full_text = String::new(); - - // Map HTML tags to element types - let tag_mappings: &[(&str, ElementType)] = &[ - ("h1", ElementType::Title), - ("h2", ElementType::Title), - ("h3", ElementType::Title), - ("h4", ElementType::Title), - ("h5", ElementType::Title), - ("h6", ElementType::Title), - ("p", ElementType::NarrativeText), - ("li", ElementType::ListItem), - ("table", ElementType::Table), - ("pre", ElementType::CodeSnippet), - ("code", ElementType::CodeSnippet), - ("address", ElementType::Address), - ("header", ElementType::Header), - ("footer", ElementType::Footer), - ("figcaption", ElementType::FigureCaption), - ]; - - for (tag, element_type) in tag_mappings { - let selector = scraper::Selector::parse(tag).unwrap(); - for element in document.select(&selector) { - let text: String = element.text().collect::<Vec<_>>().join(" "); - let trimmed = text.trim(); - if trimmed.is_empty() { - continue; - } - let mut el = Element::new(*element_type, trimmed); - // Set heading level for h1-h6 - if tag.starts_with('h') && tag.len() == 2 { - if let Some(level) = tag[1..].parse::<u32>().ok() { - el = el.with_level(level); - } - } - if !full_text.is_empty() { - full_text.push('\n'); - } - full_text.push_str(trimmed); - elements.push(el); - } - } - - // If no structured elements found, extract all body text - if elements.is_empty() { - let body_selector = 
scraper::Selector::parse("body").unwrap(); - if let Some(body) = document.select(&body_selector).next() { - full_text = body.text().collect::<Vec<_>>().join(" "); - let trimmed = full_text.trim().to_string(); - full_text = trimmed; - } - } - - let doc = Document::new(HtmlHandler) - .with_text(full_text) - .with_elements(elements) - .into_format(); - - Ok(vec![doc]) - } -} - -impl crate::handler::Handler for HtmlLoader { - fn id(&self) -> &str { HtmlHandler.id() } - fn extensions(&self) -> &[&str] { HtmlHandler.extensions() } - fn content_types(&self) -> &[&str] { HtmlHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/text/json.rs b/crates/nvisy-ingest/src/text/json.rs deleted file mode 100644 index 2576347..0000000 --- a/crates/nvisy-ingest/src/text/json.rs +++ /dev/null @@ -1,42 +0,0 @@ -//! JSON file loader. - -use nvisy_core::io::ContentData; -use nvisy_core::error::Error; - -use crate::document::Document; -use crate::handler::{JsonHandler, FormatHandler, TextLoader}; - -/// Loads JSON content into a single [`Document`] containing the raw JSON text. -pub struct JsonLoader; - -impl Clone for JsonLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl TextLoader for JsonLoader { - type Params = (); - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { - Error::validation(format!("Invalid UTF-8 in JSON: {}", e), "json-loader") - })?; - // Validate it's valid JSON - let _: serde_json::Value = serde_json::from_str(&text).map_err(|e| { - Error::validation(format!("Invalid JSON: {}", e), "json-loader") - })?; - let mut doc = Document::new(JsonHandler).with_text(text); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for JsonLoader { - fn id(&self) -> &str { JsonHandler.id() } - fn extensions(&self) -> &[&str] { JsonHandler.extensions() } - fn content_types(&self) -> &[&str] { JsonHandler.content_types() } -} diff --git a/crates/nvisy-ingest/src/text/mod.rs b/crates/nvisy-ingest/src/text/mod.rs deleted file mode 100644 index fdb9ae1..0000000 --- a/crates/nvisy-ingest/src/text/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Text-based file loaders (CSV, JSON, plaintext, HTML). - -pub mod csv; -pub mod json; -pub mod plaintext; - -#[cfg(feature = "html")] -pub mod html; diff --git a/crates/nvisy-ingest/src/text/plaintext.rs b/crates/nvisy-ingest/src/text/plaintext.rs deleted file mode 100644 index c55b429..0000000 --- a/crates/nvisy-ingest/src/text/plaintext.rs +++ /dev/null @@ -1,41 +0,0 @@ -//! Plain-text file loader. - -use nvisy_core::io::ContentData; -use nvisy_core::error::Error; - -use crate::document::Document; -use crate::handler::{PlaintextHandler, FormatHandler, TextLoader}; - -/// Loads plain-text content into a single [`Document`]. 
-pub struct PlaintextLoader; - -impl Clone for PlaintextLoader { - fn clone(&self) -> Self { Self } -} - -#[async_trait::async_trait] -impl TextLoader for PlaintextLoader { - type Params = (); - - async fn load( - &self, - content: &ContentData, - _params: &Self::Params, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let text = String::from_utf8(content.to_bytes().to_vec()).map_err(|e| { - Error::validation( - format!("Invalid UTF-8 in plaintext: {}", e), - "plaintext-loader", - ) - })?; - let mut doc = Document::new(PlaintextHandler).with_text(text); - doc.source.set_parent_id(Some(content.content_source.as_uuid())); - Ok(vec![doc.into_format()]) - } -} - -impl crate::handler::Handler for PlaintextLoader { - fn id(&self) -> &str { PlaintextHandler.id() } - fn extensions(&self) -> &[&str] { PlaintextHandler.extensions() } - fn content_types(&self) -> &[&str] { PlaintextHandler.content_types() } -} diff --git a/crates/nvisy-ontology/src/entity/document.rs b/crates/nvisy-ontology/src/entity/document.rs index 7499808..01d6d68 100644 --- a/crates/nvisy-ontology/src/entity/document.rs +++ b/crates/nvisy-ontology/src/entity/document.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; #[serde(rename_all = "snake_case")] pub enum DocumentType { /// Plain text (`.txt`, `.log`, etc.). - Plaintext, + Txt, /// Comma-separated values. Csv, /// JSON data. @@ -25,8 +25,6 @@ pub enum DocumentType { Png, /// JPEG image. Jpeg, - /// TIFF image. - Tiff, /// WAV audio. Wav, /// MP3 audio. diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index c7c1905..24f99fa 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -23,8 +23,8 @@ rustdoc-args = ["--cfg", "docsrs"] [features] default = ["image-redaction", "pdf-redaction"] -# Image blur/block redaction via image + imageproc -image-redaction = ["dep:image", "dep:imageproc"] +# Image blur/block redaction via image + imageproc; enables nvisy-ingest/png for PngHandler +image-redaction = ["dep:image", "dep:imageproc", "nvisy-ingest/png"] # PDF reassembly with redacted content via lopdf pdf-redaction = ["dep:lopdf"] diff --git a/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs deleted file mode 100644 index 77b7593..0000000 --- a/crates/nvisy-pipeline/src/actions/apply_audio_redaction.rs +++ /dev/null @@ -1,47 +0,0 @@ -//! Placeholder audio redaction action. - -use serde::Deserialize; - -use nvisy_core::error::Error; -use nvisy_core::io::ContentData; - -use crate::action::Action; - -/// Typed parameters for [`ApplyAudioRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyAudioRedactionParams { - /// Time segments to mute, as `(start_seconds, end_seconds)` pairs. - #[serde(default)] - pub mute_segments: Vec<(f64, f64)>, -} - -/// Placeholder action for audio redaction. -/// -/// Passes through content unchanged -- audio redaction is not yet implemented. 
-pub struct ApplyAudioRedactionAction { - params: ApplyAudioRedactionParams, -} - -#[async_trait::async_trait] -impl Action for ApplyAudioRedactionAction { - type Params = ApplyAudioRedactionParams; - type Input = ContentData; - type Output = ContentData; - - fn id(&self) -> &str { - "apply-audio-redaction" - } - - async fn connect(params: Self::Params) -> Result<Self, Error> { - Ok(Self { params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result<Self::Output, Error> { - tracing::warn!("Audio redaction not yet implemented, passing through unchanged"); - Ok(input) - } -} diff --git a/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs deleted file mode 100644 index 0079853..0000000 --- a/crates/nvisy-pipeline/src/actions/apply_image_redaction.rs +++ /dev/null @@ -1,133 +0,0 @@ -//! Image redaction action -- applies blur or block overlay to image regions. - -use bytes::Bytes; -use serde::Deserialize; - -use nvisy_ingest::handler::{FormatHandler, ImageHandler}; -use nvisy_ingest::document::Document; -use nvisy_ontology::entity::{BoundingBox, Entity}; -use nvisy_ontology::redaction::{ImageRedactionOutput, Redaction, RedactionOutput}; -use nvisy_core::error::{Error, ErrorKind}; - -use crate::action::Action; -use crate::render::{blur, block}; - -/// Typed parameters for [`ApplyImageRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyImageRedactionParams { - /// Sigma value for gaussian blur. - #[serde(default = "default_sigma")] - pub blur_sigma: f32, - /// RGBA color for block overlays. - #[serde(default = "default_color")] - pub block_color: [u8; 4], -} - -fn default_sigma() -> f32 { - 15.0 -} -fn default_color() -> [u8; 4] { - [0, 0, 0, 255] -} - -/// Applies blur or block redaction to image regions identified by entities -/// with bounding boxes. -pub struct ApplyImageRedactionAction { - params: ApplyImageRedactionParams, -} - -#[async_trait::async_trait] -impl Action for ApplyImageRedactionAction { - type Params = ApplyImageRedactionParams; - type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); - type Output = Vec<Document<FormatHandler>>; - - fn id(&self) -> &str { - "apply-image-redaction" - } - - async fn connect(params: Self::Params) -> Result<Self, Error> { - Ok(Self { params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result<Self::Output, Error> { - let (documents, entities, redactions) = input; - - // Build entity->redaction map - let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions - .iter() - .filter(|r| !r.applied) - .map(|r| (r.entity_id, r)) - .collect(); - - // Collect entities with bounding boxes, grouped by redaction method - let mut blur_regions: Vec<BoundingBox> = Vec::new(); - let mut block_regions: Vec<BoundingBox> = Vec::new(); - - for entity in &entities { - if let Some(bbox) = entity.location.bounding_box() { - if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { - match &redaction.output { - RedactionOutput::Image(ImageRedactionOutput::Blur { .. }) => { - blur_regions.push(bbox.clone()) - } - RedactionOutput::Image(ImageRedactionOutput::Block { .. 
}) => { - block_regions.push(bbox.clone()) - } - // Default non-image methods, pixelate, and synthesize to block - _ => block_regions.push(bbox.clone()), - } - } - } - } - - if blur_regions.is_empty() && block_regions.is_empty() { - return Ok(documents); - } - - // Filter for image documents only - let mut new_docs = Vec::new(); - for doc in &documents { - let image_data = match &doc.data { - Some(d) => d, - None => { - new_docs.push(doc.clone()); - continue; - } - }; - - let dyn_img = image::load_from_memory(image_data).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) - })?; - - let mut result = dyn_img; - if !blur_regions.is_empty() { - result = blur::apply_gaussian_blur(&result, &blur_regions, self.params.blur_sigma); - } - if !block_regions.is_empty() { - let color = image::Rgba(self.params.block_color); - result = block::apply_block_overlay(&result, &block_regions, color); - } - - // Encode back to PNG - let mut buf = std::io::Cursor::new(Vec::new()); - result - .write_to(&mut buf, image::ImageFormat::Png) - .map_err(|e| { - Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) - })?; - - let new_doc = Document::new(FormatHandler::Image(ImageHandler)) - .with_data(Bytes::from(buf.into_inner()), "image/png") - .with_dimensions(result.width(), result.height()); - - new_docs.push(new_doc); - } - - Ok(new_docs) - } -} diff --git a/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs index 7d5b6c3..911553e 100644 --- a/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs +++ b/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs @@ -63,7 +63,7 @@ impl Action for ApplyPdfRedactionAction { // Collect image documents for XObject replacement let images: Vec<&Document<FormatHandler>> = documents .iter() - .filter(|d| d.data.is_some()) + .filter(|d| d.image().is_some()) .collect(); if !images.is_empty() { @@ -118,10 +118,10 @@ impl Action for ApplyPdfRedactionAction { if let (Some(sid), Some(redacted_doc)) = (stream_id, images.get(image_idx)) { - if let Some(ref data) = redacted_doc.data { + if let Some(image) = redacted_doc.image() { let new_stream = lopdf::Stream::new( lopdf::Dictionary::new(), - data.to_vec(), + image.bytes.to_vec(), ); pdf_doc .objects diff --git a/crates/nvisy-pipeline/src/actions/apply_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_redaction.rs index f19480e..f9c2abc 100644 --- a/crates/nvisy-pipeline/src/actions/apply_redaction.rs +++ b/crates/nvisy-pipeline/src/actions/apply_redaction.rs @@ -1,23 +1,58 @@ -//! Action that applies pending redactions to document text. +//! Unified redaction action -- applies text, image, tabular, and audio redactions. 
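+//!
+//! A rough usage sketch (illustrative only; the `documents`, `entities`, and
+//! `redactions` inputs are assumed to come from earlier detection and policy
+//! actions in the plan):
+//!
+//! ```ignore
+//! let action = ApplyRedactionAction::connect(ApplyRedactionParams {
+//!     blur_sigma: 15.0,
+//!     block_color: [0, 0, 0, 255],
+//! })
+//! .await?;
+//! let redacted = action.execute((documents, entities, redactions)).await?;
+//! ```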
use std::collections::HashMap; use uuid::Uuid; +use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::handler::{FormatHandler, TxtHandler}; use nvisy_ingest::document::Document; +use nvisy_ingest::document::data::*; use nvisy_ontology::entity::Entity; -use nvisy_ontology::redaction::Redaction; +use nvisy_ontology::redaction::{Redaction, RedactionOutput, TextRedactionOutput}; use nvisy_core::error::Error; +#[cfg(feature = "image-redaction")] +use bytes::Bytes; +#[cfg(feature = "image-redaction")] +use nvisy_ingest::handler::PngHandler; +#[cfg(feature = "image-redaction")] +use nvisy_ontology::entity::BoundingBox; +#[cfg(feature = "image-redaction")] +use nvisy_ontology::redaction::ImageRedactionOutput; +#[cfg(feature = "image-redaction")] +use nvisy_core::error::ErrorKind; + use crate::action::Action; +/// Typed parameters for [`ApplyRedactionAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ApplyRedactionParams { + /// Sigma value for gaussian blur (image redaction). + #[serde(default = "default_sigma")] + pub blur_sigma: f32, + /// RGBA color for block overlays (image redaction). + #[serde(default = "default_color")] + pub block_color: [u8; 4], +} + +fn default_sigma() -> f32 { + 15.0 +} +fn default_color() -> [u8; 4] { + [0, 0, 0, 255] +} + /// Applies pending [`Redaction`] instructions to document content. /// -/// The action correlates entities with their redactions, locates the -/// corresponding text spans inside each document, and replaces them with -/// the computed replacement values. The resulting redacted documents are -/// returned. -pub struct ApplyRedactionAction; +/// Dispatches per-document based on content type: +/// - **Text documents**: byte-offset replacement +/// - **Image documents**: blur/block overlay (feature-gated) +/// - **Tabular documents**: cell-level redaction +/// - **Audio documents**: pass-through with warning +pub struct ApplyRedactionAction { + params: ApplyRedactionParams, +} /// A single text replacement that has been resolved but not yet applied. 
struct PendingRedaction { @@ -31,7 +66,7 @@ struct PendingRedaction { #[async_trait::async_trait] impl Action for ApplyRedactionAction { - type Params = (); + type Params = ApplyRedactionParams; type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); type Output = Vec<Document<FormatHandler>>; @@ -39,8 +74,8 @@ impl Action for ApplyRedactionAction { "apply-redaction" } - async fn connect(_params: Self::Params) -> Result<Self, Error> { - Ok(Self) + async fn connect(params: Self::Params) -> Result<Self, Error> { + Ok(Self { params }) } async fn execute( @@ -51,81 +86,121 @@ impl Action for ApplyRedactionAction { let entity_map: HashMap<Uuid, &Entity> = entities.iter().map(|e| (e.source.as_uuid(), e)).collect(); - let redaction_map: HashMap<Uuid, &Redaction> = - redactions.iter().map(|r| (r.entity_id, r)).collect(); + let redaction_map: HashMap<Uuid, &Redaction> = redactions + .iter() + .filter(|r| !r.applied) + .map(|r| (r.entity_id, r)) + .collect(); let mut result_docs = Vec::new(); for doc in &documents { - let content = match &doc.content { - Some(c) => c, - None => { - result_docs.push(doc.clone()); - continue; - } - }; - - let mut pending: Vec<PendingRedaction> = Vec::new(); - - for (entity_id, redaction) in &redaction_map { - let entity = match entity_map.get(entity_id) { - Some(e) => e, - None => continue, - }; - - // Check entity belongs to this document - let belongs = entity.source.parent_id() == Some(doc.source.as_uuid()); - if !belongs { - continue; - } - - let start_offset = match entity.location.start_offset() { - Some(s) => s, - None => continue, - }; - let end_offset = match entity.location.end_offset() { - Some(e) => e, - None => continue, - }; - - let replacement_value = redaction - .output - .replacement_value() - .unwrap_or("") - .to_string(); - - pending.push(PendingRedaction { - start_offset, - end_offset, - replacement_value, - }); + // Tabular documents + if doc.tabular().is_some() { + let redacted = apply_tabular_doc(doc, &entities, &redaction_map); + result_docs.push(redacted); + continue; } - if pending.is_empty() { - result_docs.push(doc.clone()); + // Image documents + #[cfg(feature = "image-redaction")] + if doc.image().is_some() { + let redacted = apply_image_doc( + doc, + &entities, + &redaction_map, + self.params.blur_sigma, + self.params.block_color, + )?; + result_docs.push(redacted); continue; } - let redacted_content = apply_redactions(content, &mut pending); - let mut result = Document::new(FormatHandler::Plaintext(PlaintextHandler)) - .with_text(redacted_content); - result.title = doc.title.clone(); - result.elements = doc.elements.clone(); - result.page_count = doc.page_count; - result.source.set_parent_id(Some(doc.source.as_uuid())); + // Text documents (content present) + if let Some(content) = doc.text() { + let redacted = apply_text_doc( + doc, + content, + &entity_map, + &redaction_map, + ); + result_docs.push(redacted); + continue; + } - result_docs.push(result); + // Fallback: pass through unchanged + result_docs.push(doc.clone()); } Ok(result_docs) } } +// --------------------------------------------------------------------------- +// Text redaction +// --------------------------------------------------------------------------- + +fn apply_text_doc( + doc: &Document<FormatHandler>, + content: &str, + entity_map: &HashMap<Uuid, &Entity>, + redaction_map: &HashMap<Uuid, &Redaction>, +) -> Document<FormatHandler> { + let mut pending: Vec<PendingRedaction> = Vec::new(); + + for (entity_id, redaction) in redaction_map { + let 
entity = match entity_map.get(entity_id) { + Some(e) => e, + None => continue, + }; + + // Check entity belongs to this document + let belongs = entity.source.parent_id() == Some(doc.source.as_uuid()); + if !belongs { + continue; + } + + let start_offset = match entity.location.start_offset() { + Some(s) => s, + None => continue, + }; + let end_offset = match entity.location.end_offset() { + Some(e) => e, + None => continue, + }; + + let replacement_value = redaction + .output + .replacement_value() + .unwrap_or("") + .to_string(); + + pending.push(PendingRedaction { + start_offset, + end_offset, + replacement_value, + }); + } + + if pending.is_empty() { + return doc.clone(); + } + + let redacted_content = apply_text_redactions(content, &mut pending); + let mut result = Document::new( + FormatHandler::Txt(TxtHandler), + DocumentData::Text(TextData { text: redacted_content }), + ); + result.source.set_parent_id(Some(doc.source.as_uuid())); + + result +} + /// Applies a set of pending redactions to `text`, returning the redacted result. /// /// Replacements are applied right-to-left (descending start offset) so that /// earlier byte offsets remain valid after each substitution. -fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { +fn apply_text_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { // Sort by start offset descending (right-to-left) to preserve positions pending.sort_by(|a, b| b.start_offset.cmp(&a.start_offset)); @@ -146,3 +221,137 @@ fn apply_redactions(text: &str, pending: &mut [PendingRedaction]) -> String { } result } + +// --------------------------------------------------------------------------- +// Image redaction (feature-gated) +// --------------------------------------------------------------------------- + +#[cfg(feature = "image-redaction")] +fn apply_image_doc( + doc: &Document<FormatHandler>, + entities: &[Entity], + redaction_map: &HashMap<Uuid, &Redaction>, + blur_sigma: f32, + block_color: [u8; 4], +) -> Result<Document<FormatHandler>, Error> { + use crate::render::{blur, block}; + + let image_data = match doc.image() { + Some(d) => d, + None => return Ok(doc.clone()), + }; + + let mut blur_regions: Vec<BoundingBox> = Vec::new(); + let mut block_regions: Vec<BoundingBox> = Vec::new(); + + for entity in entities { + if let Some(bbox) = entity.location.bounding_box() { + if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { + match &redaction.output { + RedactionOutput::Image(ImageRedactionOutput::Blur { .. }) => { + blur_regions.push(bbox.clone()) + } + RedactionOutput::Image(ImageRedactionOutput::Block { .. 
}) => { + block_regions.push(bbox.clone()) + } + _ => block_regions.push(bbox.clone()), + } + } + } + } + + if blur_regions.is_empty() && block_regions.is_empty() { + return Ok(doc.clone()); + } + + let dyn_img = image::load_from_memory(&image_data.bytes).map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) + })?; + + let mut result = dyn_img; + if !blur_regions.is_empty() { + result = blur::apply_gaussian_blur(&result, &blur_regions, blur_sigma); + } + if !block_regions.is_empty() { + let color = image::Rgba(block_color); + result = block::apply_block_overlay(&result, &block_regions, color); + } + + // Encode back to PNG + let mut buf = std::io::Cursor::new(Vec::new()); + result + .write_to(&mut buf, image::ImageFormat::Png) + .map_err(|e| { + Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) + })?; + + let new_doc = Document::new( + FormatHandler::Png(PngHandler), + DocumentData::Image(ImageData { + bytes: Bytes::from(buf.into_inner()), + mime_type: "image/png".to_string(), + width: result.width(), + height: result.height(), + }), + ); + + Ok(new_doc) +} + +// --------------------------------------------------------------------------- +// Tabular redaction +// --------------------------------------------------------------------------- + +fn apply_tabular_doc( + doc: &Document<FormatHandler>, + entities: &[Entity], + redaction_map: &HashMap<Uuid, &Redaction>, +) -> Document<FormatHandler> { + let mut result = doc.clone(); + + for entity in entities { + if let (Some(row_idx), Some(col_idx)) = + (entity.location.row_index(), entity.location.column_index()) + { + if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { + if let Some(tabular) = result.tabular_mut() { + if let Some(row) = tabular.rows.get_mut(row_idx) { + if let Some(cell) = row.get_mut(col_idx) { + *cell = apply_cell_redaction(cell, &redaction.output); + } + } + } + } + } + } + + result +} + +fn apply_cell_redaction(cell: &str, output: &RedactionOutput) -> String { + match output { + RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { + if cell.len() > 4 { + format!( + "{}{}", + mask_char.to_string().repeat(cell.len() - 4), + &cell[cell.len() - 4..] + ) + } else { + mask_char.to_string().repeat(cell.len()) + } + } + RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), + RedactionOutput::Text(TextRedactionOutput::Hash { .. }) => { + format!("[HASH:{:x}]", hash_string(cell)) + } + _ => output.replacement_value().unwrap_or("").to_string(), + } +} + +fn hash_string(s: &str) -> u64 { + use std::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + s.hash(&mut hasher); + hasher.finish() +} diff --git a/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs deleted file mode 100644 index aff358f..0000000 --- a/crates/nvisy-pipeline/src/actions/apply_tabular_redaction.rs +++ /dev/null @@ -1,103 +0,0 @@ -//! Tabular data redaction action -- applies redaction to specific cells. - -use serde::Deserialize; - -use nvisy_ingest::handler::FormatHandler; -use nvisy_ingest::document::Document; -use nvisy_ontology::entity::Entity; -use nvisy_ontology::redaction::{Redaction, RedactionOutput, TextRedactionOutput}; -use nvisy_core::error::Error; - -use crate::action::Action; - -/// Typed parameters for [`ApplyTabularRedactionAction`]. 
-#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyTabularRedactionParams {} - -/// Applies pending redactions to tabular data cells. -/// -/// For entities with `row_index` and `column_index`, the corresponding cell -/// value is redacted according to the redaction method (mask, replace, -/// remove, hash). -pub struct ApplyTabularRedactionAction { - params: ApplyTabularRedactionParams, -} - -#[async_trait::async_trait] -impl Action for ApplyTabularRedactionAction { - type Params = ApplyTabularRedactionParams; - type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); - type Output = Vec<Document<FormatHandler>>; - - fn id(&self) -> &str { - "apply-tabular-redaction" - } - - async fn connect(params: Self::Params) -> Result<Self, Error> { - Ok(Self { params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result<Self::Output, Error> { - let (mut documents, entities, redactions) = input; - - // Build entity->redaction map - let redaction_map: std::collections::HashMap<uuid::Uuid, &Redaction> = redactions - .iter() - .filter(|r| !r.applied) - .map(|r| (r.entity_id, r)) - .collect(); - - for entity in &entities { - if let (Some(row_idx), Some(col_idx)) = - (entity.location.row_index(), entity.location.column_index()) - { - if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { - for doc in &mut documents { - if let Some(rows) = &mut doc.rows { - if let Some(row) = rows.get_mut(row_idx) { - if let Some(cell) = row.get_mut(col_idx) { - *cell = apply_cell_redaction(cell, &redaction.output); - } - } - } - } - } - } - } - - Ok(documents) - } -} - -fn apply_cell_redaction(cell: &str, output: &RedactionOutput) -> String { - match output { - RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { - // Mask all but last 4 characters - if cell.len() > 4 { - format!( - "{}{}", - mask_char.to_string().repeat(cell.len() - 4), - &cell[cell.len() - 4..] - ) - } else { - mask_char.to_string().repeat(cell.len()) - } - } - RedactionOutput::Text(TextRedactionOutput::Remove) => String::new(), - RedactionOutput::Text(TextRedactionOutput::Hash { .. 
}) => { - format!("[HASH:{:x}]", hash_string(cell)) - } - _ => output.replacement_value().unwrap_or("").to_string(), - } -} - -fn hash_string(s: &str) -> u64 { - use std::hash::{Hash, Hasher}; - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - s.hash(&mut hasher); - hasher.finish() -} diff --git a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs index c74bf75..d1df595 100644 --- a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs +++ b/crates/nvisy-pipeline/src/actions/detect_dictionary.rs @@ -83,7 +83,7 @@ impl Action for DetectDictionaryAction { for doc in &documents { // Text content matching - if let Some(content) = &doc.content { + if let Some(content) = doc.text() { for (def, ac, values) in &self.automata { for mat in ac.find_iter(content) { let value = &values[mat.pattern().as_usize()]; @@ -109,8 +109,8 @@ impl Action for DetectDictionaryAction { } // Tabular content matching - if let Some(rows) = &doc.rows { - for (row_idx, row) in rows.iter().enumerate() { + if let Some(tabular) = doc.tabular() { + for (row_idx, row) in tabular.rows.iter().enumerate() { for (col_idx, cell) in row.iter().enumerate() { if cell.is_empty() { continue; diff --git a/crates/nvisy-pipeline/src/actions/detect_regex.rs b/crates/nvisy-pipeline/src/actions/detect_regex.rs index bd0ee3d..a1a21ca 100644 --- a/crates/nvisy-pipeline/src/actions/detect_regex.rs +++ b/crates/nvisy-pipeline/src/actions/detect_regex.rs @@ -55,7 +55,7 @@ impl Action for DetectRegexAction { let mut entities = Vec::new(); for doc in &documents { - let content = match &doc.content { + let content = match doc.text() { Some(c) => c, None => continue, }; diff --git a/crates/nvisy-pipeline/src/actions/detect_tabular.rs b/crates/nvisy-pipeline/src/actions/detect_tabular.rs index b68bff0..e0a7fa2 100644 --- a/crates/nvisy-pipeline/src/actions/detect_tabular.rs +++ b/crates/nvisy-pipeline/src/actions/detect_tabular.rs @@ -79,22 +79,18 @@ impl Action for DetectTabularAction { let mut entities = Vec::new(); for doc in &documents { - let columns = match &doc.columns { - Some(c) => c, - None => continue, - }; - let rows = match &doc.rows { - Some(r) => r, + let tabular = match doc.tabular() { + Some(t) => t, None => continue, }; - for (col_idx, col_name) in columns.iter().enumerate() { + for (col_idx, col_name) in tabular.columns.iter().enumerate() { for (regex, rule) in &self.compiled_rules { if !regex.is_match(col_name) { continue; } - for (row_idx, row) in rows.iter().enumerate() { + for (row_idx, row) in tabular.rows.iter().enumerate() { if let Some(cell) = row.get(col_idx) { if cell.is_empty() { continue; diff --git a/crates/nvisy-pipeline/src/actions/mod.rs b/crates/nvisy-pipeline/src/actions/mod.rs index 444e46f..9d34d64 100644 --- a/crates/nvisy-pipeline/src/actions/mod.rs +++ b/crates/nvisy-pipeline/src/actions/mod.rs @@ -3,7 +3,7 @@ //! Each sub-module exposes a single [`Action`](crate::action::Action) //! implementation that can be wired into an nvisy execution plan. -/// Applies pending redactions to document content. +/// Applies pending redactions to document content (text, image, tabular, audio). pub mod apply_redaction; /// Computes a sensitivity classification for each blob based on detected entities. pub mod classify; @@ -21,13 +21,6 @@ pub mod detect_tabular; pub mod emit_audit; /// Evaluates policy rules against detected entities and produces redaction instructions. 
pub mod evaluate_policy; -/// Applies image redactions (blur, block) to image artifacts. -#[cfg(feature = "image-redaction")] -pub mod apply_image_redaction; -/// Applies redactions to tabular data cells. -pub mod apply_tabular_redaction; /// Reassembles redacted content into PDF files. #[cfg(feature = "pdf-redaction")] pub mod apply_pdf_redaction; -/// Placeholder for audio redaction. -pub mod apply_audio_redaction; diff --git a/crates/nvisy-pipeline/src/prelude.rs b/crates/nvisy-pipeline/src/prelude.rs index e3d7556..877431e 100644 --- a/crates/nvisy-pipeline/src/prelude.rs +++ b/crates/nvisy-pipeline/src/prelude.rs @@ -10,12 +10,8 @@ pub use crate::actions::detect_manual::{DetectManualAction, DetectManualParams}; pub use crate::actions::detect_checksum::{DetectChecksumAction, DetectChecksumParams}; pub use crate::actions::classify::{ClassifyAction, ClassificationResult}; pub use crate::actions::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; -pub use crate::actions::apply_redaction::ApplyRedactionAction; +pub use crate::actions::apply_redaction::{ApplyRedactionAction, ApplyRedactionParams}; pub use crate::actions::emit_audit::{EmitAuditAction, EmitAuditParams}; -pub use crate::actions::apply_tabular_redaction::{ApplyTabularRedactionAction, ApplyTabularRedactionParams}; -pub use crate::actions::apply_audio_redaction::{ApplyAudioRedactionAction, ApplyAudioRedactionParams}; -#[cfg(feature = "image-redaction")] -pub use crate::actions::apply_image_redaction::{ApplyImageRedactionAction, ApplyImageRedactionParams}; #[cfg(feature = "pdf-redaction")] pub use crate::actions::apply_pdf_redaction::{ApplyPdfRedactionAction, ApplyPdfRedactionParams}; diff --git a/crates/nvisy-python/src/actions/mod.rs b/crates/nvisy-python/src/actions/mod.rs index 4986b46..cd5e2b1 100644 --- a/crates/nvisy-python/src/actions/mod.rs +++ b/crates/nvisy-python/src/actions/mod.rs @@ -10,8 +10,9 @@ pub mod ocr; use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::handler::{FormatHandler, TxtHandler}; use nvisy_ingest::document::Document; +use nvisy_ingest::document::data::*; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; @@ -92,14 +93,17 @@ impl Action for DetectNerAction { "python/ner", false, ))?; - vec![Document::new(FormatHandler::Plaintext(PlaintextHandler)).with_text(text)] + vec![Document::new( + FormatHandler::Txt(TxtHandler), + DocumentData::Text(TextData { text: text.to_string() }), + )] } else { documents }; let mut all_entities = Vec::new(); for doc in &docs { - if let Some(ref content) = doc.content { + if let Some(content) = doc.text() { let entities = ner::detect_ner(&self.bridge, content, &config).await?; all_entities.extend(entities); } @@ -163,11 +167,11 @@ impl Action for DetectNerImageAction { all_entities.extend(entities); } else { for doc in &images { - if let (Some(data), Some(mime)) = (&doc.data, &doc.mime_type) { + if let Some(image) = doc.image() { let entities = ner::detect_ner_image( &self.bridge, - data, - mime, + &image.bytes, + &image.mime_type, &config, ).await?; all_entities.extend(entities); diff --git a/crates/nvisy-python/src/actions/ocr.rs b/crates/nvisy-python/src/actions/ocr.rs index 34d84ef..9d54e17 100644 --- a/crates/nvisy-python/src/actions/ocr.rs +++ b/crates/nvisy-python/src/actions/ocr.rs @@ -2,8 +2,9 @@ use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, PlaintextHandler}; +use nvisy_ingest::handler::{FormatHandler, TxtHandler}; use 
nvisy_ingest::document::Document; +use nvisy_ingest::document::data::*; use nvisy_ontology::entity::Entity; use nvisy_core::error::Error; use nvisy_core::io::ContentData; @@ -95,9 +96,9 @@ impl Action for OcrDetectAction { all_entities.extend(entities); } else { for doc in &images { - if let (Some(data), Some(mime)) = (&doc.data, &doc.mime_type) { + if let Some(image) = doc.image() { let entities = - ocr::detect_ocr(&self.bridge, data, mime, &config) + ocr::detect_ocr(&self.bridge, &image.bytes, &image.mime_type, &config) .await?; for entity in &entities { all_ocr_text.push(entity.value.clone()); @@ -110,7 +111,10 @@ impl Action for OcrDetectAction { // Create a Document from concatenated OCR text for downstream processing let mut documents = Vec::new(); if !all_ocr_text.is_empty() { - let ocr_doc = Document::new(FormatHandler::Plaintext(PlaintextHandler)).with_text(all_ocr_text.join("\n")); + let ocr_doc = Document::new( + FormatHandler::Txt(TxtHandler), + DocumentData::Text(TextData { text: all_ocr_text.join("\n") }), + ); documents.push(ocr_doc); } From be0685454856f621fffeae8d1af0978af53409ec Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sun, 15 Feb 2026 17:07:17 +0100 Subject: [PATCH 16/17] feat(pipeline): add AI detection, generation, and audio redaction stubs; reorganize into detection/redaction/generation modules Add NER detection, OCR, transcription, and synthetic data generation action stubs with typed input/output structs. Add audio redaction pass-through to apply action. Enrich ApplyRedactionParams to cover text, image, and audio modalities. Add bytes field to WavHandler and Mp3Handler. Split flat actions/ directory into detection/, redaction/, and generation/ top-level modules; move render/ under redaction/. 
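For orientation, the new layout groups actions by role; a rough sketch of the
resulting import paths (illustrative, not exhaustive -- see the prelude changes
in this patch for the authoritative re-exports):

    use nvisy_pipeline::detection::{ner::DetectNerAction, regex::DetectRegexAction};
    use nvisy_pipeline::redaction::apply::{ApplyRedactionAction, ApplyRedactionParams};
    use nvisy_pipeline::generation::ocr::GenerateOcrAction;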
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- crates/nvisy-ingest/src/handler/audio/mp3.rs | 18 +- crates/nvisy-ingest/src/handler/audio/wav.rs | 18 +- crates/nvisy-ingest/src/handler/mod.rs | 8 + crates/nvisy-pipeline/Cargo.toml | 12 +- .../src/actions/apply_pdf_redaction.rs | 148 ---------- crates/nvisy-pipeline/src/actions/mod.rs | 26 -- .../checksum.rs} | 6 +- .../src/{actions => detection}/classify.rs | 0 .../dictionary.rs} | 113 ++++---- .../detect_manual.rs => detection/manual.rs} | 20 +- crates/nvisy-pipeline/src/detection/mod.rs | 20 ++ crates/nvisy-pipeline/src/detection/ner.rs | 64 +++++ .../detect_regex.rs => detection/regex.rs} | 33 +-- .../tabular.rs} | 32 +-- crates/nvisy-pipeline/src/generation/mod.rs | 14 + crates/nvisy-pipeline/src/generation/ocr.rs | 81 ++++++ .../src/generation/synthetic.rs | 59 ++++ .../src/generation/transcribe.rs | 77 +++++ crates/nvisy-pipeline/src/lib.rs | 21 +- crates/nvisy-pipeline/src/prelude.rs | 36 ++- .../apply_redaction.rs => redaction/apply.rs} | 268 +++++++++++------- .../src/{actions => redaction}/emit_audit.rs | 47 ++- .../{actions => redaction}/evaluate_policy.rs | 3 - crates/nvisy-pipeline/src/redaction/mod.rs | 14 + .../src/{ => redaction}/render/block.rs | 0 .../src/{ => redaction}/render/blur.rs | 0 .../src/{ => redaction}/render/mod.rs | 0 27 files changed, 702 insertions(+), 436 deletions(-) delete mode 100644 crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs delete mode 100644 crates/nvisy-pipeline/src/actions/mod.rs rename crates/nvisy-pipeline/src/{actions/detect_checksum.rs => detection/checksum.rs} (88%) rename crates/nvisy-pipeline/src/{actions => detection}/classify.rs (100%) rename crates/nvisy-pipeline/src/{actions/detect_dictionary.rs => detection/dictionary.rs} (57%) rename crates/nvisy-pipeline/src/{actions/detect_manual.rs => detection/manual.rs} (77%) create mode 100644 crates/nvisy-pipeline/src/detection/mod.rs create mode 100644 crates/nvisy-pipeline/src/detection/ner.rs rename crates/nvisy-pipeline/src/{actions/detect_regex.rs => detection/regex.rs} (77%) rename crates/nvisy-pipeline/src/{actions/detect_tabular.rs => detection/tabular.rs} (78%) create mode 100644 crates/nvisy-pipeline/src/generation/mod.rs create mode 100644 crates/nvisy-pipeline/src/generation/ocr.rs create mode 100644 crates/nvisy-pipeline/src/generation/synthetic.rs create mode 100644 crates/nvisy-pipeline/src/generation/transcribe.rs rename crates/nvisy-pipeline/src/{actions/apply_redaction.rs => redaction/apply.rs} (53%) rename crates/nvisy-pipeline/src/{actions => redaction}/emit_audit.rs (56%) rename crates/nvisy-pipeline/src/{actions => redaction}/evaluate_policy.rs (99%) create mode 100644 crates/nvisy-pipeline/src/redaction/mod.rs rename crates/nvisy-pipeline/src/{ => redaction}/render/block.rs (100%) rename crates/nvisy-pipeline/src/{ => redaction}/render/blur.rs (100%) rename crates/nvisy-pipeline/src/{ => redaction}/render/mod.rs (100%) diff --git a/crates/nvisy-ingest/src/handler/audio/mp3.rs b/crates/nvisy-ingest/src/handler/audio/mp3.rs index fea96de..5e0a962 100644 --- a/crates/nvisy-ingest/src/handler/audio/mp3.rs +++ b/crates/nvisy-ingest/src/handler/audio/mp3.rs @@ -1,5 +1,7 @@ //! MP3 handler (stub — awaiting migration to Loader/Handler pattern). 
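+//!
+//! Construction sketch (illustrative; the handler is still a stub and, as of
+//! this change, only carries the raw MP3 bytes):
+//!
+//! ```ignore
+//! use bytes::Bytes;
+//!
+//! let mp3 = Mp3Handler::new(Bytes::from_static(b"ID3"));
+//! assert_eq!(mp3.bytes().len(), 3);
+//! ```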
+use bytes::Bytes; + use nvisy_core::error::Error; use nvisy_ontology::entity::DocumentType; @@ -7,8 +9,20 @@ use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::Handler; -#[derive(Debug)] -pub struct Mp3Handler; +#[derive(Debug, Clone)] +pub struct Mp3Handler { + pub(crate) bytes: Bytes, +} + +impl Mp3Handler { + pub fn new(bytes: Bytes) -> Self { + Self { bytes } + } + + pub fn bytes(&self) -> &Bytes { + &self.bytes + } +} #[async_trait::async_trait] impl Handler for Mp3Handler { diff --git a/crates/nvisy-ingest/src/handler/audio/wav.rs b/crates/nvisy-ingest/src/handler/audio/wav.rs index cedaf05..c8cb4e4 100644 --- a/crates/nvisy-ingest/src/handler/audio/wav.rs +++ b/crates/nvisy-ingest/src/handler/audio/wav.rs @@ -1,5 +1,7 @@ //! WAV handler (stub — awaiting migration to Loader/Handler pattern). +use bytes::Bytes; + use nvisy_core::error::Error; use nvisy_ontology::entity::DocumentType; @@ -7,8 +9,20 @@ use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::Handler; -#[derive(Debug)] -pub struct WavHandler; +#[derive(Debug, Clone)] +pub struct WavHandler { + pub(crate) bytes: Bytes, +} + +impl WavHandler { + pub fn new(bytes: Bytes) -> Self { + Self { bytes } + } + + pub fn bytes(&self) -> &Bytes { + &self.bytes + } +} #[async_trait::async_trait] impl Handler for WavHandler { diff --git a/crates/nvisy-ingest/src/handler/mod.rs b/crates/nvisy-ingest/src/handler/mod.rs index 47a44d0..13fea3b 100644 --- a/crates/nvisy-ingest/src/handler/mod.rs +++ b/crates/nvisy-ingest/src/handler/mod.rs @@ -36,6 +36,14 @@ pub use text::json_handler::{ }; pub use text::json_loader::{JsonParams, JsonLoader}; +#[cfg(feature = "png")] +pub use image::png::PngHandler; + +#[cfg(feature = "wav")] +pub use audio::wav::WavHandler; +#[cfg(feature = "mp3")] +pub use audio::mp3::Mp3Handler; + /// Trait implemented by all format handlers. 
/// /// A handler holds loaded, validated content and provides methods to diff --git a/crates/nvisy-pipeline/Cargo.toml b/crates/nvisy-pipeline/Cargo.toml index 24f99fa..368e7ba 100644 --- a/crates/nvisy-pipeline/Cargo.toml +++ b/crates/nvisy-pipeline/Cargo.toml @@ -22,11 +22,11 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = ["image-redaction", "pdf-redaction"] +default = ["image-redaction", "audio-redaction"] # Image blur/block redaction via image + imageproc; enables nvisy-ingest/png for PngHandler image-redaction = ["dep:image", "dep:imageproc", "nvisy-ingest/png"] -# PDF reassembly with redacted content via lopdf -pdf-redaction = ["dep:lopdf"] +# Audio redaction pass-through; enables nvisy-ingest/wav for WavHandler +audio-redaction = ["nvisy-ingest/wav"] [dependencies] # Internal crates @@ -47,6 +47,9 @@ async-trait = { workspace = true, features = [] } uuid = { workspace = true, features = ["v4"] } bytes = { workspace = true, features = [] } +# Time +jiff = { workspace = true, features = [] } + # Text processing regex = { workspace = true, features = [] } aho-corasick = { workspace = true, features = [] } @@ -57,6 +60,3 @@ tracing = { workspace = true, features = [] } # Image processing (feature-gated) image = { workspace = true, optional = true, features = [] } imageproc = { workspace = true, optional = true, features = [] } - -# PDF manipulation (feature-gated) -lopdf = { workspace = true, optional = true, features = [] } diff --git a/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs b/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs deleted file mode 100644 index 911553e..0000000 --- a/crates/nvisy-pipeline/src/actions/apply_pdf_redaction.rs +++ /dev/null @@ -1,148 +0,0 @@ -//! PDF reassembly action -- writes redacted content back to PDF bytes. - -use bytes::Bytes; -use serde::Deserialize; - -use nvisy_ingest::handler::FormatHandler; -use nvisy_ingest::document::Document; -use nvisy_core::error::{Error, ErrorKind}; -use nvisy_core::io::ContentData; -use nvisy_core::path::ContentSource; - -use crate::action::Action; - -/// Typed parameters for [`ApplyPdfRedactionAction`]. -#[derive(Debug, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct ApplyPdfRedactionParams {} - -/// Reassembles redacted text and images back into the original PDF. -/// -/// Uses `lopdf` to: -/// 1. Replace PDF content streams with redacted text. -/// 2. Replace embedded image XObjects with redacted image data. -/// 3. Write the modified PDF back to a new `ContentData`. 
-pub struct ApplyPdfRedactionAction { - params: ApplyPdfRedactionParams, -} - -#[async_trait::async_trait] -impl Action for ApplyPdfRedactionAction { - type Params = ApplyPdfRedactionParams; - type Input = (ContentData, Vec<Document<FormatHandler>>); - type Output = ContentData; - - fn id(&self) -> &str { - "apply-pdf-redaction" - } - - async fn connect(params: Self::Params) -> Result<Self, Error> { - Ok(Self { params }) - } - - async fn execute( - &self, - input: Self::Input, - ) -> Result<Self::Output, Error> { - let (content, documents) = input; - - // Only process if the content is actually a PDF - let is_pdf = content - .content_type() - .map(|ct| ct == "application/pdf") - .unwrap_or(false); - - if !is_pdf { - return Ok(content); - } - - let mut pdf_doc = lopdf::Document::load_mem(content.as_bytes()).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF load failed: {e}")) - })?; - - // Collect image documents for XObject replacement - let images: Vec<&Document<FormatHandler>> = documents - .iter() - .filter(|d| d.image().is_some()) - .collect(); - - if !images.is_empty() { - let pages: Vec<(u32, lopdf::ObjectId)> = - pdf_doc.get_pages().into_iter().collect(); - let mut image_idx = 0; - - for (_page_num, page_id) in &pages { - let (resources_opt, _) = match pdf_doc.get_page_resources(*page_id) { - Ok(r) => r, - Err(_) => continue, - }; - - let resources = match resources_opt { - Some(res) => res.clone(), - None => continue, - }; - - let xobject_obj = match resources.get(b"XObject") { - Ok(obj) => obj.clone(), - Err(_) => continue, - }; - - let xobjects = match pdf_doc.dereference(&xobject_obj) { - Ok((_, lopdf::Object::Dictionary(dict))) => dict.clone(), - _ => continue, - }; - - for (_name, obj_ref) in xobjects.iter() { - let stream_id = match obj_ref { - lopdf::Object::Reference(id) => Some(*id), - _ => None, - }; - - let is_image = match pdf_doc.dereference(obj_ref) { - Ok((_, lopdf::Object::Stream(s))) => s - .dict - .get(b"Subtype") - .ok() - .and_then(|st| { - if let lopdf::Object::Name(n) = st { - Some(n.as_slice() == b"Image") - } else { - None - } - }) - .unwrap_or(false), - _ => false, - }; - - if is_image { - if let (Some(sid), Some(redacted_doc)) = - (stream_id, images.get(image_idx)) - { - if let Some(image) = redacted_doc.image() { - let new_stream = lopdf::Stream::new( - lopdf::Dictionary::new(), - image.bytes.to_vec(), - ); - pdf_doc - .objects - .insert(sid, lopdf::Object::Stream(new_stream)); - } - image_idx += 1; - } - } - } - } - } - - // Write the modified PDF to a buffer - let mut output_buf = Vec::new(); - pdf_doc.save_to(&mut output_buf).map_err(|e| { - Error::new(ErrorKind::Runtime, format!("PDF save failed: {e}")) - })?; - - let result = ContentData::new(ContentSource::new(), Bytes::from(output_buf)) - .with_content_type("application/pdf"); - - Ok(result) - } -} diff --git a/crates/nvisy-pipeline/src/actions/mod.rs b/crates/nvisy-pipeline/src/actions/mod.rs deleted file mode 100644 index 9d34d64..0000000 --- a/crates/nvisy-pipeline/src/actions/mod.rs +++ /dev/null @@ -1,26 +0,0 @@ -//! Pipeline actions for the detection and redaction workflow. -//! -//! Each sub-module exposes a single [`Action`](crate::action::Action) -//! implementation that can be wired into an nvisy execution plan. - -/// Applies pending redactions to document content (text, image, tabular, audio). -pub mod apply_redaction; -/// Computes a sensitivity classification for each blob based on detected entities. 
-pub mod classify; -/// Validates detected entities using checksum algorithms (e.g. Luhn). -pub mod detect_checksum; -/// Aho-Corasick dictionary-based entity detection. -pub mod detect_dictionary; -/// Converts user-provided manual annotations into entities. -pub mod detect_manual; -/// Scans document text with compiled regex patterns to detect PII/PHI entities. -pub mod detect_regex; -/// Column-based rule matching for tabular data. -pub mod detect_tabular; -/// Emits audit trail records for every applied redaction. -pub mod emit_audit; -/// Evaluates policy rules against detected entities and produces redaction instructions. -pub mod evaluate_policy; -/// Reassembles redacted content into PDF files. -#[cfg(feature = "pdf-redaction")] -pub mod apply_pdf_redaction; diff --git a/crates/nvisy-pipeline/src/actions/detect_checksum.rs b/crates/nvisy-pipeline/src/detection/checksum.rs similarity index 88% rename from crates/nvisy-pipeline/src/actions/detect_checksum.rs rename to crates/nvisy-pipeline/src/detection/checksum.rs index e28a302..f301e00 100644 --- a/crates/nvisy-pipeline/src/actions/detect_checksum.rs +++ b/crates/nvisy-pipeline/src/detection/checksum.rs @@ -77,8 +77,12 @@ impl Action for DetectChecksumAction { &entity.value, DetectionMethod::Checksum, (entity.confidence + confidence_boost).min(1.0), - entity.location.clone(), ); + boosted.text_location = entity.text_location.clone(); + boosted.image_location = entity.image_location.clone(); + boosted.tabular_location = entity.tabular_location.clone(); + boosted.audio_location = entity.audio_location.clone(); + boosted.video_location = entity.video_location.clone(); boosted.source.set_parent_id(entity.source.parent_id()); result.push(boosted); diff --git a/crates/nvisy-pipeline/src/actions/classify.rs b/crates/nvisy-pipeline/src/detection/classify.rs similarity index 100% rename from crates/nvisy-pipeline/src/actions/classify.rs rename to crates/nvisy-pipeline/src/detection/classify.rs diff --git a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs b/crates/nvisy-pipeline/src/detection/dictionary.rs similarity index 57% rename from crates/nvisy-pipeline/src/actions/detect_dictionary.rs rename to crates/nvisy-pipeline/src/detection/dictionary.rs index d1df595..e653724 100644 --- a/crates/nvisy-pipeline/src/actions/detect_dictionary.rs +++ b/crates/nvisy-pipeline/src/detection/dictionary.rs @@ -3,10 +3,10 @@ use aho_corasick::AhoCorasick; use serde::Deserialize; -use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::handler::{TxtHandler, CsvHandler}; use nvisy_ingest::document::Document; use nvisy_ontology::entity::{ - DetectionMethod, Entity, EntityCategory, EntityLocation, TabularLocation, TextLocation, + DetectionMethod, Entity, EntityCategory, TabularLocation, TextLocation, }; use nvisy_core::error::{Error, ErrorKind}; use nvisy_pattern::dictionaries; @@ -56,7 +56,7 @@ pub struct DetectDictionaryAction { #[async_trait::async_trait] impl Action for DetectDictionaryAction { type Params = DetectDictionaryParams; - type Input = Vec<Document<FormatHandler>>; + type Input = (Vec<Document<TxtHandler>>, Vec<Document<CsvHandler>>); type Output = Vec<Entity>; fn id(&self) -> &str { @@ -76,64 +76,69 @@ impl Action for DetectDictionaryAction { async fn execute( &self, - documents: Self::Input, + input: Self::Input, ) -> Result<Vec<Entity>, Error> { + let (text_docs, tabular_docs) = input; let confidence = self.params.confidence; let mut entities = Vec::new(); - for doc in &documents { - // Text content matching - if let 
Some(content) = doc.text() { - for (def, ac, values) in &self.automata { - for mat in ac.find_iter(content) { - let value = &values[mat.pattern().as_usize()]; - let entity = Entity::new( - def.category.clone(), - &def.entity_type, - value.as_str(), - DetectionMethod::Dictionary, - confidence, - EntityLocation::Text(TextLocation { - start_offset: mat.start(), - end_offset: mat.end(), - context_start_offset: None, - context_end_offset: None, - element_id: None, - page_number: None, - }), - ) - .with_parent(&doc.source); - entities.push(entity); - } + // Text content matching + for doc in &text_docs { + let lines = doc.handler().lines(); + let mut content = lines.join("\n"); + if doc.handler().trailing_newline() { + content.push('\n'); + } + + for (def, ac, values) in &self.automata { + for mat in ac.find_iter(&content) { + let value = &values[mat.pattern().as_usize()]; + let entity = Entity::new( + def.category.clone(), + &def.entity_type, + value.as_str(), + DetectionMethod::Dictionary, + confidence, + ) + .with_text_location(TextLocation { + start_offset: mat.start(), + end_offset: mat.end(), + context_start_offset: None, + context_end_offset: None, + element_id: None, + page_number: None, + }) + .with_parent(&doc.source); + entities.push(entity); } } + } - // Tabular content matching - if let Some(tabular) = doc.tabular() { - for (row_idx, row) in tabular.rows.iter().enumerate() { - for (col_idx, cell) in row.iter().enumerate() { - if cell.is_empty() { - continue; - } - for (def, ac, values) in &self.automata { - for mat in ac.find_iter(cell) { - let value = &values[mat.pattern().as_usize()]; - let entity = Entity::new( - def.category.clone(), - &def.entity_type, - value.as_str(), - DetectionMethod::Dictionary, - confidence, - EntityLocation::Tabular(TabularLocation { - row_index: row_idx, - column_index: col_idx, - start_offset: Some(mat.start()), - end_offset: Some(mat.end()), - }), - ) - .with_parent(&doc.source); - entities.push(entity); - } + // Tabular content matching + for doc in &tabular_docs { + for (row_idx, row) in doc.handler().rows().iter().enumerate() { + for (col_idx, cell) in row.iter().enumerate() { + if cell.is_empty() { + continue; + } + for (def, ac, values) in &self.automata { + for mat in ac.find_iter(cell) { + let value = &values[mat.pattern().as_usize()]; + let entity = Entity::new( + def.category.clone(), + &def.entity_type, + value.as_str(), + DetectionMethod::Dictionary, + confidence, + ) + .with_tabular_location(TabularLocation { + row_index: row_idx, + column_index: col_idx, + start_offset: Some(mat.start()), + end_offset: Some(mat.end()), + }) + .with_parent(&doc.source); + entities.push(entity); } } } diff --git a/crates/nvisy-pipeline/src/actions/detect_manual.rs b/crates/nvisy-pipeline/src/detection/manual.rs similarity index 77% rename from crates/nvisy-pipeline/src/actions/detect_manual.rs rename to crates/nvisy-pipeline/src/detection/manual.rs index 1dafc3a..80f89d2 100644 --- a/crates/nvisy-pipeline/src/actions/detect_manual.rs +++ b/crates/nvisy-pipeline/src/detection/manual.rs @@ -17,9 +17,7 @@ pub struct DetectManualParams {} /// Converts each inclusion [`Annotation`] into a full [`Entity`] with /// `DetectionMethod::Manual` and confidence 1.0. 
-pub struct DetectManualAction { - params: DetectManualParams, -} +pub struct DetectManualAction; #[async_trait::async_trait] impl Action for DetectManualAction { @@ -31,8 +29,8 @@ impl Action for DetectManualAction { "detect-manual" } - async fn connect(params: Self::Params) -> Result<Self, Error> { - Ok(Self { params }) + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) } async fn execute( @@ -54,19 +52,19 @@ impl Action for DetectManualAction { None => continue, }; let value = ann.value.clone().unwrap_or_default(); - let location = match &ann.location { - Some(l) => l.clone(), - None => continue, - }; - let entity = Entity::new( + let mut entity = Entity::new( category, entity_type, value, DetectionMethod::Manual, 1.0, - location, ); + entity.text_location = ann.text_location.clone(); + entity.image_location = ann.image_location.clone(); + entity.tabular_location = ann.tabular_location.clone(); + entity.audio_location = ann.audio_location.clone(); + entity.video_location = ann.video_location.clone(); entities.push(entity); } diff --git a/crates/nvisy-pipeline/src/detection/mod.rs b/crates/nvisy-pipeline/src/detection/mod.rs new file mode 100644 index 0000000..88c6648 --- /dev/null +++ b/crates/nvisy-pipeline/src/detection/mod.rs @@ -0,0 +1,20 @@ +//! Entity detection actions. +//! +//! Each sub-module exposes a single [`Action`](crate::action::Action) +//! that produces [`Entity`](nvisy_ontology::entity::Entity) values from +//! document content. + +/// Validates detected entities using checksum algorithms (e.g. Luhn). +pub mod checksum; +/// Computes a sensitivity classification for each blob based on detected entities. +pub mod classify; +/// Aho-Corasick dictionary-based entity detection. +pub mod dictionary; +/// Converts user-provided manual annotations into entities. +pub mod manual; +/// AI-powered named-entity recognition (text + image). +pub mod ner; +/// Scans document text with compiled regex patterns to detect PII/PHI entities. +pub mod regex; +/// Column-based rule matching for tabular data. +pub mod tabular; diff --git a/crates/nvisy-pipeline/src/detection/ner.rs b/crates/nvisy-pipeline/src/detection/ner.rs new file mode 100644 index 0000000..305e7c8 --- /dev/null +++ b/crates/nvisy-pipeline/src/detection/ner.rs @@ -0,0 +1,64 @@ +//! AI-powered named-entity recognition (NER) detection action. + +use serde::Deserialize; + +use nvisy_ingest::document::Document; +use nvisy_ingest::handler::TxtHandler; +use nvisy_ontology::entity::Entity; +use nvisy_core::error::Error; + +#[cfg(feature = "image-redaction")] +use nvisy_ingest::handler::PngHandler; + +use crate::action::Action; + +fn default_confidence() -> f64 { + 0.5 +} + +/// Typed parameters for [`DetectNerAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct DetectNerParams { + /// Entity types to detect (empty = all). + #[serde(default)] + pub entity_types: Vec<String>, + /// Minimum confidence score for returned entities. + #[serde(default = "default_confidence")] + pub confidence_threshold: f64, +} + +/// Typed input for [`DetectNerAction`]. +pub struct DetectNerInput { + /// Text documents to scan for named entities. + pub text_docs: Vec<Document<TxtHandler>>, + /// Image documents to scan for named entities (feature-gated). + #[cfg(feature = "image-redaction")] + pub image_docs: Vec<Document<PngHandler>>, +} + +/// AI NER detection stub — delegates to an NER model provider at runtime. 
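+///
+/// A minimal call sketch (illustrative; `input` is a `DetectNerInput` built
+/// from already-loaded documents, and the stub currently returns no entities):
+///
+/// ```ignore
+/// let ner = DetectNerAction::connect(DetectNerParams {
+///     entity_types: Vec::new(),
+///     confidence_threshold: 0.5,
+/// })
+/// .await?;
+/// let entities = ner.execute(input).await?;
+/// assert!(entities.is_empty());
+/// ```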
+pub struct DetectNerAction; + +#[async_trait::async_trait] +impl Action for DetectNerAction { + type Params = DetectNerParams; + type Input = DetectNerInput; + type Output = Vec<Entity>; + + fn id(&self) -> &str { + "detect-ner" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + _input: Self::Input, + ) -> Result<Vec<Entity>, Error> { + // Stub: real implementation will call an NER model provider. + Ok(Vec::new()) + } +} diff --git a/crates/nvisy-pipeline/src/actions/detect_regex.rs b/crates/nvisy-pipeline/src/detection/regex.rs similarity index 77% rename from crates/nvisy-pipeline/src/actions/detect_regex.rs rename to crates/nvisy-pipeline/src/detection/regex.rs index a1a21ca..79326c9 100644 --- a/crates/nvisy-pipeline/src/actions/detect_regex.rs +++ b/crates/nvisy-pipeline/src/detection/regex.rs @@ -3,9 +3,9 @@ use regex::Regex; use serde::Deserialize; -use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::handler::TxtHandler; use nvisy_ingest::document::Document; -use nvisy_ontology::entity::{DetectionMethod, Entity, EntityLocation, TextLocation}; +use nvisy_ontology::entity::{DetectionMethod, Entity, TextLocation}; use nvisy_core::error::Error; use nvisy_pattern::patterns::{self, PatternDefinition}; @@ -27,7 +27,7 @@ pub struct DetectRegexAction { #[async_trait::async_trait] impl Action for DetectRegexAction { type Params = DetectRegexParams; - type Input = Vec<Document<FormatHandler>>; + type Input = Vec<Document<TxtHandler>>; type Output = Vec<Entity>; fn id(&self) -> &str { @@ -55,13 +55,14 @@ impl Action for DetectRegexAction { let mut entities = Vec::new(); for doc in &documents { - let content = match doc.text() { - Some(c) => c, - None => continue, - }; + let lines = doc.handler().lines(); + let mut content = lines.join("\n"); + if doc.handler().trailing_newline() { + content.push('\n'); + } for (pattern, regex) in &compiled { - for mat in regex.find_iter(content) { + for mat in regex.find_iter(&content) { let value = mat.as_str(); if let Some(validate) = pattern.validate { @@ -80,15 +81,15 @@ impl Action for DetectRegexAction { value, DetectionMethod::Regex, pattern.confidence, - EntityLocation::Text(TextLocation { - start_offset: mat.start(), - end_offset: mat.end(), - context_start_offset: None, - context_end_offset: None, - element_id: None, - page_number: None, - }), ) + .with_text_location(TextLocation { + start_offset: mat.start(), + end_offset: mat.end(), + context_start_offset: None, + context_end_offset: None, + element_id: None, + page_number: None, + }) .with_parent(&doc.source); entities.push(entity); diff --git a/crates/nvisy-pipeline/src/actions/detect_tabular.rs b/crates/nvisy-pipeline/src/detection/tabular.rs similarity index 78% rename from crates/nvisy-pipeline/src/actions/detect_tabular.rs rename to crates/nvisy-pipeline/src/detection/tabular.rs index e0a7fa2..92bcfcf 100644 --- a/crates/nvisy-pipeline/src/actions/detect_tabular.rs +++ b/crates/nvisy-pipeline/src/detection/tabular.rs @@ -3,10 +3,10 @@ use regex::Regex; use serde::Deserialize; -use nvisy_ingest::handler::FormatHandler; +use nvisy_ingest::handler::CsvHandler; use nvisy_ingest::document::Document; use nvisy_ontology::entity::{ - DetectionMethod, Entity, EntityCategory, EntityLocation, TabularLocation, + DetectionMethod, Entity, EntityCategory, TabularLocation, }; use nvisy_core::error::{Error, ErrorKind}; @@ -35,14 +35,13 @@ pub struct DetectTabularParams { /// Matches column headers against rules and marks every 
non-empty cell /// in matched columns as an entity. pub struct DetectTabularAction { - params: DetectTabularParams, compiled_rules: Vec<(Regex, ColumnRule)>, } #[async_trait::async_trait] impl Action for DetectTabularAction { type Params = DetectTabularParams; - type Input = Vec<Document<FormatHandler>>; + type Input = Vec<Document<CsvHandler>>; type Output = Vec<Entity>; fn id(&self) -> &str { @@ -66,10 +65,7 @@ impl Action for DetectTabularAction { Ok((re, r.clone())) }) .collect::<Result<Vec<_>, Error>>()?; - Ok(Self { - params, - compiled_rules, - }) + Ok(Self { compiled_rules }) } async fn execute( @@ -79,18 +75,18 @@ impl Action for DetectTabularAction { let mut entities = Vec::new(); for doc in &documents { - let tabular = match doc.tabular() { - Some(t) => t, + let headers = match doc.handler().headers() { + Some(h) => h, None => continue, }; - for (col_idx, col_name) in tabular.columns.iter().enumerate() { + for (col_idx, col_name) in headers.iter().enumerate() { for (regex, rule) in &self.compiled_rules { if !regex.is_match(col_name) { continue; } - for (row_idx, row) in tabular.rows.iter().enumerate() { + for (row_idx, row) in doc.handler().rows().iter().enumerate() { if let Some(cell) = row.get(col_idx) { if cell.is_empty() { continue; @@ -102,13 +98,13 @@ impl Action for DetectTabularAction { cell.as_str(), DetectionMethod::Composite, 0.9, - EntityLocation::Tabular(TabularLocation { - row_index: row_idx, - column_index: col_idx, - start_offset: Some(0), - end_offset: Some(cell.len()), - }), ) + .with_tabular_location(TabularLocation { + row_index: row_idx, + column_index: col_idx, + start_offset: Some(0), + end_offset: Some(cell.len()), + }) .with_parent(&doc.source); entities.push(entity); diff --git a/crates/nvisy-pipeline/src/generation/mod.rs b/crates/nvisy-pipeline/src/generation/mod.rs new file mode 100644 index 0000000..be6c759 --- /dev/null +++ b/crates/nvisy-pipeline/src/generation/mod.rs @@ -0,0 +1,14 @@ +//! Content generation actions. +//! +//! Each sub-module exposes a single [`Action`](crate::action::Action) +//! that generates derived content (text, entities, or replacement values) +//! from documents. + +/// OCR text extraction from image documents. +#[cfg(feature = "image-redaction")] +pub mod ocr; +/// Synthetic replacement value generation for Synthesize redactions. +pub mod synthetic; +/// Speech-to-text transcription from audio documents. +#[cfg(feature = "audio-redaction")] +pub mod transcribe; diff --git a/crates/nvisy-pipeline/src/generation/ocr.rs b/crates/nvisy-pipeline/src/generation/ocr.rs new file mode 100644 index 0000000..9fbc022 --- /dev/null +++ b/crates/nvisy-pipeline/src/generation/ocr.rs @@ -0,0 +1,81 @@ +//! OCR text extraction action — generates text entities with bounding boxes +//! from image documents. + +use serde::Deserialize; + +use nvisy_ingest::document::Document; +use nvisy_ingest::handler::{PngHandler, TxtHandler}; +use nvisy_ontology::entity::Entity; +use nvisy_core::error::Error; + +use crate::action::Action; + +fn default_language() -> String { + "eng".into() +} + +fn default_engine() -> String { + "tesseract".into() +} + +fn default_confidence() -> f64 { + 0.5 +} + +/// Typed parameters for [`GenerateOcrAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct GenerateOcrParams { + /// OCR language code (ISO 639-3). + #[serde(default = "default_language")] + pub language: String, + /// OCR engine identifier. 
+ #[serde(default = "default_engine")] + pub engine: String, + /// Minimum confidence score for returned entities. + #[serde(default = "default_confidence")] + pub confidence_threshold: f64, +} + +/// Typed input for [`GenerateOcrAction`]. +pub struct GenerateOcrInput { + /// Image documents to extract text from. + pub image_docs: Vec<Document<PngHandler>>, +} + +/// Typed output for [`GenerateOcrAction`]. +pub struct GenerateOcrOutput { + /// Detected text entities with bounding-box locations. + pub entities: Vec<Entity>, + /// Extracted text as new text documents. + pub text_docs: Vec<Document<TxtHandler>>, +} + +/// OCR generation stub — delegates to an OCR engine provider at runtime. +pub struct GenerateOcrAction; + +#[async_trait::async_trait] +impl Action for GenerateOcrAction { + type Params = GenerateOcrParams; + type Input = GenerateOcrInput; + type Output = GenerateOcrOutput; + + fn id(&self) -> &str { + "generate-ocr" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + _input: Self::Input, + ) -> Result<GenerateOcrOutput, Error> { + // Stub: real implementation will call an OCR engine provider. + Ok(GenerateOcrOutput { + entities: Vec::new(), + text_docs: Vec::new(), + }) + } +} diff --git a/crates/nvisy-pipeline/src/generation/synthetic.rs b/crates/nvisy-pipeline/src/generation/synthetic.rs new file mode 100644 index 0000000..d6200b4 --- /dev/null +++ b/crates/nvisy-pipeline/src/generation/synthetic.rs @@ -0,0 +1,59 @@ +//! Synthetic data generation action — fills in realistic replacement values +//! for redactions marked with `Synthesize`. + +use serde::Deserialize; + +use nvisy_ontology::entity::Entity; +use nvisy_ontology::redaction::Redaction; +use nvisy_core::error::Error; + +use crate::action::Action; + +fn default_locale() -> String { + "en-US".into() +} + +/// Typed parameters for [`GenerateSyntheticAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct GenerateSyntheticParams { + /// BCP-47 locale for synthetic value generation. + #[serde(default = "default_locale")] + pub locale: String, +} + +/// Typed input for [`GenerateSyntheticAction`]. +pub struct GenerateSyntheticInput { + /// The entities whose redactions need synthetic values. + pub entities: Vec<Entity>, + /// The redaction instructions (some may have `Synthesize` outputs). + pub redactions: Vec<Redaction>, +} + +/// Synthetic data generation stub — fills `Synthesize` redaction outputs +/// with realistic replacement values at runtime. +pub struct GenerateSyntheticAction; + +#[async_trait::async_trait] +impl Action for GenerateSyntheticAction { + type Params = GenerateSyntheticParams; + type Input = GenerateSyntheticInput; + type Output = Vec<Redaction>; + + fn id(&self) -> &str { + "generate-synthetic" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + input: Self::Input, + ) -> Result<Vec<Redaction>, Error> { + // Stub: returns redactions unchanged. Real implementation will fill + // Synthesize variants with generated replacement values. + Ok(input.redactions) + } +} diff --git a/crates/nvisy-pipeline/src/generation/transcribe.rs b/crates/nvisy-pipeline/src/generation/transcribe.rs new file mode 100644 index 0000000..3620e1b --- /dev/null +++ b/crates/nvisy-pipeline/src/generation/transcribe.rs @@ -0,0 +1,77 @@ +//! Speech-to-text transcription action — generates text entities with audio +//! 
locations and transcript documents from audio input. + +use serde::Deserialize; + +use nvisy_ingest::document::Document; +use nvisy_ingest::handler::{WavHandler, TxtHandler}; +use nvisy_ontology::entity::Entity; +use nvisy_core::error::Error; + +use crate::action::Action; + +fn default_language() -> String { + "en".into() +} + +fn default_confidence() -> f64 { + 0.5 +} + +/// Typed parameters for [`GenerateTranscribeAction`]. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct GenerateTranscribeParams { + /// BCP-47 language tag for transcription. + #[serde(default = "default_language")] + pub language: String, + /// Whether to perform speaker diarization. + #[serde(default)] + pub enable_speaker_diarization: bool, + /// Minimum confidence score for returned entities. + #[serde(default = "default_confidence")] + pub confidence_threshold: f64, +} + +/// Typed input for [`GenerateTranscribeAction`]. +pub struct GenerateTranscribeInput { + /// Audio documents to transcribe. + pub audio_docs: Vec<Document<WavHandler>>, +} + +/// Typed output for [`GenerateTranscribeAction`]. +pub struct GenerateTranscribeOutput { + /// Detected entities with [`AudioLocation`](nvisy_ontology::entity::AudioLocation). + pub entities: Vec<Entity>, + /// Transcripts as new text documents. + pub text_docs: Vec<Document<TxtHandler>>, +} + +/// Speech-to-text stub — delegates to a transcription provider at runtime. +pub struct GenerateTranscribeAction; + +#[async_trait::async_trait] +impl Action for GenerateTranscribeAction { + type Params = GenerateTranscribeParams; + type Input = GenerateTranscribeInput; + type Output = GenerateTranscribeOutput; + + fn id(&self) -> &str { + "generate-transcribe" + } + + async fn connect(_params: Self::Params) -> Result<Self, Error> { + Ok(Self) + } + + async fn execute( + &self, + _input: Self::Input, + ) -> Result<GenerateTranscribeOutput, Error> { + // Stub: real implementation will call a speech-to-text provider. + Ok(GenerateTranscribeOutput { + entities: Vec::new(), + text_docs: Vec::new(), + }) + } +} diff --git a/crates/nvisy-pipeline/src/lib.rs b/crates/nvisy-pipeline/src/lib.rs index efe66b8..b5618c3 100644 --- a/crates/nvisy-pipeline/src/lib.rs +++ b/crates/nvisy-pipeline/src/lib.rs @@ -1,9 +1,10 @@ -//! Pipeline action/provider traits with detection and redaction actions. +//! Pipeline action/provider traits with detection, redaction, and generation actions. //! //! This crate consolidates the processing pipeline: the [`Action`] and -//! [`Provider`] traits, all detection actions (regex, dictionary, checksum, -//! tabular, manual), policy evaluation, text/image/tabular/PDF/audio -//! redaction, and audit-trail emission. +//! [`Provider`] traits, entity detection (regex, dictionary, checksum, +//! tabular, manual, NER), policy evaluation, content redaction +//! (text/image/tabular/audio), content generation (OCR, transcription, +//! synthetic data), and audit-trail emission. #![forbid(unsafe_code)] #![cfg_attr(docsrs, feature(doc_cfg))] @@ -12,11 +13,11 @@ pub mod action; /// The `Provider` trait — factory for authenticated client connections. pub mod provider; -/// Pipeline actions for detection, redaction, policy, and audit. -pub mod actions; -/// Image rendering primitives for redaction overlays. -#[cfg(feature = "image-redaction")] -pub mod render; - +/// Entity detection actions. +pub mod detection; +/// Redaction actions (policy evaluation, apply, audit). 
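// --- Illustrative aside (not part of the patch) ---------------------------
// Usage sketch for the generation parameter structs above: every field has a
// serde default, so an empty or partial camelCase JSON object deserializes
// cleanly. Assumes the `image-redaction` feature (which gates the OCR module)
// is enabled; the import path mirrors the module layout introduced here.
use nvisy_pipeline::generation::ocr::GenerateOcrParams;

fn parse_params() -> Result<(), serde_json::Error> {
    let params: GenerateOcrParams = serde_json::from_str(r#"{"language": "deu"}"#)?;
    assert_eq!(params.language, "deu");
    assert_eq!(params.engine, "tesseract");       // from default_engine()
    assert_eq!(params.confidence_threshold, 0.5); // from default_confidence()
    Ok(())
}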
+pub mod redaction; +/// Content generation actions (OCR, transcription, synthetic data). +pub mod generation; #[doc(hidden)] pub mod prelude; diff --git a/crates/nvisy-pipeline/src/prelude.rs b/crates/nvisy-pipeline/src/prelude.rs index 877431e..4387f8c 100644 --- a/crates/nvisy-pipeline/src/prelude.rs +++ b/crates/nvisy-pipeline/src/prelude.rs @@ -3,15 +3,29 @@ pub use crate::action::Action; pub use crate::provider::{ConnectedInstance, Provider}; -pub use crate::actions::detect_regex::{DetectRegexAction, DetectRegexParams}; -pub use crate::actions::detect_dictionary::{DetectDictionaryAction, DetectDictionaryParams, DictionaryDef}; -pub use crate::actions::detect_tabular::{DetectTabularAction, DetectTabularParams, ColumnRule}; -pub use crate::actions::detect_manual::{DetectManualAction, DetectManualParams}; -pub use crate::actions::detect_checksum::{DetectChecksumAction, DetectChecksumParams}; -pub use crate::actions::classify::{ClassifyAction, ClassificationResult}; -pub use crate::actions::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; -pub use crate::actions::apply_redaction::{ApplyRedactionAction, ApplyRedactionParams}; -pub use crate::actions::emit_audit::{EmitAuditAction, EmitAuditParams}; +pub use crate::detection::regex::{DetectRegexAction, DetectRegexParams}; +pub use crate::detection::dictionary::{DetectDictionaryAction, DetectDictionaryParams, DictionaryDef}; +pub use crate::detection::tabular::{DetectTabularAction, DetectTabularParams, ColumnRule}; +pub use crate::detection::manual::{DetectManualAction, DetectManualParams}; +pub use crate::detection::checksum::{DetectChecksumAction, DetectChecksumParams}; +pub use crate::detection::ner::{DetectNerAction, DetectNerParams, DetectNerInput}; +pub use crate::detection::classify::{ClassifyAction, ClassificationResult}; +pub use crate::redaction::evaluate_policy::{EvaluatePolicyAction, EvaluatePolicyParams}; +pub use crate::redaction::apply::{ + ApplyRedactionAction, ApplyRedactionParams, ApplyRedactionInput, ApplyRedactionOutput, +}; +pub use crate::redaction::emit_audit::{EmitAuditAction, EmitAuditParams}; +pub use crate::generation::synthetic::{ + GenerateSyntheticAction, GenerateSyntheticParams, GenerateSyntheticInput, +}; -#[cfg(feature = "pdf-redaction")] -pub use crate::actions::apply_pdf_redaction::{ApplyPdfRedactionAction, ApplyPdfRedactionParams}; +#[cfg(feature = "image-redaction")] +pub use crate::generation::ocr::{ + GenerateOcrAction, GenerateOcrParams, GenerateOcrInput, GenerateOcrOutput, +}; + +#[cfg(feature = "audio-redaction")] +pub use crate::generation::transcribe::{ + GenerateTranscribeAction, GenerateTranscribeParams, GenerateTranscribeInput, + GenerateTranscribeOutput, +}; diff --git a/crates/nvisy-pipeline/src/actions/apply_redaction.rs b/crates/nvisy-pipeline/src/redaction/apply.rs similarity index 53% rename from crates/nvisy-pipeline/src/actions/apply_redaction.rs rename to crates/nvisy-pipeline/src/redaction/apply.rs index f9c2abc..7927c9a 100644 --- a/crates/nvisy-pipeline/src/actions/apply_redaction.rs +++ b/crates/nvisy-pipeline/src/redaction/apply.rs @@ -4,9 +4,8 @@ use std::collections::HashMap; use uuid::Uuid; use serde::Deserialize; -use nvisy_ingest::handler::{FormatHandler, TxtHandler}; +use nvisy_ingest::handler::{TxtHandler, TxtData, CsvHandler}; use nvisy_ingest::document::Document; -use nvisy_ingest::document::data::*; use nvisy_ontology::entity::Entity; use nvisy_ontology::redaction::{Redaction, RedactionOutput, TextRedactionOutput}; use nvisy_core::error::Error; @@ -22,34 
+21,95 @@ use nvisy_ontology::redaction::ImageRedactionOutput; #[cfg(feature = "image-redaction")] use nvisy_core::error::ErrorKind; +#[cfg(feature = "audio-redaction")] +use nvisy_ingest::handler::WavHandler; + use crate::action::Action; /// Typed parameters for [`ApplyRedactionAction`]. #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ApplyRedactionParams { + /// Default mask character for text [`Mask`](nvisy_ontology::redaction::TextRedactionOutput::Mask) redactions. + #[serde(default = "default_mask_char")] + pub mask_char: char, /// Sigma value for gaussian blur (image redaction). + #[cfg(feature = "image-redaction")] #[serde(default = "default_sigma")] pub blur_sigma: f32, /// RGBA color for block overlays (image redaction). - #[serde(default = "default_color")] + #[cfg(feature = "image-redaction")] + #[serde(default = "default_block_color")] pub block_color: [u8; 4], + /// Pixel block size for pixelation/mosaic (image redaction). + #[cfg(feature = "image-redaction")] + #[serde(default = "default_pixelate_block_size")] + pub pixelate_block_size: u32, + /// Duration in seconds to crossfade at silence boundaries (audio redaction). + #[cfg(feature = "audio-redaction")] + #[serde(default = "default_crossfade_secs")] + pub crossfade_secs: f64, } +fn default_mask_char() -> char { + '*' +} +#[cfg(feature = "image-redaction")] fn default_sigma() -> f32 { 15.0 } -fn default_color() -> [u8; 4] { +#[cfg(feature = "image-redaction")] +fn default_block_color() -> [u8; 4] { [0, 0, 0, 255] } +#[cfg(feature = "image-redaction")] +fn default_pixelate_block_size() -> u32 { + 10 +} +#[cfg(feature = "audio-redaction")] +fn default_crossfade_secs() -> f64 { + 0.05 +} + +/// Typed input for [`ApplyRedactionAction`]. +pub struct ApplyRedactionInput { + /// Text documents to redact. + pub text_docs: Vec<Document<TxtHandler>>, + /// Image documents to redact (feature-gated). + #[cfg(feature = "image-redaction")] + pub image_docs: Vec<Document<PngHandler>>, + /// Audio documents to redact (feature-gated). + #[cfg(feature = "audio-redaction")] + pub audio_docs: Vec<Document<WavHandler>>, + /// Tabular documents to redact. + pub tabular_docs: Vec<Document<CsvHandler>>, + /// Detected entities referenced by redaction instructions. + pub entities: Vec<Entity>, + /// Redaction instructions to apply. + pub redactions: Vec<Redaction>, +} + +/// Typed output for [`ApplyRedactionAction`]. +pub struct ApplyRedactionOutput { + /// Redacted text documents. + pub text_docs: Vec<Document<TxtHandler>>, + /// Redacted image documents (feature-gated). + #[cfg(feature = "image-redaction")] + pub image_docs: Vec<Document<PngHandler>>, + /// Redacted audio documents (feature-gated). + #[cfg(feature = "audio-redaction")] + pub audio_docs: Vec<Document<WavHandler>>, + /// Redacted tabular documents. + pub tabular_docs: Vec<Document<CsvHandler>>, +} /// Applies pending [`Redaction`] instructions to document content. 
/// /// Dispatches per-document based on content type: /// - **Text documents**: byte-offset replacement /// - **Image documents**: blur/block overlay (feature-gated) +/// - **Audio documents**: stub pass-through (feature-gated) /// - **Tabular documents**: cell-level redaction -/// - **Audio documents**: pass-through with warning pub struct ApplyRedactionAction { params: ApplyRedactionParams, } @@ -67,8 +127,8 @@ struct PendingRedaction { #[async_trait::async_trait] impl Action for ApplyRedactionAction { type Params = ApplyRedactionParams; - type Input = (Vec<Document<FormatHandler>>, Vec<Entity>, Vec<Redaction>); - type Output = Vec<Document<FormatHandler>>; + type Input = ApplyRedactionInput; + type Output = ApplyRedactionOutput; fn id(&self) -> &str { "apply-redaction" @@ -81,58 +141,61 @@ impl Action for ApplyRedactionAction { async fn execute( &self, input: Self::Input, - ) -> Result<Vec<Document<FormatHandler>>, Error> { - let (documents, entities, redactions) = input; - + ) -> Result<Self::Output, Error> { let entity_map: HashMap<Uuid, &Entity> = - entities.iter().map(|e| (e.source.as_uuid(), e)).collect(); - let redaction_map: HashMap<Uuid, &Redaction> = redactions + input.entities.iter().map(|e| (e.source.as_uuid(), e)).collect(); + let redaction_map: HashMap<Uuid, &Redaction> = input.redactions .iter() .filter(|r| !r.applied) .map(|r| (r.entity_id, r)) .collect(); - let mut result_docs = Vec::new(); - - for doc in &documents { - // Tabular documents - if doc.tabular().is_some() { - let redacted = apply_tabular_doc(doc, &entities, &redaction_map); - result_docs.push(redacted); - continue; - } + // Text documents + let mut result_text = Vec::new(); + for doc in &input.text_docs { + let redacted = apply_text_doc(doc, &entity_map, &redaction_map, &self.params); + result_text.push(redacted); + } - // Image documents - #[cfg(feature = "image-redaction")] - if doc.image().is_some() { - let redacted = apply_image_doc( - doc, - &entities, - &redaction_map, - self.params.blur_sigma, - self.params.block_color, - )?; - result_docs.push(redacted); - continue; - } + // Image documents + #[cfg(feature = "image-redaction")] + let mut result_image = Vec::new(); + #[cfg(feature = "image-redaction")] + for doc in &input.image_docs { + let redacted = apply_image_doc( + doc, + &input.entities, + &redaction_map, + self.params.blur_sigma, + self.params.block_color, + )?; + result_image.push(redacted); + } - // Text documents (content present) - if let Some(content) = doc.text() { - let redacted = apply_text_doc( - doc, - content, - &entity_map, - &redaction_map, - ); - result_docs.push(redacted); - continue; - } + // Audio documents + #[cfg(feature = "audio-redaction")] + let mut result_audio = Vec::new(); + #[cfg(feature = "audio-redaction")] + for doc in &input.audio_docs { + let redacted = apply_audio_doc(doc); + result_audio.push(redacted); + } - // Fallback: pass through unchanged - result_docs.push(doc.clone()); + // Tabular documents + let mut result_tabular = Vec::new(); + for doc in &input.tabular_docs { + let redacted = apply_tabular_doc(doc, &input.entities, &redaction_map, &self.params); + result_tabular.push(redacted); } - Ok(result_docs) + Ok(ApplyRedactionOutput { + text_docs: result_text, + #[cfg(feature = "image-redaction")] + image_docs: result_image, + #[cfg(feature = "audio-redaction")] + audio_docs: result_audio, + tabular_docs: result_tabular, + }) } } @@ -141,11 +204,17 @@ impl Action for ApplyRedactionAction { // 
--------------------------------------------------------------------------- fn apply_text_doc( - doc: &Document<FormatHandler>, - content: &str, + doc: &Document<TxtHandler>, entity_map: &HashMap<Uuid, &Entity>, redaction_map: &HashMap<Uuid, &Redaction>, -) -> Document<FormatHandler> { + params: &ApplyRedactionParams, +) -> Document<TxtHandler> { + let lines = doc.handler().lines(); + let mut content = lines.join("\n"); + if doc.handler().trailing_newline() { + content.push('\n'); + } + let mut pending: Vec<PendingRedaction> = Vec::new(); for (entity_id, redaction) in redaction_map { @@ -160,20 +229,18 @@ fn apply_text_doc( continue; } - let start_offset = match entity.location.start_offset() { - Some(s) => s, - None => continue, - }; - let end_offset = match entity.location.end_offset() { - Some(e) => e, + let (start_offset, end_offset) = match &entity.text_location { + Some(loc) => (loc.start_offset, loc.end_offset), None => continue, }; - let replacement_value = redaction - .output - .replacement_value() - .unwrap_or("") - .to_string(); + let replacement_value = match redaction.output.replacement_value() { + Some(v) => v.to_string(), + None => { + let span_len = end_offset.saturating_sub(start_offset); + params.mask_char.to_string().repeat(span_len) + } + }; pending.push(PendingRedaction { start_offset, @@ -186,13 +253,16 @@ fn apply_text_doc( return doc.clone(); } - let redacted_content = apply_text_redactions(content, &mut pending); - let mut result = Document::new( - FormatHandler::Txt(TxtHandler), - DocumentData::Text(TextData { text: redacted_content }), - ); - result.source.set_parent_id(Some(doc.source.as_uuid())); + let redacted_content = apply_text_redactions(&content, &mut pending); + let trailing_newline = redacted_content.ends_with('\n'); + let new_lines: Vec<String> = redacted_content.lines().map(String::from).collect(); + let handler = TxtHandler::new(TxtData { + lines: new_lines, + trailing_newline, + }); + let mut result = Document::new(handler); + result.source.set_parent_id(Some(doc.source.as_uuid())); result } @@ -228,24 +298,22 @@ fn apply_text_redactions(text: &str, pending: &mut [PendingRedaction]) -> String #[cfg(feature = "image-redaction")] fn apply_image_doc( - doc: &Document<FormatHandler>, + doc: &Document<PngHandler>, entities: &[Entity], redaction_map: &HashMap<Uuid, &Redaction>, blur_sigma: f32, block_color: [u8; 4], -) -> Result<Document<FormatHandler>, Error> { - use crate::render::{blur, block}; +) -> Result<Document<PngHandler>, Error> { + use crate::redaction::render::{blur, block}; - let image_data = match doc.image() { - Some(d) => d, - None => return Ok(doc.clone()), - }; + let image_bytes = doc.handler().bytes(); let mut blur_regions: Vec<BoundingBox> = Vec::new(); let mut block_regions: Vec<BoundingBox> = Vec::new(); for entity in entities { - if let Some(bbox) = entity.location.bounding_box() { + if let Some(ref img_loc) = entity.image_location { + let bbox = &img_loc.bounding_box; if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { match &redaction.output { RedactionOutput::Image(ImageRedactionOutput::Blur { .. 
}) => { @@ -264,7 +332,7 @@ fn apply_image_doc( return Ok(doc.clone()); } - let dyn_img = image::load_from_memory(&image_data.bytes).map_err(|e| { + let dyn_img = image::load_from_memory(image_bytes).map_err(|e| { Error::new(ErrorKind::Runtime, format!("image decode failed: {e}")) })?; @@ -285,40 +353,39 @@ fn apply_image_doc( Error::new(ErrorKind::Runtime, format!("image encode failed: {e}")) })?; - let new_doc = Document::new( - FormatHandler::Png(PngHandler), - DocumentData::Image(ImageData { - bytes: Bytes::from(buf.into_inner()), - mime_type: "image/png".to_string(), - width: result.width(), - height: result.height(), - }), - ); - + let new_doc = Document::new(PngHandler::new(Bytes::from(buf.into_inner()))); Ok(new_doc) } +// --------------------------------------------------------------------------- +// Audio redaction (feature-gated) +// --------------------------------------------------------------------------- + +#[cfg(feature = "audio-redaction")] +fn apply_audio_doc(doc: &Document<WavHandler>) -> Document<WavHandler> { + tracing::warn!("audio redaction not yet implemented"); + doc.clone() +} + // --------------------------------------------------------------------------- // Tabular redaction // --------------------------------------------------------------------------- fn apply_tabular_doc( - doc: &Document<FormatHandler>, + doc: &Document<CsvHandler>, entities: &[Entity], redaction_map: &HashMap<Uuid, &Redaction>, -) -> Document<FormatHandler> { + params: &ApplyRedactionParams, +) -> Document<CsvHandler> { let mut result = doc.clone(); for entity in entities { - if let (Some(row_idx), Some(col_idx)) = - (entity.location.row_index(), entity.location.column_index()) - { + if let Some(ref tab_loc) = entity.tabular_location { + let (row_idx, col_idx) = (tab_loc.row_index, tab_loc.column_index); if let Some(redaction) = redaction_map.get(&entity.source.as_uuid()) { - if let Some(tabular) = result.tabular_mut() { - if let Some(row) = tabular.rows.get_mut(row_idx) { - if let Some(cell) = row.get_mut(col_idx) { - *cell = apply_cell_redaction(cell, &redaction.output); - } + if let Some(row) = result.handler_mut().rows_mut().get_mut(row_idx) { + if let Some(cell) = row.get_mut(col_idx) { + *cell = apply_cell_redaction(cell, &redaction.output, params.mask_char); } } } @@ -328,7 +395,7 @@ fn apply_tabular_doc( result } -fn apply_cell_redaction(cell: &str, output: &RedactionOutput) -> String { +fn apply_cell_redaction(cell: &str, output: &RedactionOutput, default_mask: char) -> String { match output { RedactionOutput::Text(TextRedactionOutput::Mask { mask_char, .. }) => { if cell.len() > 4 { @@ -345,7 +412,10 @@ fn apply_cell_redaction(cell: &str, output: &RedactionOutput) -> String { RedactionOutput::Text(TextRedactionOutput::Hash { .. }) => { format!("[HASH:{:x}]", hash_string(cell)) } - _ => output.replacement_value().unwrap_or("").to_string(), + _ => output + .replacement_value() + .map(|v| v.to_string()) + .unwrap_or_else(|| default_mask.to_string().repeat(cell.len())), } } diff --git a/crates/nvisy-pipeline/src/actions/emit_audit.rs b/crates/nvisy-pipeline/src/redaction/emit_audit.rs similarity index 56% rename from crates/nvisy-pipeline/src/actions/emit_audit.rs rename to crates/nvisy-pipeline/src/redaction/emit_audit.rs index c04e91c..0c9a7f1 100644 --- a/crates/nvisy-pipeline/src/actions/emit_audit.rs +++ b/crates/nvisy-pipeline/src/redaction/emit_audit.rs @@ -1,11 +1,13 @@ //! Audit trail emission action. 
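// --- Illustrative aside (not part of the patch) ---------------------------
// Companion sketch for the masking fallback in `redaction/apply.rs` above
// (assumed helper, not defined in the patch): when a redaction output carries
// no replacement value, the redacted span is filled with the configured mask
// character, one per original byte.
fn mask_span(mask_char: char, start_offset: usize, end_offset: usize) -> String {
    let span_len = end_offset.saturating_sub(start_offset);
    mask_char.to_string().repeat(span_len)
}

// e.g. mask_span('*', 10, 18) == "********"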
+use jiff::Timestamp; use serde::Deserialize; use uuid::Uuid; +use nvisy_core::error::Error; +use nvisy_core::path::ContentSource; use nvisy_ontology::audit::{Audit, AuditAction}; use nvisy_ontology::redaction::Redaction; -use nvisy_core::error::Error; use crate::action::Action; @@ -47,37 +49,24 @@ impl Action for EmitAuditAction { &self, redactions: Self::Input, ) -> Result<Vec<Audit>, Error> { - let run_id = self.params.run_id; - let actor = &self.params.actor; - let mut audits = Vec::new(); for redaction in &redactions { - let mut audit = Audit::new(AuditAction::Redaction) - .with_entity_id(redaction.entity_id) - .with_redaction_id(redaction.source.as_uuid()); - - if let Some(run_id) = run_id { - audit = audit.with_run_id(run_id); - } - if let Some(actor) = actor { - audit = audit.with_actor(actor); - } - - let mut details = serde_json::Map::new(); - details.insert( - "output".to_string(), - serde_json::to_value(&redaction.output).unwrap_or_default(), - ); - if let Some(rule_id) = redaction.policy_rule_id { - details.insert( - "policyRuleId".to_string(), - serde_json::Value::String(rule_id.to_string()), - ); - } - audit = audit.with_details(details); - - audit.source.set_parent_id(Some(redaction.source.as_uuid())); + let mut source = ContentSource::new(); + source.set_parent_id(Some(redaction.source.as_uuid())); + + let audit = Audit { + source, + action: AuditAction::Redaction, + timestamp: Timestamp::now(), + entity_id: Some(redaction.entity_id), + redaction_id: Some(redaction.source.as_uuid()), + policy_id: None, + source_id: None, + run_id: self.params.run_id, + actor: self.params.actor.clone(), + explanation: None, + }; audits.push(audit); } diff --git a/crates/nvisy-pipeline/src/actions/evaluate_policy.rs b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs similarity index 99% rename from crates/nvisy-pipeline/src/actions/evaluate_policy.rs rename to crates/nvisy-pipeline/src/redaction/evaluate_policy.rs index 7a68383..92e61d0 100644 --- a/crates/nvisy-pipeline/src/actions/evaluate_policy.rs +++ b/crates/nvisy-pipeline/src/redaction/evaluate_policy.rs @@ -101,9 +101,6 @@ impl Action for EvaluatePolicyAction { /// or `None` if no rule applies. fn find_matching_rule<'a>(entity: &Entity, rules: &'a [PolicyRule]) -> Option<&'a PolicyRule> { for rule in rules { - if !rule.enabled { - continue; - } if rule.selector.matches(&entity.category, &entity.entity_type, entity.confidence) { return Some(rule); } diff --git a/crates/nvisy-pipeline/src/redaction/mod.rs b/crates/nvisy-pipeline/src/redaction/mod.rs new file mode 100644 index 0000000..66ccd46 --- /dev/null +++ b/crates/nvisy-pipeline/src/redaction/mod.rs @@ -0,0 +1,14 @@ +//! Redaction actions. +//! +//! Each sub-module exposes a single [`Action`](crate::action::Action) +//! that evaluates, applies, or records redaction decisions. + +/// Applies pending redactions to document content (text, image, tabular, audio). +pub mod apply; +/// Image rendering primitives for redaction overlays. +#[cfg(feature = "image-redaction")] +pub mod render; +/// Emits audit trail records for every applied redaction. +pub mod emit_audit; +/// Evaluates policy rules against detected entities and produces redaction instructions. 
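// --- Illustrative aside (not part of the patch) ---------------------------
// With the `Audit` builder methods removed (see the ontology changes below),
// callers construct the record directly, as `emit_audit.rs` does above. A
// hedged sketch of that pattern; the helper name and arguments are assumed.
use jiff::Timestamp;
use nvisy_core::path::ContentSource;
use nvisy_ontology::audit::{Audit, AuditAction};
use uuid::Uuid;

fn redaction_audit(entity_id: Uuid, redaction_id: Uuid, run_id: Option<Uuid>) -> Audit {
    // Lineage: the audit record points back at the redaction that produced it.
    let mut source = ContentSource::new();
    source.set_parent_id(Some(redaction_id));

    Audit {
        source,
        action: AuditAction::Redaction,
        timestamp: Timestamp::now(),
        entity_id: Some(entity_id),
        redaction_id: Some(redaction_id),
        policy_id: None,
        source_id: None,
        run_id,
        actor: None,
        explanation: None,
    }
}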
+pub mod evaluate_policy; diff --git a/crates/nvisy-pipeline/src/render/block.rs b/crates/nvisy-pipeline/src/redaction/render/block.rs similarity index 100% rename from crates/nvisy-pipeline/src/render/block.rs rename to crates/nvisy-pipeline/src/redaction/render/block.rs diff --git a/crates/nvisy-pipeline/src/render/blur.rs b/crates/nvisy-pipeline/src/redaction/render/blur.rs similarity index 100% rename from crates/nvisy-pipeline/src/render/blur.rs rename to crates/nvisy-pipeline/src/redaction/render/blur.rs diff --git a/crates/nvisy-pipeline/src/render/mod.rs b/crates/nvisy-pipeline/src/redaction/render/mod.rs similarity index 100% rename from crates/nvisy-pipeline/src/render/mod.rs rename to crates/nvisy-pipeline/src/redaction/render/mod.rs From 3e9b4dd9068907897dcaddb2fe51765d31745521 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha <o.martsokha@gmail.com> Date: Sun, 15 Feb 2026 17:07:50 +0100 Subject: [PATCH 17/17] refactor: flatten Entity locations, slim Policy/Audit, add handler Clone and mutability - ontology: remove EntityLocation enum; replace with per-modality Optional fields on Entity (text_location, image_location, etc.) and Annotation. Add with_*_location builder methods. - ontology: remove Audit builder methods and details field; add RetentionPolicy::duration(). Add Policies collection type. - ontology: slim PolicyRule (drop name, description, enabled, context, metadata); remove Policy builder/find_matching_rule methods. - ingest: add bytes field + Clone to PngHandler; add Clone to TxtHandler/CsvHandler; add TxtHandler::new, CsvHandler::rows_mut; remove Document::page_number. - engine: use Policies instead of Vec<Policy>. - python: update NER/OCR bridges for new Entity location API. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --- Cargo.lock | 2 +- crates/nvisy-engine/src/engine.rs | 4 +- crates/nvisy-ingest/src/document/mod.rs | 10 --- crates/nvisy-ingest/src/handler/image/png.rs | 18 ++++- .../src/handler/text/csv_handler.rs | 7 +- .../src/handler/text/txt_handler.rs | 7 +- crates/nvisy-ontology/src/audit/mod.rs | 53 -------------- crates/nvisy-ontology/src/audit/retention.rs | 15 ++++ .../src/detection/annotation.rs | 20 +++++- crates/nvisy-ontology/src/entity/location.rs | 65 ----------------- crates/nvisy-ontology/src/entity/mod.rs | 56 +++++++++++++-- crates/nvisy-ontology/src/policy/mod.rs | 69 ++----------------- crates/nvisy-ontology/src/policy/rule.rs | 14 ---- crates/nvisy-ontology/src/prelude.rs | 4 +- crates/nvisy-python/src/ner/mod.rs | 20 +++--- crates/nvisy-python/src/ocr/mod.rs | 14 ++-- 16 files changed, 140 insertions(+), 238 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b67959a..c8c1ce5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2191,7 +2191,7 @@ dependencies = [ "bytes", "image", "imageproc", - "lopdf", + "jiff", "nvisy-core", "nvisy-ingest", "nvisy-ontology", diff --git a/crates/nvisy-engine/src/engine.rs b/crates/nvisy-engine/src/engine.rs index b7b1a54..b271095 100644 --- a/crates/nvisy-engine/src/engine.rs +++ b/crates/nvisy-engine/src/engine.rs @@ -12,7 +12,7 @@ use nvisy_core::error::Error; use nvisy_core::fs::ContentHandler; use nvisy_ontology::audit::Audit; use nvisy_ontology::detection::{ClassificationResult, DetectionResult}; -use nvisy_ontology::policy::{Policy, PolicyEvaluation}; +use nvisy_ontology::policy::{Policies, PolicyEvaluation}; use nvisy_ontology::redaction::RedactionSummary; use crate::compiler::graph::Graph; @@ -24,7 +24,7 @@ pub struct EngineInput { /// Handle to the managed directory containing the files to 
process. pub source: ContentHandler, /// Policies to apply (at least one). - pub policies: Vec<Policy>, + pub policies: Policies, /// Execution graph defining the pipeline DAG. pub graph: Graph, /// External service connections for source/target nodes. diff --git a/crates/nvisy-ingest/src/document/mod.rs b/crates/nvisy-ingest/src/document/mod.rs index c05c411..c74fb3e 100644 --- a/crates/nvisy-ingest/src/document/mod.rs +++ b/crates/nvisy-ingest/src/document/mod.rs @@ -17,8 +17,6 @@ use crate::handler::Handler; pub struct Document<H: Handler> { /// Content source identity and lineage. pub source: ContentSource, - /// 1-based page number this was extracted from. - pub page_number: Option<u32>, /// Format handler (holds the loaded data). handler: H, @@ -28,7 +26,6 @@ impl<H: Handler + Clone> Clone for Document<H> { fn clone(&self) -> Self { Self { source: self.source, - page_number: self.page_number, handler: self.handler.clone(), } } @@ -39,7 +36,6 @@ impl<H: Handler> Document<H> { pub fn new(handler: H) -> Self { Self { source: ContentSource::new(), - page_number: None, handler, } } @@ -59,12 +55,6 @@ impl<H: Handler> Document<H> { self.handler.document_type() } - /// Set the 1-based page number this was extracted from. - pub fn with_page_number(mut self, page: u32) -> Self { - self.page_number = Some(page); - self - } - /// Set this document's parent to the given content source. pub fn with_parent(mut self, content: &ContentData) -> Self { self.source.set_parent_id(Some(content.content_source.as_uuid())); diff --git a/crates/nvisy-ingest/src/handler/image/png.rs b/crates/nvisy-ingest/src/handler/image/png.rs index a62e210..a761862 100644 --- a/crates/nvisy-ingest/src/handler/image/png.rs +++ b/crates/nvisy-ingest/src/handler/image/png.rs @@ -1,5 +1,7 @@ //! PNG handler (stub — awaiting migration to Loader/Handler pattern). +use bytes::Bytes; + use nvisy_core::error::Error; use nvisy_ontology::entity::DocumentType; @@ -7,8 +9,20 @@ use crate::document::edit_stream::SpanEditStream; use crate::document::view_stream::SpanStream; use crate::handler::Handler; -#[derive(Debug)] -pub struct PngHandler; +#[derive(Debug, Clone)] +pub struct PngHandler { + pub(crate) bytes: Bytes, +} + +impl PngHandler { + pub fn new(bytes: Bytes) -> Self { + Self { bytes } + } + + pub fn bytes(&self) -> &Bytes { + &self.bytes + } +} #[async_trait::async_trait] impl Handler for PngHandler { diff --git a/crates/nvisy-ingest/src/handler/text/csv_handler.rs b/crates/nvisy-ingest/src/handler/text/csv_handler.rs index c270ea2..f05416f 100644 --- a/crates/nvisy-ingest/src/handler/text/csv_handler.rs +++ b/crates/nvisy-ingest/src/handler/text/csv_handler.rs @@ -75,7 +75,7 @@ pub struct CsvData { } /// Handler for loaded CSV content. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct CsvHandler { pub(crate) data: CsvData, } @@ -144,6 +144,11 @@ impl CsvHandler { &self.data.rows } + /// Mutable access to all data rows. + pub fn rows_mut(&mut self) -> &mut Vec<Vec<String>> { + &mut self.data.rows + } + /// A specific cell by (row, col). pub fn cell(&self, row: usize, col: usize) -> Option<&str> { self.data diff --git a/crates/nvisy-ingest/src/handler/text/txt_handler.rs b/crates/nvisy-ingest/src/handler/text/txt_handler.rs index 3e67c8b..d6c5932 100644 --- a/crates/nvisy-ingest/src/handler/text/txt_handler.rs +++ b/crates/nvisy-ingest/src/handler/text/txt_handler.rs @@ -37,7 +37,7 @@ pub struct TxtData { /// Handler for loaded plain-text content. /// /// Each line is independently addressable via [`TxtSpan`]. 
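// --- Illustrative aside (not part of the patch) ---------------------------
// Usage sketch for the owned-bytes `PngHandler` introduced above (assumed
// caller code): image redaction can now rebuild a document from re-encoded
// bytes instead of mutating shared data.
use bytes::Bytes;
use nvisy_ingest::document::Document;
use nvisy_ingest::handler::PngHandler;

fn png_document(encoded: Vec<u8>) -> Document<PngHandler> {
    let handler = PngHandler::new(Bytes::from(encoded));
    debug_assert!(!handler.bytes().is_empty());
    Document::new(handler)
}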
-#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TxtHandler { pub(crate) data: TxtData, } @@ -77,6 +77,11 @@ impl Handler for TxtHandler { } impl TxtHandler { + /// Create a new handler from parsed text data. + pub fn new(data: TxtData) -> Self { + Self { data } + } + /// All lines in the document. pub fn lines(&self) -> &[String] { &self.data.lines diff --git a/crates/nvisy-ontology/src/audit/mod.rs b/crates/nvisy-ontology/src/audit/mod.rs index 4e95a0d..fad2171 100644 --- a/crates/nvisy-ontology/src/audit/mod.rs +++ b/crates/nvisy-ontology/src/audit/mod.rs @@ -11,7 +11,6 @@ pub use retention::{RetentionPolicy, RetentionScope}; use jiff::Timestamp; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; use uuid::Uuid; use nvisy_core::path::ContentSource; @@ -71,56 +70,4 @@ pub struct Audit { /// Structured explainability metadata. #[serde(skip_serializing_if = "Option::is_none")] pub explanation: Option<Explanation>, - /// Additional unstructured details about the event. - #[serde(skip_serializing_if = "Option::is_none")] - pub details: Option<Map<String, Value>>, -} - -impl Audit { - /// Create a new audit record for the given action, timestamped to now. - pub fn new(action: AuditAction) -> Self { - Self { - source: ContentSource::new(), - action, - timestamp: Timestamp::now(), - entity_id: None, - redaction_id: None, - policy_id: None, - source_id: None, - run_id: None, - actor: None, - explanation: None, - details: None, - } - } - - /// Associate this audit entry with a detected entity. - pub fn with_entity_id(mut self, id: Uuid) -> Self { - self.entity_id = Some(id); - self - } - - /// Associate this audit entry with a redaction. - pub fn with_redaction_id(mut self, id: Uuid) -> Self { - self.redaction_id = Some(id); - self - } - - /// Associate this audit entry with a pipeline run. - pub fn with_run_id(mut self, id: Uuid) -> Self { - self.run_id = Some(id); - self - } - - /// Record the human or service account that triggered the event. - pub fn with_actor(mut self, actor: impl Into<String>) -> Self { - self.actor = Some(actor.into()); - self - } - - /// Attach additional unstructured details to this audit entry. - pub fn with_details(mut self, details: Map<String, Value>) -> Self { - self.details = Some(details); - self - } } diff --git a/crates/nvisy-ontology/src/audit/retention.rs b/crates/nvisy-ontology/src/audit/retention.rs index 8987eb6..612b636 100644 --- a/crates/nvisy-ontology/src/audit/retention.rs +++ b/crates/nvisy-ontology/src/audit/retention.rs @@ -1,5 +1,7 @@ //! Data retention policy types. +use std::time::Duration; + use serde::{Deserialize, Serialize}; /// What class of data a retention policy applies to. @@ -30,3 +32,16 @@ pub struct RetentionPolicy { #[serde(skip_serializing_if = "Option::is_none")] pub description: Option<String>, } + +impl RetentionPolicy { + /// Returns the retention duration, or `None` for indefinite retention. + /// + /// Returns [`Duration::ZERO`] when `zero_retention` is `true`. 
+ pub fn duration(&self) -> Option<Duration> { + if self.zero_retention { + return Some(Duration::ZERO); + } + self.max_duration_days + .map(|days| Duration::from_secs(days * 24 * 60 * 60)) + } +} diff --git a/crates/nvisy-ontology/src/detection/annotation.rs b/crates/nvisy-ontology/src/detection/annotation.rs index a241cc6..eb7b8ae 100644 --- a/crates/nvisy-ontology/src/detection/annotation.rs +++ b/crates/nvisy-ontology/src/detection/annotation.rs @@ -8,7 +8,9 @@ use serde::{Deserialize, Serialize}; -use crate::entity::{EntityCategory, EntityLocation}; +use crate::entity::{ + AudioLocation, EntityCategory, ImageLocation, TabularLocation, TextLocation, VideoLocation, +}; /// The kind of annotation applied to a content region. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -55,9 +57,21 @@ pub struct Annotation { /// The annotated text or value. #[serde(skip_serializing_if = "Option::is_none")] pub value: Option<String>, - /// Location of the annotated region. + /// Text location of the annotated region. #[serde(skip_serializing_if = "Option::is_none")] - pub location: Option<EntityLocation>, + pub text_location: Option<TextLocation>, + /// Image location of the annotated region. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_location: Option<ImageLocation>, + /// Tabular location of the annotated region. + #[serde(skip_serializing_if = "Option::is_none")] + pub tabular_location: Option<TabularLocation>, + /// Audio location of the annotated region. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_location: Option<AudioLocation>, + /// Video location of the annotated region. + #[serde(skip_serializing_if = "Option::is_none")] + pub video_location: Option<VideoLocation>, /// Classification labels attached to this annotation. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub labels: Vec<AnnotationLabel>, diff --git a/crates/nvisy-ontology/src/entity/location.rs b/crates/nvisy-ontology/src/entity/location.rs index 592b92f..d8a4ba9 100644 --- a/crates/nvisy-ontology/src/entity/location.rs +++ b/crates/nvisy-ontology/src/entity/location.rs @@ -1,6 +1,5 @@ //! Spatial and temporal location types for entity positions. -use derive_more::From; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -114,67 +113,3 @@ pub struct VideoLocation { pub speaker_id: Option<String>, } -/// Location of an entity within its source content. -/// -/// Each variant is specific to a content modality, carrying only the -/// fields that make sense for that modality. -#[derive(Debug, Clone, From, Serialize, Deserialize)] -#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] -#[serde(tag = "kind", rename_all = "snake_case")] -pub enum EntityLocation { - /// Entity found in text content (plain text, HTML, PDF text layer, etc.). - Text(TextLocation), - /// Entity found in an image. - Image(ImageLocation), - /// Entity found in a tabular data cell. - Tabular(TabularLocation), - /// Entity found in an audio stream. - Audio(AudioLocation), - /// Entity found in a video stream. - Video(VideoLocation), -} - -impl EntityLocation { - /// Text start offset, if this is a text or tabular location. - pub fn start_offset(&self) -> Option<usize> { - match self { - Self::Text(t) => Some(t.start_offset), - Self::Tabular(t) => t.start_offset, - _ => None, - } - } - - /// Text end offset, if this is a text or tabular location. 
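// --- Illustrative aside (not part of the patch) ---------------------------
// Usage sketch for `RetentionPolicy::duration()` above (assumed caller code):
// the helper collapses `zero_retention` and `max_duration_days` into a single
// `Option<Duration>`, where `None` still means "retain indefinitely".
use nvisy_ontology::audit::RetentionPolicy;

fn describe(policy: &RetentionPolicy) -> String {
    match policy.duration() {
        Some(d) if d.is_zero() => "purge immediately".into(),
        Some(d) => format!("retain for {} day(s)", d.as_secs() / 86_400),
        None => "retain indefinitely".into(),
    }
}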
- pub fn end_offset(&self) -> Option<usize> { - match self { - Self::Text(t) => Some(t.end_offset), - Self::Tabular(t) => t.end_offset, - _ => None, - } - } - - /// Bounding box, if this is an image or video location. - pub fn bounding_box(&self) -> Option<&BoundingBox> { - match self { - Self::Image(i) => Some(&i.bounding_box), - Self::Video(v) => Some(&v.bounding_box), - _ => None, - } - } - - /// Row index, if this is a tabular location. - pub fn row_index(&self) -> Option<usize> { - match self { - Self::Tabular(t) => Some(t.row_index), - _ => None, - } - } - - /// Column index, if this is a tabular location. - pub fn column_index(&self) -> Option<usize> { - match self { - Self::Tabular(t) => Some(t.column_index), - _ => None, - } - } -} diff --git a/crates/nvisy-ontology/src/entity/mod.rs b/crates/nvisy-ontology/src/entity/mod.rs index 3748bfc..88fe2d3 100644 --- a/crates/nvisy-ontology/src/entity/mod.rs +++ b/crates/nvisy-ontology/src/entity/mod.rs @@ -11,7 +11,7 @@ mod selector; pub use document::DocumentType; pub use location::{ - AudioLocation, BoundingBox, EntityLocation, ImageLocation, TabularLocation, + AudioLocation, BoundingBox, ImageLocation, TabularLocation, TextLocation, TimeSpan, VideoLocation, }; pub use model::{ModelInfo, ModelKind}; @@ -92,8 +92,21 @@ pub struct Entity { pub detection_method: DetectionMethod, /// Detection confidence score in the range `[0.0, 1.0]`. pub confidence: f64, - /// Where this entity was found in the source content. - pub location: EntityLocation, + /// Text location, if this entity was found in text content. + #[serde(skip_serializing_if = "Option::is_none")] + pub text_location: Option<TextLocation>, + /// Image location, if this entity was found in an image. + #[serde(skip_serializing_if = "Option::is_none")] + pub image_location: Option<ImageLocation>, + /// Tabular location, if this entity was found in tabular data. + #[serde(skip_serializing_if = "Option::is_none")] + pub tabular_location: Option<TabularLocation>, + /// Audio location, if this entity was found in audio. + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_location: Option<AudioLocation>, + /// Video location, if this entity was found in video. + #[serde(skip_serializing_if = "Option::is_none")] + pub video_location: Option<VideoLocation>, /// BCP-47 language tag of the detected content. #[serde(skip_serializing_if = "Option::is_none")] pub language: Option<String>, @@ -113,7 +126,6 @@ impl Entity { value: impl Into<String>, detection_method: DetectionMethod, confidence: f64, - location: EntityLocation, ) -> Self { Self { source: ContentSource::new(), @@ -122,13 +134,47 @@ impl Entity { value: value.into(), detection_method, confidence, - location, + text_location: None, + image_location: None, + tabular_location: None, + audio_location: None, + video_location: None, language: None, model: None, metadata: None, } } + /// Set a text location on this entity. + pub fn with_text_location(mut self, location: TextLocation) -> Self { + self.text_location = Some(location); + self + } + + /// Set an image location on this entity. + pub fn with_image_location(mut self, location: ImageLocation) -> Self { + self.image_location = Some(location); + self + } + + /// Set a tabular location on this entity. + pub fn with_tabular_location(mut self, location: TabularLocation) -> Self { + self.tabular_location = Some(location); + self + } + + /// Set an audio location on this entity. 
+ pub fn with_audio_location(mut self, location: AudioLocation) -> Self { + self.audio_location = Some(location); + self + } + + /// Set a video location on this entity. + pub fn with_video_location(mut self, location: VideoLocation) -> Self { + self.video_location = Some(location); + self + } + /// Set the parent source for lineage tracking. pub fn with_parent(mut self, parent: &ContentSource) -> Self { self.source = self.source.with_parent(parent); diff --git a/crates/nvisy-ontology/src/policy/mod.rs b/crates/nvisy-ontology/src/policy/mod.rs index 08820cd..35de77e 100644 --- a/crates/nvisy-ontology/src/policy/mod.rs +++ b/crates/nvisy-ontology/src/policy/mod.rs @@ -16,16 +16,12 @@ use semver::Version; use serde::{Deserialize, Serialize}; use uuid::Uuid; -use crate::entity::EntityCategory; -use crate::redaction::{RedactionSpec, TextRedactionSpec}; +use crate::redaction::RedactionSpec; /// A named redaction policy containing an ordered set of rules. /// /// Policies are pure configuration — they describe *what* to detect and /// *how* to handle it, independent of any specific content source. -/// -/// Evaluated by [`find_matching_rule`](Policy::find_matching_rule) -/// which returns the first matching enabled rule sorted by priority. #[derive(Debug, Clone, Serialize, Deserialize)] #[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] pub struct Policy { @@ -53,61 +49,10 @@ pub struct Policy { pub default_confidence_threshold: f64, } -impl Policy { - /// Create a new policy with the given name, version, and rules, using default - /// fallback spec ([`TextRedactionSpec::Mask`]) and threshold (0.5). - pub fn new( - name: impl Into<String>, - version: Version, - rules: Vec<PolicyRule>, - ) -> Self { - Self { - id: Uuid::new_v4(), - name: name.into(), - version, - description: None, - extends: None, - regulation: None, - rules, - default_spec: RedactionSpec::Text(TextRedactionSpec::Mask { mask_char: '*' }), - default_confidence_threshold: 0.5, - } - } - - /// Override the fallback redaction specification. - pub fn with_default_spec(mut self, spec: RedactionSpec) -> Self { - self.default_spec = spec; - self - } - - /// Override the fallback confidence threshold. - pub fn with_default_confidence_threshold(mut self, threshold: f64) -> Self { - self.default_confidence_threshold = threshold; - self - } - - /// Find the first matching enabled rule for a given entity. - /// - /// Rules are sorted by priority (ascending). A rule matches when it is - /// enabled and its [`EntitySelector`] matches the given entity properties. - pub fn find_matching_rule( - &self, - category: &EntityCategory, - entity_type: &str, - confidence: f64, - ) -> Option<&PolicyRule> { - let mut sorted: Vec<&PolicyRule> = self.rules.iter().collect(); - sorted.sort_by_key(|r| r.priority); - - for rule in sorted { - if !rule.enabled { - continue; - } - if rule.selector.matches(category, entity_type, confidence) { - return Some(rule); - } - } - - None - } +/// A collection of policies to apply during a pipeline run. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "jsonschema", derive(schemars::JsonSchema))] +pub struct Policies { + /// The policies to evaluate, in order. + pub policies: Vec<Policy>, } diff --git a/crates/nvisy-ontology/src/policy/rule.rs b/crates/nvisy-ontology/src/policy/rule.rs index 12b65e2..c7e1af5 100644 --- a/crates/nvisy-ontology/src/policy/rule.rs +++ b/crates/nvisy-ontology/src/policy/rule.rs @@ -4,7 +4,6 @@ //! based on entity categories, types, and confidence thresholds. 
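// --- Illustrative aside (not part of the patch) ---------------------------
// Caller-side sketch of the flattened location API: instead of wrapping an
// `EntityLocation` enum, detectors attach per-modality locations through the
// `with_*_location` builders shown above. The entity type and confidence
// below are hypothetical.
use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, TextLocation};

fn text_entity(category: EntityCategory, value: &str, start: usize, end: usize) -> Entity {
    Entity::new(category, "email_address", value, DetectionMethod::Regex, 0.85)
        .with_text_location(TextLocation {
            start_offset: start,
            end_offset: end,
            context_start_offset: None,
            context_end_offset: None,
            element_id: None,
            page_number: None,
        })
}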
use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; use uuid::Uuid; use crate::detection::SensitivityLevel; @@ -56,11 +55,6 @@ pub enum RuleKind { pub struct PolicyRule { /// Unique identifier for this rule. pub id: Uuid, - /// Human-readable name for display purposes. - pub name: String, - /// Description of the rule's purpose. - #[serde(skip_serializing_if = "Option::is_none")] - pub description: Option<String>, /// What this rule does when it matches. pub kind: RuleKind, /// Which entities this rule applies to. @@ -69,17 +63,9 @@ pub struct PolicyRule { pub spec: RedactionSpec, /// Template string for the replacement value (e.g. `"[REDACTED]"`). pub replacement_template: String, - /// Whether this rule is active. Disabled rules are skipped during evaluation. - pub enabled: bool, /// Evaluation priority (lower numbers are evaluated first). pub priority: i32, /// Additional conditions for this rule to apply. #[serde(skip_serializing_if = "Option::is_none")] pub conditions: Option<RuleCondition>, - /// Regulatory citation or notes explaining the rule. - #[serde(skip_serializing_if = "Option::is_none")] - pub context: Option<String>, - /// Additional unstructured metadata. - #[serde(skip_serializing_if = "Option::is_none")] - pub metadata: Option<Map<String, Value>>, } diff --git a/crates/nvisy-ontology/src/prelude.rs b/crates/nvisy-ontology/src/prelude.rs index 159ff5b..c2dc957 100644 --- a/crates/nvisy-ontology/src/prelude.rs +++ b/crates/nvisy-ontology/src/prelude.rs @@ -9,11 +9,11 @@ pub use crate::detection::{ }; pub use crate::entity::{ AudioLocation, BoundingBox, DetectionMethod, DocumentType, Entity, EntityCategory, - EntityLocation, EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, + EntitySelector, ImageLocation, ModelInfo, ModelKind, TabularLocation, TextLocation, TimeSpan, VideoLocation, }; pub use crate::policy::{ - Policy, PolicyEvaluation, PolicyRule, RegulationKind, RuleCondition, RuleKind, + Policies, Policy, PolicyEvaluation, PolicyRule, RegulationKind, RuleCondition, RuleKind, }; pub use crate::redaction::{ AudioRedactionMethod, AudioRedactionOutput, AudioRedactionSpec, ImageRedactionMethod, diff --git a/crates/nvisy-python/src/ner/mod.rs b/crates/nvisy-python/src/ner/mod.rs index a3639e3..e16d5fd 100644 --- a/crates/nvisy-python/src/ner/mod.rs +++ b/crates/nvisy-python/src/ner/mod.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; -use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, EntityLocation, TextLocation}; +use nvisy_ontology::entity::{DetectionMethod, Entity, EntityCategory, TextLocation}; use nvisy_core::error::Error; use crate::bridge::PythonBridge; use crate::error::from_pyerr; @@ -165,15 +165,15 @@ fn parse_python_entities(_py: Python<'_>, result: Bound<'_, PyAny>) -> Result<Ve value, DetectionMethod::Ner, confidence, - EntityLocation::Text(TextLocation { - start_offset, - end_offset, - context_start_offset: None, - context_end_offset: None, - element_id: None, - page_number: None, - }), - ); + ) + .with_text_location(TextLocation { + start_offset, + end_offset, + context_start_offset: None, + context_end_offset: None, + element_id: None, + page_number: None, + }); entities.push(entity); } diff --git a/crates/nvisy-python/src/ocr/mod.rs b/crates/nvisy-python/src/ocr/mod.rs index 7ad9b54..e2fd1a8 100644 --- a/crates/nvisy-python/src/ocr/mod.rs +++ b/crates/nvisy-python/src/ocr/mod.rs @@ -8,7 +8,7 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; use 
nvisy_ontology::entity::{ - BoundingBox, DetectionMethod, Entity, EntityCategory, EntityLocation, ImageLocation, + BoundingBox, DetectionMethod, Entity, EntityCategory, ImageLocation, }; use nvisy_core::error::Error; use crate::bridge::PythonBridge; @@ -133,12 +133,12 @@ fn parse_ocr_results(result: Bound<'_, PyAny>) -> Result<Vec<Entity>, Error> { &text, DetectionMethod::Ocr, confidence, - EntityLocation::Image(ImageLocation { - bounding_box: BoundingBox { x, y, width, height }, - image_id: None, - page_number: None, - }), - ); + ) + .with_image_location(ImageLocation { + bounding_box: BoundingBox { x, y, width, height }, + image_id: None, + page_number: None, + }); entities.push(entity); }
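// --- Illustrative aside (not part of the patch) ---------------------------
// Companion sketch to the OCR bridge above (assumed caller code): image
// entities use the same builder pattern, attaching an `ImageLocation` rather
// than a `TextLocation`. The "ocr_text" entity type is hypothetical.
use nvisy_ontology::entity::{
    BoundingBox, DetectionMethod, Entity, EntityCategory, ImageLocation,
};

fn ocr_entity(category: EntityCategory, text: &str, bbox: BoundingBox, confidence: f64) -> Entity {
    Entity::new(category, "ocr_text", text, DetectionMethod::Ocr, confidence)
        .with_image_location(ImageLocation {
            bounding_box: bbox,
            image_id: None,
            page_number: None,
        })
}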