From 5e62525760e63b4528096ab6754f56fe28e15844 Mon Sep 17 00:00:00 2001 From: QuartzLibrary <81446760+QuartzLibrary@users.noreply.github.com> Date: Fri, 3 Apr 2026 05:09:21 +0100 Subject: [PATCH 1/3] Update settings.json --- .vscode/settings.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 9463976..e85d955 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ { - "rust-analyzer.cargo.features": ["cli"] -} + "rust-analyzer.cargo.features": "all", +} \ No newline at end of file From 3748b29877863364ce1e38b1856ea26986c1b506 Mon Sep 17 00:00:00 2001 From: QuartzLibrary <81446760+QuartzLibrary@users.noreply.github.com> Date: Tue, 7 Apr 2026 01:50:42 +0100 Subject: [PATCH 2/3] READMEs --- README.md | 6 +- packages/python/README.md | 108 +++++++++++++++++++++++++++++++++ packages/python/pyproject.toml | 1 + schema_analysis/README.md | 8 ++- 4 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 packages/python/README.md diff --git a/README.md b/README.md index cf75067..409a3f6 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ our gymnast friend, serde. - Works with any self-describing format with a Serde implementation. - Suitable for large files. - Keeps track of some useful info for each type. -- Keeps track of null/normal/missing/duplicate values separately. +- Keeps track of null/missing/duplicate values separately. - Integrates with [Schemars](https://github.com/GREsau/schemars) and - [json_typegen](https://github.com/evestera/json_typegen) to produce types and json schema if needed. + [json_typegen](https://github.com/evestera/json_typegen) to produce types and a json schema if needed. - There's a demo website [here](https://schema-analysis.com/). 
### Installation @@ -42,7 +42,7 @@ cargo install schema_analysis --features cli --locked ### CLI Usage -The `schema_analysis` binary can infer schemas and generate types directly from the command line. +`schema_analysis` can infer schemas and generate types from data directly from the command line. ``` schema_analysis [OPTIONS] [FILES]... diff --git a/packages/python/README.md b/packages/python/README.md new file mode 100644 index 0000000..5fad526 --- /dev/null +++ b/packages/python/README.md @@ -0,0 +1,108 @@ +# schema_analysis + +## Universal-ish Schema Analysis + +Ever wished you could figure out what was in that json file? Or maybe it was xml... Ehr, yaml? +It was definitely toml. + +Alas, many great tools will only work with one of those formats, and the internet is not so +nice a place as to finally understand that no, xml is not an acceptable data format. + +Enter this neat little tool, a single interface to any self-describing format supported by +our gymnast friend, serde. + +### Features + +- Works with any self-describing format with a Serde implementation. +- Suitable for large files. +- Keeps track of some useful info for each type (opt out with --no-analysis). +- Keeps track of null/missing/duplicate values separately. +- Integrates with [Schemars](https://github.com/GREsau/schemars) and + [json_typegen](https://github.com/evestera/json_typegen) to produce types and a json schema if needed. +- There's a demo website [here](https://schema-analysis.com/). + +### Installation + +```bash +# Run without installing +uvx schema_analysis data.json +# or +pipx run schema_analysis data.json + +# Install +pip install schema_analysis +# or +uv tool install schema_analysis +# or +cargo install schema_analysis --features cli --locked +``` + +### CLI Usage + +`schema_analysis` can infer schemas and generate types from data directly from the command line. + +``` +schema_analysis [OPTIONS] [FILES]... 
+``` + +It auto-detects the input format from file extensions (`.json`, `.yaml`/`.yml`, `.xml`, `.toml`, `.cbor`, `.bson`) +and reads from stdin if no files are provided. + +**Options:** + +| Option | Description | Default | +| --- | --- | --- | +| `--format <FORMAT>` | Override input format (`json`, `yaml`, `xml`, `toml`, `cbor`, `bson`) | auto-detected | +| `--output <OUTPUT>` | Output mode (`schema`, `rust`, `typescript`, `typescript-alias`, `kotlin`, `kotlin-kotlinx`, `json-schema`, `shape`) | `schema` | +| `--name <NAME>` | Root type name for code generation | `Root` | +| `--compact` | Compact JSON output (no pretty printing) | | +| `--no-analysis` | Skip analysis info (counts, samples, min/max, etc.), outputting only the schema structure | | + +**Examples:** + +```bash +# Infer a schema from a JSON file +schema_analysis data.json + +# Generate Rust types +schema_analysis data.json --output rust --name MyData + +# Generate TypeScript interfaces +schema_analysis api.json --output typescript --name ApiResponse + +# Generate JSON Schema +schema_analysis data.json --output json-schema + +# Merge multiple files into a single schema +schema_analysis file1.json file2.json file3.json + +# Read from stdin +cat data.json | schema_analysis --format json +``` + +### Library Usage + +For use as a library, see the [Rust crate](https://crates.io/crates/schema_analysis/) or the [repo](https://github.com/QuartzLibrary/schema_analysis). + +### Performance + +> These are not proper benchmarks, but should give a vague idea of the performance on an i7-7700HQ laptop (2017) with the raw data already loaded into memory. 
+ +| Size | wasm (MB/s) | native (MB/s) | Format | File # | +| --------------------- | ------------ | ------------- | ------ | ------ | +| [~180MB] | ~20s (9) | ~5s (36) | json | 1 | +| [~650MB] | ~150s (4.3) | ~50s (13) | json | 1 | +| [~1.7GB] | ~470s (3.6) | ~145s (11.7) | json | 1 | +| [~2.1GB] | <sup>a</sup> | ~182s (11.5) | json | 1 | +| [~13.3GB]<sup>b</sup> | | ~810s (16.4) | xml | ~200k | + +<sup>a</sup> This one seems to go over some kind of browser limit when fetching the data in the Web Worker, I believe I would have to split large files to handle it. + +<sup>b</sup> ~2.7GB compressed. This one seems like it would be a worst-case scenario because it includes decompression overhead and the files had a section that was formatted text which resulted in crazy schemas. (The json pretty printed schema was almost 0.5GB!) + + +[~180MB]: https://github.com/zemirco/sf-city-lots-json/blob/master/citylots.json +[~650MB]: https://catalog.data.gov/dataset/forestry-planting-spaces +[~1.7GB]: https://catalog.data.gov/dataset/nys-thruway-origin-and-destination-points-for-all-vehicles-15-minute-intervals-2018-q4 +[~2.1GB]: https://catalog.data.gov/dataset/turnstile-usage-data-2016 +[~13.3GB]: https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/ diff --git a/packages/python/pyproject.toml b/packages/python/pyproject.toml index 720a009..f70173a 100644 --- a/packages/python/pyproject.toml +++ b/packages/python/pyproject.toml @@ -6,6 +6,7 @@ build-backend = "maturin" name = "schema_analysis" version = "0.6.0" description = "Infer schemas from JSON, YAML, XML, TOML, CBOR, and BSON" +readme = "README.md" license = { text = "MIT OR Apache-2.0" } requires-python = ">=3.8" authors = [{ name = "QuartzLibrary" }] diff --git a/schema_analysis/README.md b/schema_analysis/README.md index d9123f8..2747cc1 100644 --- a/schema_analysis/README.md +++ b/schema_analysis/README.md @@ -19,9 +19,9 @@ our gymnast friend, serde. - Works with any self-describing format with a Serde implementation. - Suitable for large files. 
- Keeps track of some useful info for each type. -- Keeps track of null/normal/missing/duplicate values separately. +- Keeps track of null/missing/duplicate values separately. - Integrates with [Schemars](https://github.com/GREsau/schemars) and - [json_typegen](https://github.com/evestera/json_typegen) to produce types and json schema if needed. + [json_typegen](https://github.com/evestera/json_typegen) to produce types and a json schema if needed. - There's a demo website [here](https://schema-analysis.com/). ### Usage @@ -52,6 +52,10 @@ Check [Schema](https://docs.rs/schema_analysis/latest/schema_analysis/enum.Schem to see what info you get, and [targets](https://github.com/QuartzLibrary/schema_analysis/blob/HEAD/schema_analysis/src/targets) to see the available integrations (which include code and json schema generation). +### CLI Usage + +You can use this crate as a CLI, more info in the [repo](https://github.com/QuartzLibrary/schema_analysis). + ### Advanced Usage I know, I know, the internet is evil and has decided to plague you with not one, but thousands, From 1e929c478b3eb926adb852246bd8b2b787b271c5 Mon Sep 17 00:00:00 2001 From: QuartzLibrary <81446760+QuartzLibrary@users.noreply.github.com> Date: Wed, 8 Apr 2026 17:53:04 +0100 Subject: [PATCH 3/3] Rename CLI flag --- README.md | 2 +- packages/python/README.md | 4 ++-- schema_analysis/src/main.rs | 4 ++-- schema_analysis/tests/cli.rs | 6 +++--- ...son_schema_no_analysis.json => json_schema_minimal.json} | 0 5 files changed, 8 insertions(+), 8 deletions(-) rename schema_analysis/tests/cli_fixtures/expected/{json_schema_no_analysis.json => json_schema_minimal.json} (100%) diff --git a/README.md b/README.md index 409a3f6..8a732c1 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ and reads from stdin if no files are provided. 
| `--output <OUTPUT>` | Output mode (`schema`, `rust`, `typescript`, `typescript-alias`, `kotlin`, `kotlin-kotlinx`, `json-schema`, `shape`) | `schema` | | `--name <NAME>` | Root type name for code generation | `Root` | | `--compact` | Compact JSON output (no pretty printing) | | -| `--no-analysis` | Skip analysis info (counts, samples, min/max, etc.), outputting only the schema structure | | +| `--minimal` | Skip analysis info (counts, samples, min/max, etc.), outputting only the schema structure | | **Examples:** diff --git a/packages/python/README.md b/packages/python/README.md index 5fad526..1b49ea5 100644 --- a/packages/python/README.md +++ b/packages/python/README.md @@ -15,7 +15,7 @@ our gymnast friend, serde. - Works with any self-describing format with a Serde implementation. - Suitable for large files. -- Keeps track of some useful info for each type (opt out with --no-analysis). +- Keeps track of some useful info for each type (opt out with --minimal). - Keeps track of null/missing/duplicate values separately. - Integrates with [Schemars](https://github.com/GREsau/schemars) and [json_typegen](https://github.com/evestera/json_typegen) to produce types and a json schema if needed. @@ -56,7 +56,7 @@ and reads from stdin if no files are provided. 
| `--output <OUTPUT>` | Output mode (`schema`, `rust`, `typescript`, `typescript-alias`, `kotlin`, `kotlin-kotlinx`, `json-schema`, `shape`) | `schema` | | `--name <NAME>` | Root type name for code generation | `Root` | | `--compact` | Compact JSON output (no pretty printing) | | -| `--no-analysis` | Skip analysis info (counts, samples, min/max, etc.), outputting only the schema structure | | +| `--minimal` | Skip analysis info (counts, samples, min/max, etc.), outputting only the schema structure | | **Examples:** diff --git a/schema_analysis/src/main.rs b/schema_analysis/src/main.rs index f6bced2..70d1a57 100644 --- a/schema_analysis/src/main.rs +++ b/schema_analysis/src/main.rs @@ -39,7 +39,7 @@ struct Cli { /// Only output the schema structure, without analysis info (counts, samples, min/max, etc.) #[arg(long)] - no_analysis: bool, + minimal: bool, } #[derive(Clone, Copy, PartialEq, Eq, clap::ValueEnum)] @@ -73,7 +73,7 @@ fn main() -> Result<()> { } let format = resolve_format(&cli)?; - let output = if cli.no_analysis { + let output = if cli.minimal { let mut schema = infer_schema::<()>(format, &cli.files)?; if format == InputFormat::Xml { cleanup_xml_schema(&mut schema); diff --git a/schema_analysis/tests/cli.rs b/schema_analysis/tests/cli.rs index e09d892..fa83e46 100644 --- a/schema_analysis/tests/cli.rs +++ b/schema_analysis/tests/cli.rs @@ -100,14 +100,14 @@ fn compact_flag() { } #[test] -fn no_analysis_flag() { +fn minimal_flag() { cmd() .arg(input("sample.json")) - .arg("--no-analysis") + .arg("--minimal") .assert() .success() .stdout(include_str!( - "cli_fixtures/expected/json_schema_no_analysis.json" + "cli_fixtures/expected/json_schema_minimal.json" )); } diff --git a/schema_analysis/tests/cli_fixtures/expected/json_schema_no_analysis.json b/schema_analysis/tests/cli_fixtures/expected/json_schema_minimal.json similarity index 100% rename from schema_analysis/tests/cli_fixtures/expected/json_schema_no_analysis.json rename to 
schema_analysis/tests/cli_fixtures/expected/json_schema_minimal.json