From 748336040e8882e4c8e0d46ef9782129a90e29d2 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Gauthier Date: Fri, 27 Mar 2026 21:16:02 -0400 Subject: [PATCH 1/2] Add unique_keys option to parse/2 for faster lookups When JSON is known to have no duplicate object keys, passing unique_keys: true uses sonic-rs native indexing instead of a reverse linear scan (rfind) for pointer lookups. The flag is stored on the ParsedDocument at parse time so all subsequent get/get_many/get_many_nil/length calls use the fast path automatically. Benchmarks show ~15-17% improvement on isolated get operations. Also splits benchmarks into separate parse and get sections to isolate lookup cost from parse cost, and updates benchmark results. --- README.md | 98 +++++++++++++++++++------------- bench/torque_bench.exs | 52 +++++++++++------ bench/torque_only_bench.exs | 39 ++++++++----- lib/torque.ex | 24 ++++++-- lib/torque/native.ex | 2 + native/torque_nif/src/decoder.rs | 82 +++++++++++++++++++------- native/torque_nif/src/lib.rs | 1 + test/pointer_test.exs | 31 ++++++++++ 8 files changed, 234 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 96c200b..306ea30 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,13 @@ results = Torque.get_many(doc, ["/id", "/site/domain", "/device/ip"]) # [{:ok, "req-1"}, {:ok, "example.com"}, {:ok, "1.2.3.4"}] ``` +When your JSON is known to have no duplicate object keys, pass `unique_keys: true` +for faster field lookups (uses sonic-rs internal indexing instead of linear scan): + +```elixir +{:ok, doc} = Torque.parse(json, unique_keys: true) +``` + ### Encoding ```elixir @@ -94,7 +101,7 @@ json = Torque.encode_to_iodata(%{id: "abc"}) |----------|-------------| | `Torque.decode(binary)` | Decode JSON to Elixir terms | | `Torque.decode!(binary)` | Decode JSON, raising on error | -| `Torque.parse(binary)` | Parse JSON into opaque document reference | +| `Torque.parse(binary, opts)` | Parse JSON into opaque document reference | | 
`Torque.get(doc, path)` | Extract field by JSON Pointer path | | `Torque.get(doc, path, default)` | Extract field with default for missing paths | | `Torque.get_many(doc, paths)` | Extract multiple fields in one NIF call | @@ -118,7 +125,7 @@ json = Torque.encode_to_iodata(%{id: "abc"}) | `true`, `false` | `true`, `false` | | `null` | `nil` | -For objects with duplicate keys, the last value wins. +For objects with duplicate keys, the last value wins (unless `unique_keys: true` is passed to `parse/2`). ### Elixir to JSON @@ -165,62 +172,73 @@ Apple M2 Pro, OTP 28, Elixir 1.19: | Library | ips | mean | median | p99 | memory | |---|---|---|---|---|---| -| **torque** | **257.2K** | **3.89 μs** | **3.67 μs** | **8.17 μs** | **1.56 KB** | -| **simdjsone** | 170.0K | 5.88 μs | 5.13 μs | 14.75 μs | 1.59 KB | -| **jiffy** | 146.1K | 6.85 μs | 6.00 μs | 17.08 μs | **1.56 KB** | -| **otp json** | 127.2K | 7.86 μs | 7.29 μs | 18.00 μs | 7.73 KB | -| **jason** | 107.7K | 9.29 μs | 8.71 μs | 18.08 μs | 9.54 KB | +| **torque** | **262.5K** | **3.81 μs** | **3.63 μs** | **7.83 μs** | **1.56 KB** | +| **simdjsone** | 182.7K | 5.47 μs | 5.13 μs | 11.88 μs | 1.59 KB | +| **jiffy** | 144.6K | 6.92 μs | 6.21 μs | 17.17 μs | **1.56 KB** | +| **otp json** | 129.6K | 7.72 μs | 7.21 μs | 16.50 μs | 7.73 KB | +| **jason** | 103.6K | 9.65 μs | 8.71 μs | 22.75 μs | 9.54 KB | ### Decode (750 KB Twitter) | Library | ips | mean | median | p99 | memory | |---|---|---|---|---|---| -| **torque** | **505.0** | **1.98 ms** | **1.82 ms** | **2.58 ms** | **1.56 KB** | -| **simdjsone** | 415.3 | 2.41 ms | 1.90 ms | 3.82 ms | **1.56 KB** | -| **otp json** | 182.5 | 5.48 ms | 5.45 ms | 6.58 ms | 2.49 MB | -| **jason** | 136.8 | 7.31 ms | 7.13 ms | 12.29 ms | 3.55 MB | -| **jiffy** | 100.7 | 9.93 ms | 10.01 ms | 11.91 ms | 5.53 MB | +| **torque** | **476.0** | **2.10 ms** | 1.87 ms | 4.73 ms | **1.56 KB** | +| **simdjsone** | 459.4 | 2.18 ms | **1.85 ms** | **3.20 ms** | **1.56 KB** | +| **otp json** | 
195.1 | 5.13 ms | 5.12 ms | 6.16 ms | 2.49 MB | +| **jason** | 142.0 | 7.04 ms | 6.91 ms | 11.47 ms | 3.55 MB | +| **jiffy** | 115.9 | 8.63 ms | 8.72 ms | 9.94 ms | 5.53 MB | ### Encode (1.2 KB OpenRTB) | Library | ips | mean | median | p99 | memory | |---|---|---|---|---|---| -| **torque** [proplist() :: binary()] | **1274.3K** | **0.78 μs** | **0.71 μs** | **0.92 μs** | 88 B | -| **torque** [proplist() :: iodata()] | 1261.4K | 0.79 μs | **0.71 μs** | 0.96 μs | **64 B** | -| **otp json** [map() :: iodata()] | 1078.0K | 0.93 μs | 0.88 μs | 1.38 μs | 3928 B | -| **torque** [map() :: iodata()] | 1064.8K | 0.94 μs | 0.88 μs | 1.13 μs | **64 B** | -| **torque** [map() :: binary()] | 1053.1K | 0.95 μs | 0.88 μs | 1.17 μs | 88 B | -| **jason** [map() :: iodata()] | 591.3K | 1.69 μs | 1.50 μs | 3.54 μs | 3848 B | -| **jiffy** [proplist() :: iodata()] | 579.2K | 1.73 μs | 1.50 μs | 2.13 μs | 120 B | -| **jiffy** [map() :: iodata()] | 498.1K | 2.01 μs | 1.83 μs | 2.50 μs | 824 B | -| **simdjsone** [proplist() :: iodata()] | 441.6K | 2.26 μs | 2.00 μs | 3.71 μs | 184 B | -| **jason** [map() :: binary()] | 399.7K | 2.50 μs | 2.33 μs | 4.21 μs | 3912 B | -| **simdjsone** [map() :: iodata()] | 386.7K | 2.59 μs | 2.38 μs | 4.54 μs | 888 B | +| **otp json** [map() :: iodata()] | **1091.6K** | **0.92 μs** | **0.83 μs** | 1.46 μs | 3928 B | +| **torque** [proplist() :: binary()] | 1073.6K | 0.93 μs | 0.88 μs | **1.13 μs** | 88 B | +| **torque** [proplist() :: iodata()] | 1069.3K | 0.94 μs | 0.88 μs | 1.17 μs | **64 B** | +| **torque** [map() :: binary()] | 917.5K | 1.09 μs | 1.00 μs | 1.33 μs | 88 B | +| **torque** [map() :: iodata()] | 914.6K | 1.09 μs | 1.00 μs | 1.42 μs | **64 B** | +| **jason** [map() :: iodata()] | 571.8K | 1.75 μs | 1.54 μs | 3.75 μs | 3848 B | +| **jiffy** [proplist() :: iodata()] | 518.4K | 1.93 μs | 1.67 μs | 2.75 μs | 120 B | +| **jiffy** [map() :: iodata()] | 427.6K | 2.34 μs | 2.08 μs | 4.33 μs | 824 B | +| **simdjsone** [proplist() :: iodata()] | 
415.4K | 2.41 μs | 2.21 μs | 3.96 μs | 184 B | +| **jason** [map() :: binary()] | 385.1K | 2.60 μs | 2.38 μs | 5.00 μs | 3912 B | +| **simdjsone** [map() :: iodata()] | 346.8K | 2.88 μs | 2.67 μs | 4.33 μs | 888 B | ### Encode (750 KB Twitter) | Library | ips | mean | median | p99 | memory | |---|---|---|---|---|---| -| **torque** [proplist() :: iodata()] | **1272.5** | **0.79 ms** | **0.76 ms** | **0.99 ms** | **64 B** | -| **torque** [proplist() :: binary()] | 1252.1 | 0.80 ms | 0.77 ms | 1.04 ms | 88 B | -| **torque** [map() :: iodata()] | 1102.9 | 0.91 ms | 0.89 ms | 1.09 ms | **64 B** | -| **torque** [map() :: binary()] | 1084.0 | 0.92 ms | 0.89 ms | 1.20 ms | 88 B | -| **jiffy** [proplist() :: iodata()] | 342.0 | 2.92 ms | 2.82 ms | 4.75 ms | 37.7 KB | -| **jiffy** [map() :: iodata()] | 287.1 | 3.48 ms | 3.32 ms | 4.29 ms | 1.06 MB | -| **simdjsone** [proplist() :: iodata()] | 259.7 | 3.85 ms | 3.78 ms | 5.79 ms | 37.7 KB | -| **jason** [map() :: iodata()] | 241.3 | 4.14 ms | 3.94 ms | 6.99 ms | 4.96 MB | -| **simdjsone** [map() :: iodata()] | 216.1 | 4.63 ms | 4.66 ms | 6.51 ms | 1.06 MB | -| **otp json** [map() :: iodata()] | 200.2 | 4.99 ms | 5.10 ms | 6.97 ms | 5.40 MB | -| **jason** [map() :: binary()] | 130.9 | 7.64 ms | 7.53 ms | 9.09 ms | 4.96 MB | - -### Parse + Get (5 fields) (1.2 KB OpenRTB) +| **torque** [proplist() :: iodata()] | **1026.4** | **0.97 ms** | **0.96 ms** | **1.18 ms** | **64 B** | +| **torque** [proplist() :: binary()] | 983.5 | 1.02 ms | 0.98 ms | 1.69 ms | 88 B | +| **torque** [map() :: binary()] | 918.5 | 1.09 ms | 1.08 ms | 1.31 ms | 88 B | +| **torque** [map() :: iodata()] | 905.4 | 1.10 ms | 1.09 ms | 1.35 ms | **64 B** | +| **jiffy** [proplist() :: iodata()] | 342.6 | 2.92 ms | 2.86 ms | 4.35 ms | 37.7 KB | +| **jiffy** [map() :: iodata()] | 270.8 | 3.69 ms | 3.53 ms | 5.94 ms | 1.06 MB | +| **jason** [map() :: iodata()] | 254.9 | 3.92 ms | 3.70 ms | 6.50 ms | 4.96 MB | +| **simdjsone** [proplist() :: iodata()] | 247.4 | 4.04 
ms | 3.98 ms | 5.63 ms | 37.7 KB | +| **otp json** [map() :: iodata()] | 246.9 | 4.05 ms | 4.13 ms | 5.64 ms | 5.40 MB | +| **simdjsone** [map() :: iodata()] | 210.5 | 4.75 ms | 4.78 ms | 5.41 ms | 1.06 MB | +| **jason** [map() :: binary()] | 141.1 | 7.09 ms | 7.02 ms | 8.40 ms | 4.96 MB | + +### Parse (1.2 KB OpenRTB) + +| Library | ips | mean | median | p99 | +|---|---|---|---|---| +| **torque** parse(unique_keys) | **596.6K** | **1.68 μs** | 1.33 μs | **3.13 μs** | +| **torque** parse | 579.2K | 1.73 μs | 1.33 μs | 3.88 μs | +| **simdjsone** parse | 364.9K | 2.74 μs | **1.17 μs** | 4.92 μs | + +### Get (5 fields) (1.2 KB OpenRTB) | Library | ips | mean | median | p99 | memory | |---|---|---|---|---|---| -| **torque** parse+get_many_nil | **455.8K** | **2.19 μs** | **1.75 μs** | **6.21 μs** | **288 B** | -| **torque** parse+get_many | 431.6K | 2.32 μs | 1.75 μs | 6.33 μs | 408 B | -| **torque** parse+get | 415.5K | 2.41 μs | 1.96 μs | 7.13 μs | 432 B | -| **simdjsone** parse+get | 353.8K | 2.83 μs | 1.71 μs | 7.25 μs | 408 B | +| **torque** get_many_nil (unique_keys) | **2.49M** | **402 ns** | **375 ns** | **500 ns** | **240 B** | +| **torque** get_many (unique_keys) | 2.37M | 422 ns | **375 ns** | **500 ns** | 360 B | +| **torque** get_many_nil | 2.16M | 463 ns | 458 ns | 583 ns | **240 B** | +| **torque** get_many | 2.07M | 483 ns | 458 ns | 584 ns | 360 B | +| **simdjsone** get | 1.77M | 564 ns | 458 ns | 1083 ns | 384 B | +| **torque** get (unique_keys) | 1.67M | 601 ns | 583 ns | 709 ns | 384 B | +| **torque** get | 1.50M | 669 ns | 625 ns | 792 ns | 384 B | Run benchmarks locally: diff --git a/bench/torque_bench.exs b/bench/torque_bench.exs index 49752b8..6f91003 100644 --- a/bench/torque_bench.exs +++ b/bench/torque_bench.exs @@ -399,27 +399,43 @@ Benchee.run( ] ++ ci_formatters ) -BenchGroup.set("Parse+Get (5 fields) — 1.2 KB OpenRTB") -IO.puts("\n=== PARSE + GET BENCHMARK ===\n") +{:ok, pre_doc} = Torque.parse(sample_json) +{:ok, pre_doc_uk} = 
Torque.parse(sample_json, unique_keys: true) +pre_ref = :simdjson.parse(sample_json) +BenchGroup.set("Parse — 1.2 KB OpenRTB") +IO.puts("\n=== PARSE BENCHMARK ===\n") + +# NOTE: simdjsone segfaults when Benchee measures memory (GC triggers a +# use-after-free in the NIF resource destructor), so memory_time is 0 here. Benchee.run( %{ - "simdjsone parse+get" => fn -> - ref = :simdjson.parse(sample_json) - for f <- fields, do: :simdjson.get(ref, f) - end, - "torque parse+get" => fn -> - {:ok, doc} = Torque.parse(sample_json) - for f <- fields, do: Torque.get(doc, f) - end, - "torque parse+get_many" => fn -> - {:ok, doc} = Torque.parse(sample_json) - Torque.get_many(doc, fields) - end, - "torque parse+get_many_nil" => fn -> - {:ok, doc} = Torque.parse(sample_json) - Torque.get_many_nil(doc, fields) - end + "simdjsone parse" => fn -> :simdjson.parse(sample_json) end, + "torque parse" => fn -> Torque.parse(sample_json) end, + "torque parse(unique_keys)" => fn -> Torque.parse(sample_json, unique_keys: true) end + }, + warmup: 2, + time: 5, + memory_time: 0, + percentiles: [50, 95, 99], + formatters: + [ + {Benchee.Formatters.Console, percentiles: [50, 95, 99]} + ] ++ ci_formatters +) + +BenchGroup.set("Get (5 fields) — 1.2 KB OpenRTB") +IO.puts("\n=== GET BENCHMARK ===\n") + +Benchee.run( + %{ + "simdjsone get" => fn -> for f <- fields, do: :simdjson.get(pre_ref, f) end, + "torque get" => fn -> for f <- fields, do: Torque.get(pre_doc, f) end, + "torque get_many" => fn -> Torque.get_many(pre_doc, fields) end, + "torque get_many_nil" => fn -> Torque.get_many_nil(pre_doc, fields) end, + "torque get (unique_keys)" => fn -> for f <- fields, do: Torque.get(pre_doc_uk, f) end, + "torque get_many (unique_keys)" => fn -> Torque.get_many(pre_doc_uk, fields) end, + "torque get_many_nil (unique_keys)" => fn -> Torque.get_many_nil(pre_doc_uk, fields) end }, warmup: 2, time: 5, diff --git a/bench/torque_only_bench.exs b/bench/torque_only_bench.exs index ce6de33..15b4919 100644 --- 
a/bench/torque_only_bench.exs +++ b/bench/torque_only_bench.exs @@ -339,22 +339,35 @@ Benchee.run( ] ) -IO.puts("\n=== PARSE + GET BENCHMARK ===\n") +IO.puts("\n=== PARSE BENCHMARK ===\n") Benchee.run( %{ - "torque parse+get" => fn -> - {:ok, doc} = Torque.parse(sample_json) - for f <- fields, do: Torque.get(doc, f) - end, - "torque parse+get_many" => fn -> - {:ok, doc} = Torque.parse(sample_json) - Torque.get_many(doc, fields) - end, - "torque parse+get_many_nil" => fn -> - {:ok, doc} = Torque.parse(sample_json) - Torque.get_many_nil(doc, fields) - end + "torque parse" => fn -> Torque.parse(sample_json) end, + "torque parse(unique_keys)" => fn -> Torque.parse(sample_json, unique_keys: true) end + }, + warmup: 2, + time: 5, + memory_time: 2, + percentiles: [50, 95, 99], + formatters: [ + {Benchee.Formatters.Console, percentiles: [50, 95, 99]} + ] +) + +{:ok, pre_doc} = Torque.parse(sample_json) +{:ok, pre_doc_uk} = Torque.parse(sample_json, unique_keys: true) + +IO.puts("\n=== GET BENCHMARK ===\n") + +Benchee.run( + %{ + "torque get" => fn -> for f <- fields, do: Torque.get(pre_doc, f) end, + "torque get_many" => fn -> Torque.get_many(pre_doc, fields) end, + "torque get_many_nil" => fn -> Torque.get_many_nil(pre_doc, fields) end, + "torque get (unique_keys)" => fn -> for f <- fields, do: Torque.get(pre_doc_uk, f) end, + "torque get_many (unique_keys)" => fn -> Torque.get_many(pre_doc_uk, fields) end, + "torque get_many_nil (unique_keys)" => fn -> Torque.get_many_nil(pre_doc_uk, fields) end }, warmup: 2, time: 5, diff --git a/lib/torque.ex b/lib/torque.ex index 041b95a..3bcc255 100644 --- a/lib/torque.ex +++ b/lib/torque.ex @@ -31,17 +31,33 @@ defmodule Torque do The returned reference can be passed to `get/2`, `get/3`, or `get_many/2` for efficient repeated field extraction without re-parsing. - Automatically uses a dirty CPU scheduler for inputs larger than 10 KB. 
+ ## Options + + * `:unique_keys` — when `true`, assumes object keys are unique and uses + a faster lookup path. Defaults to `false` (last-value-wins for + duplicate keys). + + Automatically uses a dirty CPU scheduler for inputs larger than 20 KB. """ - @spec parse(binary()) :: {:ok, reference()} | {:error, binary()} - def parse(json) when is_binary(json) and byte_size(json) > @timeslice_bytes do + @spec parse(binary(), keyword()) :: {:ok, reference()} | {:error, binary()} + def parse(json, opts \\ []) + + def parse(json, []) when is_binary(json) and byte_size(json) > @timeslice_bytes do Torque.Native.parse_dirty(json) end - def parse(json) when is_binary(json) do + def parse(json, []) when is_binary(json) do Torque.Native.parse(json) end + def parse(json, opts) when is_binary(json) and byte_size(json) > @timeslice_bytes do + Torque.Native.parse_opts_dirty(json, Keyword.get(opts, :unique_keys, false)) + end + + def parse(json, opts) when is_binary(json) do + Torque.Native.parse_opts(json, Keyword.get(opts, :unique_keys, false)) + end + @doc """ Extracts a value from a parsed document using a JSON Pointer path (RFC 6901). 
diff --git a/lib/torque/native.ex b/lib/torque/native.ex index 0605656..18a684e 100644 --- a/lib/torque/native.ex +++ b/lib/torque/native.ex @@ -29,6 +29,8 @@ defmodule Torque.Native do def parse(_json), do: :erlang.nif_error(:nif_not_loaded) def parse_dirty(_json), do: :erlang.nif_error(:nif_not_loaded) + def parse_opts(_json, _unique_keys), do: :erlang.nif_error(:nif_not_loaded) + def parse_opts_dirty(_json, _unique_keys), do: :erlang.nif_error(:nif_not_loaded) def get(_doc, _path), do: :erlang.nif_error(:nif_not_loaded) def get_many(_doc, _paths), do: :erlang.nif_error(:nif_not_loaded) def decode(_json), do: :erlang.nif_error(:nif_not_loaded) diff --git a/native/torque_nif/src/decoder.rs b/native/torque_nif/src/decoder.rs index 8ebd11f..fd57f76 100644 --- a/native/torque_nif/src/decoder.rs +++ b/native/torque_nif/src/decoder.rs @@ -18,19 +18,34 @@ fn timeslice_percent(bytes: usize) -> i32 { ((reds * 100 / REDUCTION_COUNT) as i32).clamp(1, 100) } -/// Returns the last value for `key` in an object, matching the last-value-wins -/// behaviour of `value_to_term` / `build_map_dedup` for duplicate keys. +/// Looks up `key` in an object. +/// +/// When `unique_keys` is true, uses sonic-rs's internal index (fast). +/// Otherwise, does a reverse linear scan so that the last value wins, +/// matching the duplicate-key behaviour of `value_to_term` / `build_map_dedup`. #[inline] -fn object_get_last<'v>(value: &'v sonic_rs::Value, key: &str) -> Option<&'v sonic_rs::Value> { - value - .as_object()? - .iter() - .rfind(|(k, _)| *k == key) - .map(|(_, v)| v) +fn object_get<'v>( + value: &'v sonic_rs::Value, + key: &str, + unique_keys: bool, +) -> Option<&'v sonic_rs::Value> { + if unique_keys { + value.get(key) + } else { + value + .as_object()? 
+ .iter() + .rfind(|(k, _)| *k == key) + .map(|(_, v)| v) + } } #[inline] -fn pointer_lookup<'v>(value: &'v sonic_rs::Value, path: &str) -> Option<&'v sonic_rs::Value> { +fn pointer_lookup<'v>( + value: &'v sonic_rs::Value, + path: &str, + unique_keys: bool, +) -> Option<&'v sonic_rs::Value> { let bytes = path.as_bytes(); if bytes.is_empty() { return Some(value); @@ -54,7 +69,7 @@ fn pointer_lookup<'v>(value: &'v sonic_rs::Value, path: &str) -> Option<&'v soni if segment.contains('~') { if segment.len() > 512 { let unescaped = segment.replace("~1", "/").replace("~0", "~"); - current = object_get_last(current, &unescaped)?; + current = object_get(current, &unescaped, unique_keys)?; } else { let bytes = segment.as_bytes(); let mut tmp = [0u8; 512]; @@ -87,25 +102,25 @@ fn pointer_lookup<'v>(value: &'v sonic_rs::Value, path: &str) -> Option<&'v soni } // SAFETY: input is valid UTF-8 &str; substitutions write only ASCII bytes let unescaped = unsafe { std::str::from_utf8_unchecked(&tmp[..out_len]) }; - current = object_get_last(current, unescaped)?; + current = object_get(current, unescaped, unique_keys)?; } } else { - current = object_get_last(current, segment)?; + current = object_get(current, segment, unique_keys)?; } } Some(current) } -fn do_parse(bytes: &[u8]) -> Result, String> { +fn do_parse(bytes: &[u8], unique_keys: bool) -> Result, String> { match sonic_rs::from_slice::(bytes) { - Ok(value) => Ok(ResourceArc::new(ParsedDocument { value })), + Ok(value) => Ok(ResourceArc::new(ParsedDocument { value, unique_keys })), Err(e) => Err(format!("{}", e)), } } #[rustler::nif] fn parse<'a>(env: Env<'a>, json: Binary) -> Term<'a> { - match do_parse(json.as_slice()) { + match do_parse(json.as_slice(), false) { Ok(resource) => { schedule::consume_timeslice(env, timeslice_percent(json.len())); make_tuple2(env, atoms::ok().as_c_arg(), resource.encode(env).as_c_arg()) @@ -120,7 +135,34 @@ fn parse<'a>(env: Env<'a>, json: Binary) -> Term<'a> { #[rustler::nif(schedule = 
"DirtyCpu")] fn parse_dirty<'a>(env: Env<'a>, json: Binary) -> Term<'a> { - match do_parse(json.as_slice()) { + match do_parse(json.as_slice(), false) { + Ok(resource) => make_tuple2(env, atoms::ok().as_c_arg(), resource.encode(env).as_c_arg()), + Err(reason) => make_tuple2( + env, + atoms::error().as_c_arg(), + reason.encode(env).as_c_arg(), + ), + } +} + +#[rustler::nif] +fn parse_opts<'a>(env: Env<'a>, json: Binary, unique_keys: bool) -> Term<'a> { + match do_parse(json.as_slice(), unique_keys) { + Ok(resource) => { + schedule::consume_timeslice(env, timeslice_percent(json.len())); + make_tuple2(env, atoms::ok().as_c_arg(), resource.encode(env).as_c_arg()) + } + Err(reason) => make_tuple2( + env, + atoms::error().as_c_arg(), + reason.encode(env).as_c_arg(), + ), + } +} + +#[rustler::nif(schedule = "DirtyCpu")] +fn parse_opts_dirty<'a>(env: Env<'a>, json: Binary, unique_keys: bool) -> Term<'a> { + match do_parse(json.as_slice(), unique_keys) { Ok(resource) => make_tuple2(env, atoms::ok().as_c_arg(), resource.encode(env).as_c_arg()), Err(reason) => make_tuple2( env, @@ -136,7 +178,7 @@ fn get<'a>(env: Env<'a>, doc: ResourceArc, path: &str) -> Term<' let err_raw = atoms::error().as_c_arg(); let nsf_raw = atoms::no_such_field().as_c_arg(); let ntd_raw = atoms::nesting_too_deep().as_c_arg(); - match pointer_lookup(&doc.value, path) { + match pointer_lookup(&doc.value, path, doc.unique_keys) { Some(value) => match value_to_term(env, value, MAX_DEPTH) { Some(term) => make_tuple2(env, ok_raw, term.as_c_arg()), None => make_tuple2(env, err_raw, ntd_raw), @@ -155,7 +197,7 @@ fn get_one_result( nsf_raw: ERL_NIF_TERM, ntd_raw: ERL_NIF_TERM, ) -> ERL_NIF_TERM { - match pointer_lookup(&doc.value, path) { + match pointer_lookup(&doc.value, path, doc.unique_keys) { Some(value) => match value_to_term(env, value, MAX_DEPTH) { Some(term) => make_tuple2(env, ok_raw, term.as_c_arg()).as_c_arg(), None => make_tuple2(env, err_raw, ntd_raw).as_c_arg(), @@ -229,7 +271,7 @@ fn 
get_many<'a>( #[rustler::nif] fn array_length<'a>(env: Env<'a>, doc: ResourceArc, path: &str) -> Term<'a> { - match pointer_lookup(&doc.value, path) { + match pointer_lookup(&doc.value, path, doc.unique_keys) { Some(value) if value.is_array() => { let len = value.as_array().unwrap().len(); unsafe { @@ -288,7 +330,7 @@ fn get_many_nil<'a>( } }; - let r = match pointer_lookup(&doc.value, path) { + let r = match pointer_lookup(&doc.value, path, doc.unique_keys) { Some(value) => match value_to_term(env, value, MAX_DEPTH) { Some(term) => term.as_c_arg(), None => nil_raw, diff --git a/native/torque_nif/src/lib.rs b/native/torque_nif/src/lib.rs index f670be8..8de8d9a 100644 --- a/native/torque_nif/src/lib.rs +++ b/native/torque_nif/src/lib.rs @@ -8,6 +8,7 @@ mod types; pub struct ParsedDocument { pub value: sonic_rs::Value, + pub unique_keys: bool, } #[rustler::resource_impl] diff --git a/test/pointer_test.exs b/test/pointer_test.exs index 3ba6b8a..55512ba 100644 --- a/test/pointer_test.exs +++ b/test/pointer_test.exs @@ -258,6 +258,37 @@ defmodule Torque.PointerTest do end end + describe "parse/2 unique_keys" do + test "uses fast lookup path" do + {:ok, doc} = Torque.parse(~s({"a":1,"b":2}), unique_keys: true) + assert {:ok, 1} = Torque.get(doc, "/a") + assert {:ok, 2} = Torque.get(doc, "/b") + assert {:error, :no_such_field} = Torque.get(doc, "/c") + end + + test "nested objects" do + {:ok, doc} = Torque.parse(~s({"x":{"y":"z"}}), unique_keys: true) + assert {:ok, "z"} = Torque.get(doc, "/x/y") + end + + test "get_many" do + {:ok, doc} = Torque.parse(~s({"a":1,"b":2}), unique_keys: true) + assert [{:ok, 1}, {:ok, 2}] = Torque.get_many(doc, ["/a", "/b"]) + end + + test "get_many_nil" do + {:ok, doc} = Torque.parse(~s({"a":1,"b":2}), unique_keys: true) + assert [1, 2, nil] = Torque.get_many_nil(doc, ["/a", "/b", "/c"]) + end + + test "dirty scheduler for large payload" do + large_map = Map.new(1..1000, fn i -> {"key_#{i}", String.duplicate("v", 20)} end) + json = 
Jason.encode!(large_map) + {:ok, doc} = Torque.parse(json, unique_keys: true) + assert {:ok, _} = Torque.get(doc, "/key_1") + end + end + describe "duplicate keys" do test "parse + get on object with duplicate keys - last value wins" do {:ok, doc} = Torque.parse(~s({"a":1,"b":2,"a":3})) From 08e6b15599b49fe92252b7babb6a55aa079a7a86 Mon Sep 17 00:00:00 2001 From: Louis-Philippe Gauthier Date: Fri, 27 Mar 2026 21:21:57 -0400 Subject: [PATCH 2/2] Improve hexdocs with groups, doctests, and examples Add docs/0 config to mix.exs with groups_for_docs to organize functions into Decoding, Encoding, and Parse + Get sections. Add doctests to every public function and a doctest runner. Rewrite @moduledoc with type conversion table and structured sections. Reorder functions to match their doc groups. --- lib/torque.ex | 305 ++++++++++++++++++++++++++++++---------------- mix.exs | 14 +++ test/doc_test.exs | 5 + 3 files changed, 218 insertions(+), 106 deletions(-) create mode 100644 test/doc_test.exs diff --git a/lib/torque.ex b/lib/torque.ex index 3bcc255..4fe6c29 100644 --- a/lib/torque.ex +++ b/lib/torque.ex @@ -2,34 +2,168 @@ defmodule Torque do @moduledoc """ High-performance JSON library powered by sonic-rs via Rustler NIFs. - Provides two decoding strategies: + ## Decoding strategies - * **Parse + Get** — `parse/1` followed by `get/2,3` or `get_many/2` for - selective field extraction via JSON Pointer (RFC 6901) paths. Ideal when - only a subset of fields is needed. + * **Parse + Get** — `parse/2` returns an opaque document reference. + `get/2`, `get/3`, `get_many/2`, and `get_many_nil/2` extract fields + by JSON Pointer (RFC 6901) paths without materializing the full + Elixir term tree. Ideal when only a subset of fields is needed. - * **Full decode** — `decode/1` converts an entire JSON binary into Elixir - terms in one pass. + * **Full decode** — `decode/1` converts an entire JSON binary into + Elixir terms in one pass. 
- And encoding: + ## Encoding - * `encode/1` serializes Elixir terms to JSON. Supports maps (atom or binary - keys), lists, binaries, numbers, booleans, `nil`, and jiffy-style - `{proplist}` tuples. + `encode/1` serializes Elixir terms to JSON. Supports maps (atom or + binary keys), lists, binaries, numbers, booleans, `nil`, and + jiffy-style `{proplist}` tuples. - Inputs larger than 20 KB are automatically scheduled on a dirty CPU scheduler - to avoid blocking normal BEAM schedulers. + ## Scheduler awareness + + Inputs larger than 20 KB are automatically dispatched to a dirty CPU + scheduler to avoid blocking normal BEAM schedulers. + + ## Type conversion + + | JSON | Elixir | + |------|--------| + | object | map with binary keys | + | array | list | + | string | binary | + | integer | integer | + | float | float | + | `true` / `false` | `true` / `false` | + | `null` | `nil` | + + For objects with duplicate keys, the last value wins (unless + `unique_keys: true` is passed to `parse/2`). """ @timeslice_bytes 20_480 # --- Decoding --- + @doc """ + Decodes a JSON binary into Elixir terms. + + JSON objects become maps with binary keys, arrays become lists, strings + become binaries, numbers become integers or floats, booleans become + `true`/`false`, and `null` becomes `nil`. + + Automatically uses a dirty CPU scheduler for inputs larger than 20 KB. + + ## Examples + + iex> Torque.decode(~s({"a":1,"b":"hello"})) + {:ok, %{"a" => 1, "b" => "hello"}} + + iex> Torque.decode(~s([1,2,3])) + {:ok, [1, 2, 3]} + + iex> match?({:error, _}, Torque.decode("invalid")) + true + """ + @doc group: :decode + @spec decode(binary()) :: {:ok, term()} | {:error, binary() | :nesting_too_deep} + def decode(json) when is_binary(json) and byte_size(json) > @timeslice_bytes do + Torque.Native.decode_dirty(json) + end + + def decode(json) when is_binary(json) do + Torque.Native.decode(json) + end + + @doc """ + Decodes a JSON binary into Elixir terms, raising on error. 
+ + ## Examples + + iex> Torque.decode!(~s({"a":1})) + %{"a" => 1} + """ + @doc group: :decode + @spec decode!(binary()) :: term() + def decode!(json) when is_binary(json) do + case decode(json) do + {:ok, term} -> term + {:error, reason} -> raise ArgumentError, "decode error: #{reason}" + end + end + + # --- Encoding --- + + @doc """ + Encodes an Elixir term into a JSON binary. + + ## Supported terms + + * Maps with atom or binary keys + * Lists (JSON arrays) + * Binaries (JSON strings) + * Integers and floats + * `true`, `false`, `nil` (JSON `null`) + * Other atoms (encoded as JSON strings) + * `{keyword_list}` tuples (jiffy-style proplist objects) + + ## Examples + + iex> Torque.encode(%{id: "abc", price: 1.5}) + {:ok, ~s({"id":"abc","price":1.5})} + + iex> Torque.encode({[{:id, "abc"}]}) + {:ok, ~s({"id":"abc"})} + """ + @doc group: :encode + @spec encode(term()) :: {:ok, binary()} | {:error, binary() | :nesting_too_deep} + def encode(term) do + Torque.Native.encode(term) + end + + @doc """ + Encodes an Elixir term into a JSON binary, raising on error. + + ## Examples + + iex> Torque.encode!(%{ok: true}) + ~s({"ok":true}) + """ + @doc group: :encode + @spec encode!(term()) :: binary() + def encode!(term) do + case encode(term) do + {:ok, json} -> json + {:error, reason} -> raise ArgumentError, "encode error: #{reason}" + end + end + + @doc """ + Encodes an Elixir term into a JSON binary (iodata-compatible). + + Returns the binary directly without `{:ok, ...}` tuple wrapping. + Raises on error. This is the fastest encoding path when the result + is passed directly to I/O (e.g. as an HTTP response body). 
+ + ## Examples + + iex> Torque.encode_to_iodata(%{ok: true}) + ~s({"ok":true}) + """ + @doc group: :encode + @spec encode_to_iodata(term()) :: binary() + def encode_to_iodata(term) do + Torque.Native.encode_iodata(term) + catch + :error, value -> raise ArgumentError, "encode error: #{inspect(value)}" + end + + # --- Parse + Get --- + @doc """ Parses a JSON binary into an opaque document reference. - The returned reference can be passed to `get/2`, `get/3`, or `get_many/2` - for efficient repeated field extraction without re-parsing. + The returned reference can be passed to `get/2`, `get/3`, `get_many/2`, + `get_many_nil/2`, or `length/2` for efficient repeated field extraction + without re-parsing. ## Options @@ -38,7 +172,18 @@ defmodule Torque do duplicate keys). Automatically uses a dirty CPU scheduler for inputs larger than 20 KB. + + ## Examples + + iex> {:ok, doc} = Torque.parse(~s({"a":1})) + iex> is_reference(doc) + true + + iex> {:ok, doc} = Torque.parse(~s({"a":1}), unique_keys: true) + iex> Torque.get(doc, "/a") + {:ok, 1} """ + @doc group: :parse_get @spec parse(binary(), keyword()) :: {:ok, reference()} | {:error, binary()} def parse(json, opts \\ []) @@ -62,14 +207,19 @@ defmodule Torque do Extracts a value from a parsed document using a JSON Pointer path (RFC 6901). Paths must start with `"/"`. Array elements are addressed by index - (e.g. `"/imp/0/banner/w"`). + (e.g. `"/imp/0/banner/w"`). An empty path `""` returns the root value. 
## Examples - {:ok, doc} = Torque.parse(~s({"site":{"domain":"example.com"}})) - {:ok, "example.com"} = Torque.get(doc, "/site/domain") - {:error, :no_such_field} = Torque.get(doc, "/missing") + iex> {:ok, doc} = Torque.parse(~s({"site":{"domain":"example.com"}})) + iex> Torque.get(doc, "/site/domain") + {:ok, "example.com"} + + iex> {:ok, doc} = Torque.parse(~s({"site":{"domain":"example.com"}})) + iex> Torque.get(doc, "/missing") + {:error, :no_such_field} """ + @doc group: :parse_get @spec get(reference(), binary()) :: {:ok, term()} | {:error, :no_such_field | :nesting_too_deep} def get(doc, path) when is_reference(doc) and is_binary(path) do @@ -80,12 +230,20 @@ defmodule Torque do Extracts a value from a parsed document, returning `default` when the path does not exist. + Raises `ArgumentError` for errors other than `:no_such_field` + (e.g. `:nesting_too_deep`). + ## Examples - {:ok, doc} = Torque.parse(~s({"a":1})) - 1 = Torque.get(doc, "/a", nil) - nil = Torque.get(doc, "/b", nil) + iex> {:ok, doc} = Torque.parse(~s({"a":1})) + iex> Torque.get(doc, "/a", nil) + 1 + + iex> {:ok, doc} = Torque.parse(~s({"a":1})) + iex> Torque.get(doc, "/b", :default) + :default """ + @doc group: :parse_get @compile {:inline, get: 3} @spec get(reference(), binary(), term()) :: term() def get(doc, path, default) when is_reference(doc) and is_binary(path) do @@ -102,15 +260,16 @@ defmodule Torque do Returns a list of results in the same order as `paths`, each being `{:ok, value}` or `{:error, :no_such_field}`. - This is more efficient than calling `get/2` in a loop because it crosses + More efficient than calling `get/2` in a loop because it crosses the NIF boundary only once. 
## Examples - {:ok, doc} = Torque.parse(~s({"a":1,"b":2})) - [{:ok, 1}, {:ok, 2}, {:error, :no_such_field}] = - Torque.get_many(doc, ["/a", "/b", "/c"]) + iex> {:ok, doc} = Torque.parse(~s({"a":1,"b":2})) + iex> Torque.get_many(doc, ["/a", "/b", "/c"]) + [{:ok, 1}, {:ok, 2}, {:error, :no_such_field}] """ + @doc group: :parse_get @spec get_many(reference(), [binary()]) :: [{:ok, term()} | {:error, :no_such_field | :nesting_too_deep}] def get_many(doc, paths) when is_reference(doc) and is_list(paths) do @@ -118,19 +277,22 @@ defmodule Torque do end @doc """ - Extracts multiple values from a parsed document, returning `nil` for missing fields. + Extracts multiple values from a parsed document, returning `nil` for missing + fields. Like `get_many/2` but returns bare values instead of `{:ok, value}` tuples. Missing fields return `nil` (indistinguishable from JSON `null`). - This is faster than `get_many/2` when you don't need to distinguish between + Faster than `get_many/2` when you don't need to distinguish between missing fields and null values, as it avoids allocating wrapper tuples. 
## Examples - {:ok, doc} = Torque.parse(~s({"a":1,"b":null})) - [1, nil, nil] = Torque.get_many_nil(doc, ["/a", "/b", "/c"]) + iex> {:ok, doc} = Torque.parse(~s({"a":1,"b":null})) + iex> Torque.get_many_nil(doc, ["/a", "/b", "/c"]) + [1, nil, nil] """ + @doc group: :parse_get @spec get_many_nil(reference(), [binary()]) :: [term()] def get_many_nil(doc, paths) when is_reference(doc) and is_list(paths) do Torque.Native.get_many_nil(doc, paths) @@ -142,86 +304,17 @@ defmodule Torque do ## Examples - {:ok, doc} = Torque.parse(~s({"a":[1,2,3]})) - 3 = Torque.length(doc, "/a") - nil = Torque.length(doc, "/missing") + iex> {:ok, doc} = Torque.parse(~s({"a":[1,2,3]})) + iex> Torque.length(doc, "/a") + 3 + + iex> {:ok, doc} = Torque.parse(~s({"a":[1,2,3]})) + iex> Torque.length(doc, "/missing") + nil """ + @doc group: :parse_get @spec length(reference(), binary()) :: non_neg_integer() | nil def length(doc, path) when is_reference(doc) and is_binary(path) do Torque.Native.array_length(doc, path) end - - @doc """ - Decodes a JSON binary into Elixir terms. - - JSON objects become maps with binary keys, arrays become lists, strings become - binaries, numbers become integers or floats, booleans become `true`/`false`, - and `null` becomes `nil`. - - Automatically uses a dirty CPU scheduler for inputs larger than 10 KB. - """ - @spec decode(binary()) :: {:ok, term()} | {:error, binary() | :nesting_too_deep} - def decode(json) when is_binary(json) and byte_size(json) > @timeslice_bytes do - Torque.Native.decode_dirty(json) - end - - def decode(json) when is_binary(json) do - Torque.Native.decode(json) - end - - @doc """ - Decodes a JSON binary into Elixir terms, raising on error. - """ - @spec decode!(binary()) :: term() - def decode!(json) when is_binary(json) do - case decode(json) do - {:ok, term} -> term - {:error, reason} -> raise ArgumentError, "decode error: #{reason}" - end - end - - # --- Encoding --- - - @doc """ - Encodes an Elixir term into a JSON binary. 
- - Supported terms: - - * Maps with atom or binary keys - * Lists (JSON arrays) - * Binaries (JSON strings) - * Integers and floats - * `true`, `false`, `nil` (JSON `null`) - * Other atoms (encoded as JSON strings) - * `{keyword_list}` tuples (jiffy-style proplist objects) - """ - @spec encode(term()) :: {:ok, binary()} | {:error, binary() | :nesting_too_deep} - def encode(term) do - Torque.Native.encode(term) - end - - @doc """ - Encodes an Elixir term into a JSON binary, raising on error. - """ - @spec encode!(term()) :: binary() - def encode!(term) do - case encode(term) do - {:ok, json} -> json - {:error, reason} -> raise ArgumentError, "encode error: #{reason}" - end - end - - @doc """ - Encodes an Elixir term into a JSON binary (iodata-compatible). - - Returns the binary directly without `{:ok, ...}` tuple wrapping. - Raises on error. This is the fastest encoding path when the result - is passed directly to I/O (e.g. as an HTTP response body). - """ - @spec encode_to_iodata(term()) :: binary() - def encode_to_iodata(term) do - Torque.Native.encode_iodata(term) - catch - :error, value -> raise ArgumentError, "encode error: #{inspect(value)}" - end end diff --git a/mix.exs b/mix.exs index f797fc0..c2205fb 100644 --- a/mix.exs +++ b/mix.exs @@ -13,6 +13,7 @@ defmodule Torque.MixProject do deps: deps(), package: package(), description: "High-performance JSON library for Elixir via Rustler NIFs (sonic-rs)", + docs: docs(), source_url: @source_url, homepage_url: @source_url ] @@ -22,6 +23,19 @@ defmodule Torque.MixProject do [extra_applications: [:logger]] end + defp docs do + [ + main: "Torque", + source_ref: "v#{@version}", + extras: ["README.md": [title: "Overview"], LICENSE: [title: "License"]], + groups_for_docs: [ + Decoding: &(&1[:group] == :decode), + Encoding: &(&1[:group] == :encode), + "Parse + Get": &(&1[:group] == :parse_get) + ] + ] + end + defp deps do [ {:rustler_precompiled, "~> 0.8"}, diff --git a/test/doc_test.exs b/test/doc_test.exs new file 
mode 100644 index 0000000..a1fcfbc --- /dev/null +++ b/test/doc_test.exs @@ -0,0 +1,5 @@ +defmodule Torque.DocTest do + use ExUnit.Case, async: true + + doctest Torque +end