From e4e690dc08b27783b4ff44c8cdfa217af3741568 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 21 Apr 2026 19:45:13 +0200 Subject: [PATCH 01/25] Bump iceberg-rust to 418213731e91, version 0.7.17 Update iceberg and iceberg-catalog-rest to rev 418213731e91544f5eb31a3efa459e88f599030e, which includes the fix for incremental scans with from=None silently dropping EXISTING manifest entries from expired snapshots. Also pulls in arrow/parquet v58.1.0 via the updated lock file. Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/Cargo.lock | 380 ++++++++++++++++++++++++++++++------ iceberg_rust_ffi/Cargo.toml | 6 +- 2 files changed, 321 insertions(+), 65 deletions(-) diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock index 8f6d067..c36ad78 100644 --- a/iceberg_rust_ffi/Cargo.lock +++ b/iceberg_rust_ffi/Cargo.lock @@ -8,6 +8,41 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "aes-gcm" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1" +dependencies = [ + "aead", + "aes", + "cipher", + "ctr", + "ghash", + "subtle", +] + [[package]] name = "ahash" version = "0.8.12" @@ -94,14 +129,14 @@ checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc" [[package]] name = "arrow-arith" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "chrono", "num-traits", ] @@ -113,9 +148,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" dependencies = [ "ahash", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" +dependencies = [ + "ahash", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "chrono", "half", "hashbrown 0.16.1", @@ -136,18 +189,51 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-buffer" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-cast" version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "atoi", "base64", "chrono", @@ -163,8 +249,21 @@ version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-data" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" +dependencies = [ + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", "half", "num-integer", "num-traits", @@ -176,11 +275,25 @@ version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "flatbuffers", +] + +[[package]] +name = "arrow-ipc" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "flatbuffers", ] @@ -190,11 +303,24 @@ version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", +] + +[[package]] +name = "arrow-ord" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", ] [[package]] @@ -203,6 +329,12 @@ version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +[[package]] +name = "arrow-schema" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" + [[package]] name = "arrow-select" version = "57.3.0" @@ -210,24 +342,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "num-traits", +] + +[[package]] +name = "arrow-select" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" +dependencies = [ + "ahash", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "num-traits", ] [[package]] name = "arrow-string" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "memchr", "num-traits", "regex", @@ -487,6 +633,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "cmake" version = "0.1.57" @@ -645,9 +801,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" dependencies = [ "generic-array", + "rand_core 0.6.4", "typenum", ] +[[package]] +name = "ctr" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" +dependencies = [ + "cipher", +] + [[package]] name = "darling" version = "0.20.11" @@ -1163,6 +1329,16 @@ dependencies = [ "wasip3", ] +[[package]] +name = "ghash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1" +dependencies = [ + "opaque-debug", + "polyval", +] + [[package]] name = "gloo-timers" version = "0.3.0" @@ -1498,18 +1674,19 @@ dependencies = [ [[package]] name = "iceberg" version = "0.9.0" -source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=ff28348dff9e72ca7188a330508c65b637a57c53#ff28348dff9e72ca7188a330508c65b637a57c53" +source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=418213731e91544f5eb31a3efa459e88f599030e#418213731e91544f5eb31a3efa459e88f599030e" dependencies = [ + "aes-gcm", "anyhow", "apache-avro", "array-init", "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "arrow-string", "as-any", "async-trait", @@ -1530,7 +1707,7 @@ dependencies = [ "once_cell", "opendal", "ordered-float 4.6.0", - "parquet", + "parquet 58.1.0", "rand 0.8.5", "reqsign", "reqwest", @@ -1547,13 +1724,14 @@ dependencies = [ "typetag", "url", "uuid", + "zeroize", "zstd", ] [[package]] name = "iceberg-catalog-rest" version = "0.9.0" -source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=ff28348dff9e72ca7188a330508c65b637a57c53#ff28348dff9e72ca7188a330508c65b637a57c53" +source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=418213731e91544f5eb31a3efa459e88f599030e#418213731e91544f5eb31a3efa459e88f599030e" dependencies = [ "async-trait", "chrono", @@ -1565,20 +1743,19 @@ dependencies = [ "serde_derive", "serde_json", "tokio", - "tracing", "typed-builder", "uuid", ] [[package]] name = "iceberg_rust_ffi" -version = "0.7.16" +version = "0.7.17" dependencies = [ "anyhow", - "arrow-array", - "arrow-buffer", - "arrow-ipc", - "arrow-schema", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-ipc 57.3.0", + "arrow-schema 57.3.0", "async-trait", "futures", "home", @@ -1587,7 +1764,7 @@ dependencies = [ "libc", "object_store_ffi", "once_cell", - "parquet", + "parquet 57.3.0", "serde_json", "tempfile", "tokio", @@ -1732,6 +1909,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array", +] + [[package]] name = "instant" version = "0.1.13" @@ -2008,6 +2194,15 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" +dependencies = [ + "twox-hash", +] + [[package]] name = "matchers" version = "0.2.0" @@ -2275,6 +2470,12 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + [[package]] name = "opendal" version = "0.55.0" @@ -2422,13 +2623,46 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" dependencies = [ "ahash", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-ipc 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.12.0", + "num-bigint", + "num-integer", + "num-traits", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" +dependencies = [ + "ahash", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-ipc 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "base64", "brotli", "bytes", @@ -2437,7 +2671,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex", + "lz4_flex 0.13.0", "num-bigint", "num-integer", "num-traits", @@ -2501,6 +2735,18 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polyval" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" +dependencies = [ + "cfg-if", + "cpufeatures", + "opaque-debug", + "universal-hash", +] + [[package]] name = "portable-atomic" version = "1.13.1" @@ -3807,6 +4053,16 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + [[package]] name = "untrusted" version = "0.9.0" diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml index c238db6..3d3dfb6 100644 --- a/iceberg_rust_ffi/Cargo.toml +++ b/iceberg_rust_ffi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iceberg_rust_ffi" -version = "0.7.16" +version = "0.7.17" edition = "2021" [lib] @@ -12,8 +12,8 @@ default = ["julia"] julia = [] [dependencies] -iceberg = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "ff28348dff9e72ca7188a330508c65b637a57c53", features = ["storage-azdls"] } -iceberg-catalog-rest = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "ff28348dff9e72ca7188a330508c65b637a57c53" } +iceberg = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "418213731e91544f5eb31a3efa459e88f599030e", features = ["storage-azdls"] } +iceberg-catalog-rest = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "418213731e91544f5eb31a3efa459e88f599030e" } object_store_ffi = { git = "https://github.com/RelationalAI/object_store_ffi", rev = "79b08071c7a1642532b5891253280861eca9e44e", default-features = false } tokio = { version = "1.0", features = ["full"] } futures = "0.3" From 8c730b29cb3c375d26b587a3a37b5bbeaef64377 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 21 Apr 2026 19:47:16 +0200 Subject: [PATCH 02/25] Bump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 5620c51..c0ec1e9 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "RustyIceberg" uuid = "390bdf5b-b624-43dc-a846-0ef7a3405804" -version = "0.6.9" +version = "0.7.0" [deps] Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" From 845fd0c05bde6bfbc93f0826e743d137ab3f1cc3 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 21 Apr 2026 20:06:37 +0200 Subject: [PATCH 03/25] Bump arrow/parquet deps to 58.1 to match iceberg-rust iceberg-rust rev 418213731e91 depends on arrow/parquet v58.1.0; keeping the FFI crate on 57.x caused two incompatible versions of RecordBatch to be linked, failing to compile. Align all arrow-* and parquet pins to "58.1". Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/Cargo.lock | 260 ++++++++---------------------------- iceberg_rust_ffi/Cargo.toml | 10 +- 2 files changed, 58 insertions(+), 212 deletions(-) diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock index c36ad78..8070ddd 100644 --- a/iceberg_rust_ffi/Cargo.lock +++ b/iceberg_rust_ffi/Cargo.lock @@ -133,32 +133,14 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" dependencies = [ - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num-traits", ] -[[package]] -name = "arrow-array" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" -dependencies = [ - "ahash", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "chrono", - "half", - "hashbrown 0.16.1", - "num-complex", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-array" version = "58.1.0" @@ -166,9 +148,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" dependencies = [ "ahash", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "half", "hashbrown 0.16.1", @@ -177,18 +159,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-buffer" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - [[package]] name = "arrow-buffer" version = "58.1.0" @@ -201,39 +171,18 @@ dependencies = [ "num-traits", ] -[[package]] -name = "arrow-cast" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" -dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-ord 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "atoi", - "base64", - "chrono", - "half", - "lexical-core", - "num-traits", - "ryu", -] - [[package]] name = "arrow-cast" version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" dependencies = [ - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-ord 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", "atoi", "base64", "chrono", @@ -243,112 +192,52 @@ dependencies = [ "ryu", ] -[[package]] -name = "arrow-data" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" -dependencies = [ - "arrow-buffer 57.3.0", - "arrow-schema 57.3.0", - "half", - "num-integer", - "num-traits", -] - [[package]] name = "arrow-data" version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" dependencies = [ - "arrow-buffer 58.1.0", - "arrow-schema 58.1.0", + "arrow-buffer", + "arrow-schema", "half", "num-integer", "num-traits", ] -[[package]] -name = "arrow-ipc" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" -dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "flatbuffers", -] - [[package]] name = "arrow-ipc" version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" dependencies = [ - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "flatbuffers", ] -[[package]] -name = "arrow-ord" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" -dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", -] - [[package]] name = "arrow-ord" version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" dependencies = [ - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] -[[package]] -name = "arrow-schema" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" - [[package]] name = "arrow-schema" version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" -[[package]] -name = "arrow-select" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" -dependencies = [ - "ahash", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "num-traits", -] - [[package]] name = "arrow-select" version = "58.1.0" @@ -356,10 +245,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" dependencies = [ "ahash", - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num-traits", ] @@ -369,11 +258,11 @@ version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" dependencies = [ - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num-traits", "regex", @@ -1681,12 +1570,12 @@ dependencies = [ "apache-avro", "array-init", "arrow-arith", - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-cast 58.1.0", - "arrow-ord 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", "arrow-string", "as-any", "async-trait", @@ -1707,7 +1596,7 @@ dependencies = [ "once_cell", "opendal", "ordered-float 4.6.0", - "parquet 58.1.0", + "parquet", "rand 0.8.5", "reqsign", "reqwest", @@ -1752,10 +1641,10 @@ name = "iceberg_rust_ffi" version = "0.7.17" dependencies = [ "anyhow", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-ipc 57.3.0", - "arrow-schema 57.3.0", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", "async-trait", "futures", "home", @@ -1764,7 +1653,7 @@ dependencies = [ "libc", "object_store_ffi", "once_cell", - "parquet 57.3.0", + "parquet", "serde_json", "tempfile", "tokio", @@ -2185,15 +2074,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" -[[package]] -name = "lz4_flex" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" -dependencies = [ - "twox-hash", -] - [[package]] name = "lz4_flex" version = "0.13.0" @@ -2616,40 +2496,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "parquet" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" -dependencies = [ - "ahash", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-data 57.3.0", - "arrow-ipc 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "base64", - "brotli", - "bytes", - "chrono", - "flate2", - "half", - "hashbrown 0.16.1", - "lz4_flex 0.12.0", - "num-bigint", - "num-integer", - "num-traits", - "paste", - "seq-macro", - "simdutf8", - "snap", - "thrift", - "twox-hash", - "zstd", -] - [[package]] name = "parquet" version = "58.1.0" @@ -2657,12 +2503,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ "ahash", - "arrow-array 58.1.0", - "arrow-buffer 58.1.0", - "arrow-data 58.1.0", - "arrow-ipc 58.1.0", - "arrow-schema 58.1.0", - "arrow-select 58.1.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", "base64", "brotli", "bytes", @@ -2671,7 +2517,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "lz4_flex 0.13.0", + "lz4_flex", "num-bigint", "num-integer", "num-traits", diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml index 3d3dfb6..7b2cd92 100644 --- a/iceberg_rust_ffi/Cargo.toml +++ b/iceberg_rust_ffi/Cargo.toml @@ -19,11 +19,11 @@ tokio = { version = "1.0", features = ["full"] } futures = "0.3" libc = "0.2" anyhow = "1.0" -arrow-array = "57.1" -arrow-buffer = "57.1" -arrow-schema = "57.1" -arrow-ipc = "57.1" -parquet = "57.1" +arrow-array = "58.1" +arrow-buffer = "58.1" +arrow-schema = "58.1" +arrow-ipc = "58.1" +parquet = "58.1" tracing-subscriber = "0.3" tracing = "0.1" once_cell = "1.19" From 029fe570ae1cec4fb12e5d4302f803b6a7060cd0 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 16:42:46 +0200 Subject: [PATCH 04/25] Add optimizations --- iceberg_rust_ffi/src/writer.rs | 485 +++++++++++++++++++++---- iceberg_rust_ffi/src/writer_columns.rs | 368 ++++++++++++++++--- src/RustyIceberg.jl | 5 +- src/writer.jl | 370 ++++++++++++++++++- 4 files changed, 1107 insertions(+), 121 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index efc194a..bfe95b0 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -1,12 +1,18 @@ /// Writer support for iceberg_rust_ffi /// -/// This module provides FFI bindings for the iceberg-rust Writer API, -/// enabling Julia to write Arrow data to Parquet files and produce DataFiles -/// for use with the Transaction API. +/// Encoding is handled by a global pool of N=available_parallelism OS threads shared +/// across all writers. Per-writer ordering is guaranteed by the per-writer +/// `Arc>` inside WriterState: only one pool thread encodes +/// a given writer at a time, and the FIFO global queue ensures batches are submitted +/// in order. Drain tasks are lightweight async forwarders (no CPU work). use std::ffi::{c_char, c_void}; use std::io::Cursor; -use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex, OnceLock}; +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::thread; +use arrow_array::RecordBatch; use arrow_ipc::reader::StreamReader; use arrow_schema::SchemaRef as ArrowSchemaRef; use iceberg::arrow::schema_to_arrow_schema; @@ -18,8 +24,10 @@ use iceberg::writer::file_writer::location_generator::{ use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder; use iceberg::writer::file_writer::ParquetWriterBuilder; use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; -use parquet::basic::Compression; -use parquet::file::properties::WriterProperties; +use parquet::basic::{Compression, Encoding}; +use parquet::file::properties::{EnabledStatistics, WriterProperties}; +use tokio::sync::{mpsc, oneshot}; +use tokio::task::JoinHandle; /// Compression codec values (must match Julia's CompressionCodec enum) const COMPRESSION_UNCOMPRESSED: i32 = 0; @@ -28,7 +36,6 @@ const COMPRESSION_GZIP: i32 = 2; const COMPRESSION_LZ4: i32 = 3; const COMPRESSION_ZSTD: i32 = 4; -/// Convert FFI compression code to parquet Compression fn compression_from_code(code: i32) -> Compression { match code { COMPRESSION_UNCOMPRESSED => Compression::UNCOMPRESSED, @@ -36,7 +43,52 @@ fn compression_from_code(code: i32) -> Compression { COMPRESSION_GZIP => Compression::GZIP(Default::default()), COMPRESSION_LZ4 => Compression::LZ4, COMPRESSION_ZSTD => Compression::ZSTD(Default::default()), - _ => Compression::SNAPPY, // Default to SNAPPY for unknown values + _ => Compression::SNAPPY, + } +} + +/// Parquet writer properties passed from Julia (must match Julia's ParquetWriterProperties layout). +/// Fields are ordered largest-to-smallest to avoid gaps in the repr(C) layout. +/// Size fields use 0 to mean "use parquet default". +#[repr(C)] +pub struct ParquetWriterPropertiesFFI { + /// Maximum number of rows per row group (0 = parquet default: 1 048 576) + pub max_row_group_size: i64, + /// Target uncompressed data page size in bytes (0 = parquet default: 1 048 576) + pub data_page_size: i64, + /// Number of rows encoded per column chunk within a row group (0 = parquet default: 1024) + pub write_batch_size: i64, + /// Compression codec (see COMPRESSION_* constants) + pub compression_codec: i32, + /// Whether to enable dictionary encoding globally + pub dictionary_enabled: bool, + /// Force PLAIN encoding for all columns, bypassing DELTA_BINARY_PACKED default for INT64/INT32 + pub use_plain_encoding: bool, + /// Collect per-page and per-row-group min/max statistics (default true; set false to skip) + pub statistics_enabled: bool, +} + +impl ParquetWriterPropertiesFFI { + fn to_writer_properties(&self) -> WriterProperties { + let mut builder = WriterProperties::builder() + .set_compression(compression_from_code(self.compression_codec)) + .set_dictionary_enabled(self.dictionary_enabled); + if self.max_row_group_size > 0 { + builder = builder.set_max_row_group_row_count(Some(self.max_row_group_size as usize)); + } + if self.data_page_size > 0 { + builder = builder.set_data_page_size_limit(self.data_page_size as usize); + } + if self.write_batch_size > 0 { + builder = builder.set_write_batch_size(self.write_batch_size as usize); + } + if self.use_plain_encoding { + builder = builder.set_encoding(Encoding::PLAIN); + } + if !self.statistics_enabled { + builder = builder.set_statistics_enabled(EnabledStatistics::None); + } + builder.build() } } @@ -44,6 +96,7 @@ use crate::response::IcebergBoxedResponse; use crate::table::IcebergTable; use crate::transaction::IcebergDataFiles; use crate::util::parse_c_string; +use crate::writer_columns::{build_arrow_array_scattered, GatheredColumnDescriptor}; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, }; @@ -52,12 +105,169 @@ use object_store_ffi::{ type ConcreteDataFileWriter = DataFileWriter; -/// Opaque writer handle for FFI -/// Holds the DataFileWriter which can write RecordBatches and produce DataFiles +/// Batch item sent through the per-writer drain channel. +/// `ack` is `Some` only for the zero-copy path (`write_columns`), where Arrow arrays +/// point directly into Julia memory — the pool worker sends on it after encoding so +/// Julia knows those buffers are safe to reuse. +pub(crate) type BatchItem = (RecordBatch, Option>); + +/// Encode task submitted to the global worker pool. +struct EncodeTask { + batch: RecordBatch, + ack_opt: Option>, + state: Arc, +} + +// Safety: RecordBatch is Send; WriterState fields are Send. +unsafe impl Send for EncodeTask {} + +/// Shared mutable state for one IcebergDataFileWriter. +/// Owned by the IcebergDataFileWriter and shared with pool workers via Arc. +pub(crate) struct WriterState { + /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it + /// concurrently (though at most one worker encodes a given writer at a time due to the + /// bounded per-writer channel). Set to None when the writer is closed or freed. + writer: Mutex>, + /// Number of encode tasks submitted to the pool but not yet completed. + pending: AtomicUsize, + /// Notified when `pending` drops to zero, so iceberg_writer_close can wait efficiently. + done_notify: tokio::sync::Notify, + /// First encode error encountered by a pool worker, if any. + error: Mutex>, +} + +// Safety: ConcreteDataFileWriter is Send (verified by its use in spawn_blocking previously). +unsafe impl Send for WriterState {} +unsafe impl Sync for WriterState {} + +/// Global pool of N=available_parallelism encode worker threads shared across all writers. +struct GlobalWorkerPool { + task_tx: tokio::sync::mpsc::Sender, +} + +static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); + +/// Desired encode worker count. 0 means "use available_parallelism". +/// Must be set before the first iceberg_writer_new call. +static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); + +/// Set the number of encode worker threads in the global pool. +/// Must be called before creating any writers; has no effect afterwards. +#[no_mangle] +pub extern "C" fn iceberg_set_encode_workers(n: i32) { + if n > 0 { + ENCODE_WORKERS.store(n as usize, Ordering::Relaxed); + } +} + +/// Initialize the global encode pool on first call. +/// Must be called from within a Tokio runtime (iceberg_writer_new satisfies this). +fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { + GLOBAL_ENCODE_POOL.get_or_init(|| { + let configured = ENCODE_WORKERS.load(Ordering::Relaxed); + let n = if configured > 0 { + configured + } else { + // available_parallelism() only fails on unusual platforms (embedded, some sandboxes). + // On Linux/macOS/Windows it always succeeds, so the unwrap never fires in practice. + thread::available_parallelism().unwrap().get() + }; + let handle = tokio::runtime::Handle::current(); + // Buffer 2× workers — drain tasks are rarely blocked on submit. + let (task_tx, task_rx) = tokio::sync::mpsc::channel::(n * 2); + let task_rx = Arc::new(tokio::sync::Mutex::new(task_rx)); + + for i in 0..n { + let task_rx = task_rx.clone(); + let handle = handle.clone(); + thread::Builder::new() + .name(format!("iceberg-encode-{}", i)) + .spawn(move || { + loop { + // Acquire the shared receiver lock, then wait for a task. + // The lock is released as soon as recv() returns, so workers + // are not serialized during encoding — only during task pickup. + let task = { + let mut rx = handle.block_on(task_rx.lock()); + match handle.block_on(rx.recv()) { + Some(t) => t, + None => break, // sender dropped → pool shutting down + } + }; + + // Clone state before moving task into the closure so we can + // always decrement pending even if the closure panics. + let state = task.state.clone(); + + let handle_enc = handle.clone(); + let encode_result = catch_unwind(AssertUnwindSafe(move || { + let t0 = std::time::Instant::now(); + // Use unwrap_or_else to recover from a poisoned mutex + // (which happens when a previous encode panicked mid-write). + let result: anyhow::Result<()> = { + let mut guard = task.state.writer + .lock() + .unwrap_or_else(|e| e.into_inner()); + match guard.as_mut() { + Some(w) => handle_enc + .block_on(w.write(task.batch)) + .map_err(|e| anyhow::anyhow!("write batch: {}", e)), + None => Err(anyhow::anyhow!("writer already closed")), + } + }; + // ACK inside the closure: if we panic before reaching this, + // the Sender is dropped and ack_rx returns RecvError, which + // write_columns propagates as an error to Julia — correct behavior. + if let Some(ack) = task.ack_opt { + let _ = ack.send(()); + } + result + })); + + let err = match encode_result { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")), + }; + if let Some(e) = err { + let mut slot = state.error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } + } + + // Always decrement pending; notify close() if this was the last task. + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + state.done_notify.notify_one(); + } + } + }) + .expect("failed to spawn iceberg encode worker"); + } + + GlobalWorkerPool { task_tx } + }) +} + +/// Opaque writer handle for FFI. +/// +/// Writing is pipelined: Julia gathers a RecordBatch, sends it to the bounded per-writer +/// channel (capacity 1), and returns immediately. A lightweight async drain task forwards +/// batches to the global encode pool. Pool workers (N = available_parallelism) encode +/// Parquet concurrently across all active writers. pub struct IcebergDataFileWriter { - pub(crate) writer: Option, - /// The Arrow schema for this table, used by write_columns to create RecordBatches + /// Arrow schema for this table, used by write_columns to create RecordBatches. pub(crate) arrow_schema: ArrowSchemaRef, + /// Sender side of the per-writer batch queue (bounded, capacity 1). + /// Dropped in iceberg_writer_close to signal EOF to the drain task. + pub(crate) batch_tx: Option>, + /// Lightweight async drain task: forwards batches from batch_tx to the global pool. + pub(crate) drain_handle: Option>, + /// Shared state: owns the ConcreteDataFileWriter, tracks pending count and errors. + pub(crate) writer_state: Arc, } unsafe impl Send for IcebergDataFileWriter {} @@ -69,25 +279,112 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse; -/// Free a writer +/// Synchronous scatter-gather write: gathers all column data from Julia memory into +/// Arrow arrays in the calling thread, submits the RecordBatch to the global encode +/// pool, then returns immediately (encode is still async). +/// +/// Unlike `iceberg_writer_write_scattered_columns` (which requires a Tokio async context +/// and uses a callback), this function is designed for plain C `ccall` from Julia: +/// - Julia keeps source arrays alive via `GC.@preserve` for the duration of this call. +/// - After this function returns, all Julia pointers have been consumed; Julia may safely +/// release the source data. +/// - Encode is still asynchronous in the global pool; call `iceberg_writer_close` to +/// wait for all pending encodes. +/// +/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). +#[no_mangle] +pub extern "C" fn iceberg_writer_write_scattered_columns_sync( + writer: *mut IcebergDataFileWriter, + columns: *const GatheredColumnDescriptor, + num_columns: usize, +) -> i32 { + if writer.is_null() || columns.is_null() || num_columns == 0 { + return -1; + } + + let writer_ref = unsafe { &*writer }; + + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + return -1; + } + }; + + let arrow_schema = writer_ref.arrow_schema.clone(); + let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; + + if col_descs.len() != arrow_schema.fields().len() { + let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(anyhow::anyhow!( + "Column count mismatch: got {} but schema has {}", + col_descs.len(), + arrow_schema.fields().len() + )); + } + return -1; + } + + // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread). + // After this block, all Julia pointers have been consumed — safe to release. + let t0 = std::time::Instant::now(); + let batch = match (|| -> Result { + let mut arrays = Vec::with_capacity(num_columns); + for (i, desc) in col_descs.iter().enumerate() { + arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? }); + } + RecordBatch::try_new(arrow_schema, arrays) + .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) + })() { + Ok(b) => b, + Err(e) => { + let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(e); } + return -1; + } + }; + + // Increment pending before submit so iceberg_writer_close always accounts for this batch. + writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { batch, ack_opt: None, state: writer_ref.writer_state.clone() }; + + match pool.task_tx.blocking_send(task) { + Ok(()) => 0, + Err(_) => { + // Pool channel closed — undo the pending increment. + let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + writer_ref.writer_state.done_notify.notify_one(); + } + eprintln!("[iceberg:sync] pool channel closed"); + -1 + } + } +} + +/// Free a writer. Aborts the drain task and poisons the writer state so any in-flight +/// pool tasks fail gracefully (error/cancel path). #[no_mangle] pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { if !writer.is_null() { unsafe { - let _ = Box::from_raw(writer); + let boxed = Box::from_raw(writer); + if let Some(handle) = boxed.drain_handle { + handle.abort(); + } + // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error + // rather than writing to a partially-freed writer. + let _ = boxed.writer_state.writer.lock().unwrap().take(); } } } -// Create a new DataFileWriter from a table with configuration options +// Create a new DataFileWriter from a table with configuration options. // -// This creates the full writer chain: -// ParquetWriterBuilder -> RollingFileWriterBuilder -> DataFileWriterBuilder -> build() -// -// Configuration options: -// - `prefix`: File name prefix (e.g., "data" produces files like "data-xxx.parquet") -// - `target_file_size_bytes`: Target size for rolling to a new file (0 = use default 512 MB) -// - `compression_codec`: Compression codec (0=UNCOMPRESSED, 1=SNAPPY, 2=GZIP, 3=LZ4, 4=ZSTD) +// Spawns a lightweight async drain task per writer. The global encode pool +// (N = available_parallelism threads) is initialized on the first call. export_runtime_op!( iceberg_writer_new, IcebergDataFileWriterResponse, @@ -95,14 +392,18 @@ export_runtime_op!( if table.is_null() { return Err(anyhow::anyhow!("Null table pointer provided")); } + if parquet_props.is_null() { + return Err(anyhow::anyhow!("Null parquet_props pointer provided")); + } let prefix_str = parse_c_string(prefix, "prefix")?; let table_ref = unsafe { &*table }; - Ok((table_ref, prefix_str, target_file_size_bytes, compression_codec)) + let props = unsafe { &*parquet_props }; + Ok((table_ref, prefix_str, target_file_size_bytes, props.to_writer_properties())) }, result_tuple, async { - let (table_ref, prefix_str, target_file_size_bytes, compression_codec) = result_tuple; + let (table_ref, prefix_str, target_file_size_bytes, writer_props) = result_tuple; let table = &table_ref.table; // Create LocationGenerator from table metadata @@ -116,12 +417,6 @@ export_runtime_op!( DataFileFormat::Parquet, ); - // Create WriterProperties with compression - let compression = compression_from_code(compression_codec); - let writer_props = WriterProperties::builder() - .set_compression(compression) - .build(); - // Create ParquetWriterBuilder with table schema and configured properties let parquet_writer_builder = ParquetWriterBuilder::new( writer_props, @@ -146,11 +441,8 @@ export_runtime_op!( ) }; - // Create DataFileWriterBuilder - let data_file_writer_builder = DataFileWriterBuilder::new(rolling_file_writer_builder); - - // Build the writer (no partition key for basic writes) - let writer = data_file_writer_builder + // Build the concrete DataFileWriter + let concrete_writer = DataFileWriterBuilder::new(rolling_file_writer_builder) .build(None) .await .map_err(|e| anyhow::anyhow!("Failed to build data file writer: {}", e))?; @@ -161,21 +453,54 @@ export_runtime_op!( .map_err(|e| anyhow::anyhow!("Failed to convert schema to Arrow: {}", e))? ); + // Initialize global pool (no-op if already running). + let pool = get_or_init_encode_pool(); + + let writer_state = Arc::new(WriterState { + writer: Mutex::new(Some(concrete_writer)), + pending: AtomicUsize::new(0), + done_notify: tokio::sync::Notify::new(), + error: Mutex::new(None), + }); + + // Bounded per-writer channel (capacity 1): Julia can be at most one batch ahead + // of the pool. This provides backpressure: send blocks if the drain task hasn't + // forwarded the previous batch yet. + let (batch_tx, mut batch_rx) = mpsc::channel::(1); + + // Lightweight drain task: forwards batches to the global pool asynchronously. + let drain_handle: JoinHandle<()> = { + let state = writer_state.clone(); + let pool_tx = pool.task_tx.clone(); + tokio::task::spawn(async move { + while let Some((batch, ack_opt)) = batch_rx.recv().await { + state.pending.fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { batch, ack_opt, state: state.clone() }; + if pool_tx.send(task).await.is_err() { + eprintln!("[iceberg:drain] pool channel closed"); + break; + } + } + }) + }; + Ok::(IcebergDataFileWriter { - writer: Some(writer), arrow_schema, + batch_tx: Some(batch_tx), + drain_handle: Some(drain_handle), + writer_state, }) }, table: *mut IcebergTable, prefix: *const c_char, target_file_size_bytes: i64, - compression_codec: i32 + parquet_props: *const ParquetWriterPropertiesFFI ); -// Write Arrow IPC data to the writer +// Write Arrow IPC data to the writer. // -// The data must be in Arrow IPC stream format (as produced by Arrow.tobuffer in Julia). -// This deserializes the IPC data to RecordBatch and writes it to the Parquet file. +// Deserializes the IPC stream, then sends each RecordBatch to the drain task's channel. +// Returns immediately after queuing — encoding happens asynchronously in the global pool. export_runtime_op!( iceberg_writer_write, crate::IcebergResponse, @@ -202,25 +527,22 @@ export_runtime_op!( async { let (writer_ref, ipc_bytes) = result_tuple; - // Deserialize Arrow IPC to RecordBatch + let tx = writer_ref + .batch_tx + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?; + + // Deserialize Arrow IPC to RecordBatches and enqueue each one let cursor = Cursor::new(ipc_bytes); let mut reader = StreamReader::try_new(cursor, None) .map_err(|e| anyhow::anyhow!("Failed to create Arrow IPC reader: {}", e))?; - // Get the writer - let writer = writer_ref - .writer - .as_mut() - .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?; - - // Read and write all batches from the IPC stream while let Some(batch_result) = reader.next() { let batch = batch_result .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?; - writer - .write(batch) - .await - .map_err(|e| anyhow::anyhow!("Failed to write batch: {}", e))?; + // IPC data already copied into ipc_bytes above — no ACK needed. + tx.send((batch, None)).await + .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?; } Ok::<(), anyhow::Error>(()) @@ -230,12 +552,11 @@ export_runtime_op!( arrow_ipc_len: usize ); -// Close the writer and return the produced DataFiles -// -// This flushes any remaining data, closes the Parquet file(s), and returns -// the DataFiles metadata that can be used with fast_append! in a Transaction. +// Close the writer and return the produced DataFiles. // -// After calling this, the writer cannot be used again. +// Drops the batch sender (signals EOF to the drain task), waits for the drain task to +// finish forwarding all batches, waits for all pool encodes to complete, then finalizes +// the Parquet file and returns the DataFiles metadata. export_runtime_op!( iceberg_writer_close, IcebergWriterCloseResponse, @@ -248,14 +569,54 @@ export_runtime_op!( }, writer_ref, async { - // Take the writer out - let mut writer = writer_ref + // Drop the sender — signals the drain task that no more batches are coming + drop(writer_ref.batch_tx.take()); + + // Wait for the drain task to finish forwarding all batches to the pool + if let Some(handle) = writer_ref.drain_handle.take() { + handle.await + .map_err(|e| anyhow::anyhow!("Drain task panicked: {}", e))?; + } + + // Wait for all pending pool encodes to complete. + // Uses a timeout to guard against a dead worker thread (e.g. panic outside + // catch_unwind) that would otherwise leave pending > 0 forever. + // Tokio's Notify preserves the notification if notify_one() fired before + // notified() is polled, so the check-then-wait sequence is race-free. + let state = &writer_ref.writer_state; + let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(120); + loop { + if state.pending.load(Ordering::Acquire) == 0 { + break; + } + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + return Err(anyhow::anyhow!( + "Timed out waiting for {} encode task(s) to complete; \ + an encode worker may have crashed", + state.pending.load(Ordering::Acquire) + )); + } + tokio::select! { + _ = state.done_notify.notified() => {} + _ = tokio::time::sleep(remaining) => {} + } + } + + // Propagate any encode error + if let Some(e) = state.error.lock().unwrap().take() { + return Err(e); + } + + // Take the concrete writer and finalize the Parquet file + let mut concrete = state .writer + .lock() + .unwrap() .take() - .ok_or_else(|| anyhow::anyhow!("Writer has already been closed"))?; + .ok_or_else(|| anyhow::anyhow!("Writer already closed"))?; - // Close the writer to get the DataFiles - let data_files = writer + let data_files = concrete .close() .await .map_err(|e| anyhow::anyhow!("Failed to close writer: {}", e))?; diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index f892d0c..0f61849 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -4,6 +4,7 @@ /// avoiding the overhead of Arrow IPC serialization. Julia passes raw column pointers /// and metadata, and Rust builds Arrow arrays directly from them. use std::ffi::c_void; +use std::ptr::NonNull; use std::sync::Arc; use arrow_array::{ @@ -13,11 +14,26 @@ use arrow_array::{ }, ArrayRef, BooleanArray, PrimitiveArray, RecordBatch, StringArray, }; -use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer}; + +/// No-op allocator: Arrow buffers created from Julia memory must not be freed by Rust. +/// Julia owns the memory until the per-batch oneshot ACK is received (write_columns path). +struct NoopAllocation; + +/// Build a zero-copy `ScalarBuffer` pointing directly into Julia-owned memory. +/// Safety: `ptr` must be non-null, valid, and alive until the buffer is dropped. +unsafe fn zero_copy_scalar_buffer( + ptr: *const T, + num_rows: usize, +) -> ScalarBuffer { + let byte_len = num_rows * std::mem::size_of::(); + let nn_ptr = NonNull::new_unchecked(ptr as *mut u8); + let buf = Buffer::from_custom_allocation(nn_ptr, byte_len, Arc::new(NoopAllocation)); + ScalarBuffer::new(buf, 0, num_rows) +} use arrow_schema::DataType; use crate::writer::IcebergDataFileWriter; -use iceberg::writer::IcebergWriter; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, }; @@ -72,52 +88,42 @@ unsafe fn build_arrow_array( schema_field: &arrow_schema::Field, ) -> Result { let null_buffer = if desc.is_nullable && !desc.validity_ptr.is_null() { - // Julia's BitVector stores bits packed in UInt64 chunks, which we can use directly - // since Arrow also uses little-endian bit-packed format. - // We just need to copy the right number of bytes. + // Julia's BitVector uses the same little-endian bit-packed layout as Arrow. + // Point directly into Julia's buffer — no copy needed. let num_bytes = (desc.num_rows + 7) / 8; - let validity_slice = std::slice::from_raw_parts(desc.validity_ptr, num_bytes); - Some(NullBuffer::new(BooleanBuffer::new( - Buffer::from(validity_slice.to_vec()), - 0, - desc.num_rows, - ))) + let nn_ptr = NonNull::new_unchecked(desc.validity_ptr as *mut u8); + let buf = Buffer::from_custom_allocation(nn_ptr, num_bytes, Arc::new(NoopAllocation)); + Some(NullBuffer::new(BooleanBuffer::new(buf, 0, desc.num_rows))) } else { None }; let array: ArrayRef = match desc.column_type { COLUMN_TYPE_INT32 => { - let data = std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows); Arc::new(PrimitiveArray::::new(buffer, null_buffer)) } COLUMN_TYPE_INT64 => { - let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); Arc::new(PrimitiveArray::::new(buffer, null_buffer)) } COLUMN_TYPE_FLOAT32 => { - let data = std::slice::from_raw_parts(desc.data_ptr as *const f32, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f32, desc.num_rows); Arc::new(PrimitiveArray::::new(buffer, null_buffer)) } COLUMN_TYPE_FLOAT64 => { - let data = std::slice::from_raw_parts(desc.data_ptr as *const f64, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f64, desc.num_rows); Arc::new(PrimitiveArray::::new(buffer, null_buffer)) } COLUMN_TYPE_DATE => { // Date is stored as Int32 (days since epoch) - let data = std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows); Arc::new(PrimitiveArray::::new(buffer, null_buffer)) } COLUMN_TYPE_TIMESTAMP => { // Timestamp without timezone (Iceberg `timestamp`) // Stored as Int64 microseconds since epoch - let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); Arc::new(PrimitiveArray::::new( buffer, null_buffer, @@ -126,8 +132,7 @@ unsafe fn build_arrow_array( COLUMN_TYPE_TIMESTAMPTZ => { // Timestamp with UTC timezone (Iceberg `timestamptz`) // Stored as Int64 microseconds since epoch, with timezone metadata - let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows); - let buffer = ScalarBuffer::from(data.to_vec()); + let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); Arc::new( PrimitiveArray::::new(buffer, null_buffer) .with_timezone("UTC"), @@ -204,28 +209,26 @@ unsafe fn build_arrow_array( } }; - // Convert the backing integer representation to i128 values - let i128_values: Vec = match desc.column_type { + // All decimal variants produce an Arrow Decimal128 (i128 backing). + // INT32/INT64 variants need widening — materialize to avoid a cast loop + // in the zero-copy path; INT128 shares the same layout and is zero-copy. + let buffer: ScalarBuffer = match desc.column_type { COLUMN_TYPE_DECIMAL_INT32 => { let data = std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows); - data.iter().map(|&v| v as i128).collect() + ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::>()) } COLUMN_TYPE_DECIMAL_INT64 => { let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows); - data.iter().map(|&v| v as i128).collect() + ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::>()) } _ => { // COLUMN_TYPE_DECIMAL_INT128: Julia Int128 and Rust i128 share the same - // 16-byte little-endian layout on all supported platforms. - let data = - std::slice::from_raw_parts(desc.data_ptr as *const i128, desc.num_rows); - data.to_vec() + // 16-byte little-endian layout on all supported platforms — zero-copy. + zero_copy_scalar_buffer(desc.data_ptr as *const i128, desc.num_rows) } }; - - let buffer = ScalarBuffer::::from(i128_values); Arc::new( PrimitiveArray::::new(buffer, null_buffer) .with_precision_and_scale(precision, scale) @@ -240,6 +243,272 @@ unsafe fn build_arrow_array( Ok(array) } +// ============================================================================= +// Scattered-gather writer: pass raw source pointers + selection indices to Rust, +// which gathers the data directly into Arrow arrays — eliminating the Julia-side +// staging copy for non-converting numeric column types. +// ============================================================================= + +/// A reference to one slice of source column data. +/// `sel_ptr = null` → sequential (identity) access: read data[0..len]. +/// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices). +/// `validity_ptr = null` → all rows valid (non-nullable or known all-valid slice). +/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[]. +/// Fields are all 8 bytes — no padding, total 40 bytes. +#[repr(C)] +#[derive(Clone, Copy)] +pub struct SliceRef { + pub data_ptr: *const c_void, + pub lengths_ptr: *const i64, + pub validity_ptr: *const u8, + pub sel_ptr: *const i64, + pub len: usize, +} + +unsafe impl Send for SliceRef {} +unsafe impl Sync for SliceRef {} + +/// Gathered column descriptor: gather `num_slices` SliceRefs into one Arrow column. +/// `total_rows` must equal the sum of all `slice.len` values. +/// Fields ordered largest-to-smallest; 3 bytes trailing padding → 32 bytes total. +#[repr(C)] +#[derive(Clone, Copy)] +pub struct GatheredColumnDescriptor { + pub slices: *const SliceRef, + pub num_slices: usize, + pub total_rows: usize, + pub column_type: i32, + pub is_nullable: bool, +} + +unsafe impl Send for GatheredColumnDescriptor {} +unsafe impl Sync for GatheredColumnDescriptor {} + +/// Build the Arrow null buffer by scanning validity bits across all slices. +/// Returns `None` if every slice has a null `validity_ptr` (all rows valid). +unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) -> Option { + if !slices.iter().any(|s| !s.validity_ptr.is_null()) { + return None; + } + let mut bits = vec![0u8; (total_rows + 7) / 8]; + let mut out = 0usize; + for slice in slices { + if slice.validity_ptr.is_null() { + // All valid — set bits 1 + for i in 0..slice.len { + bits[(out + i) / 8] |= 1u8 << ((out + i) % 8); + } + } else if slice.sel_ptr.is_null() { + // Sequential: copy bits with possible alignment shift + for i in 0..slice.len { + let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1; + bits[(out + i) / 8] |= b << ((out + i) % 8); + } + } else { + // Scattered: gather validity bits via selection indices (1-based) + let sel = std::slice::from_raw_parts(slice.sel_ptr, slice.len); + for (i, &idx) in sel.iter().enumerate() { + let src = (idx - 1) as usize; + let b = (*slice.validity_ptr.add(src / 8) >> (src % 8)) & 1; + bits[(out + i) / 8] |= b << ((out + i) % 8); + } + } + out += slice.len; + } + Some(NullBuffer::new(BooleanBuffer::new(Buffer::from(bits), 0, total_rows))) +} + +/// Gather all slices for a column into an Arrow array. +pub(crate) unsafe fn build_arrow_array_scattered( + desc: &GatheredColumnDescriptor, + schema_field: &arrow_schema::Field, +) -> Result { + let slices = std::slice::from_raw_parts(desc.slices, desc.num_slices); + let total = desc.total_rows; + let null_buf = if desc.is_nullable { + build_null_buffer_scattered(slices, total) + } else { + None + }; + + // Macro gathers a primitive numeric type from all slices. + // sel_ptr=null → sequential copy; sel_ptr!=null → indirect gather (1-based indices). + macro_rules! gather_primitive { + ($T:ty, $ArrowType:ty) => {{ + let mut values = Vec::<$T>::with_capacity(total); + for slice in slices { + let src = slice.data_ptr as *const $T; + if slice.sel_ptr.is_null() { + values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize)); + } + } + } + Arc::new(PrimitiveArray::<$ArrowType>::new(ScalarBuffer::from(values), null_buf)) + as ArrayRef + }}; + } + + let array: ArrayRef = match desc.column_type { + COLUMN_TYPE_INT32 => gather_primitive!(i32, Int32Type), + COLUMN_TYPE_INT64 => gather_primitive!(i64, Int64Type), + COLUMN_TYPE_FLOAT32 => gather_primitive!(f32, Float32Type), + COLUMN_TYPE_FLOAT64 => gather_primitive!(f64, Float64Type), + COLUMN_TYPE_DATE => gather_primitive!(i32, Date32Type), + COLUMN_TYPE_TIMESTAMP => { + let mut values = Vec::::with_capacity(total); + for slice in slices { + let src = slice.data_ptr as *const i64; + if slice.sel_ptr.is_null() { + values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize)); + } + } + } + Arc::new(PrimitiveArray::::new( + ScalarBuffer::from(values), + null_buf, + )) + } + COLUMN_TYPE_TIMESTAMPTZ => { + let mut values = Vec::::with_capacity(total); + for slice in slices { + let src = slice.data_ptr as *const i64; + if slice.sel_ptr.is_null() { + values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize)); + } + } + } + Arc::new( + PrimitiveArray::::new(ScalarBuffer::from(values), null_buf) + .with_timezone("UTC"), + ) + } + COLUMN_TYPE_BOOLEAN => { + let mut bits = vec![0u8; (total + 7) / 8]; + let mut out = 0usize; + for slice in slices { + let src = slice.data_ptr as *const u8; + if slice.sel_ptr.is_null() { + let data = std::slice::from_raw_parts(src, slice.len); + for (i, &v) in data.iter().enumerate() { + if v != 0 { bits[(out + i) / 8] |= 1 << ((out + i) % 8); } + } + } else { + for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len).iter().enumerate() { + if *src.add((idx - 1) as usize) != 0 { + bits[(out + i) / 8] |= 1 << ((out + i) % 8); + } + } + } + out += slice.len; + } + Arc::new(BooleanArray::new(BooleanBuffer::new(Buffer::from(bits), 0, total), null_buf)) + } + COLUMN_TYPE_STRING => { + // Strings are always pre-staged in Julia; data_ptr = *const *const u8, + // lengths_ptr = *const i64, sel_ptr is always null (identity). + let mut all_strings: Vec> = Vec::with_capacity(total); + for slice in slices { + if slice.lengths_ptr.is_null() { + return Err(anyhow::anyhow!("String column requires lengths_ptr")); + } + let ptrs = std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len); + let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len); + for i in 0..slice.len { + let is_null = if !slice.validity_ptr.is_null() { + (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 == 0 + } else { + false + }; + if is_null { + all_strings.push(None); + } else { + let s = std::str::from_utf8(std::slice::from_raw_parts(ptrs[i], lens[i] as usize)) + .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?; + all_strings.push(Some(s)); + } + } + } + Arc::new(StringArray::from(all_strings)) + } + COLUMN_TYPE_UUID => { + let mut data: Vec = Vec::with_capacity(total * 16); + for slice in slices { + let src = slice.data_ptr as *const u8; + if slice.sel_ptr.is_null() { + data.extend_from_slice(std::slice::from_raw_parts(src, slice.len * 16)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + data.extend_from_slice(std::slice::from_raw_parts(src.add((idx - 1) as usize * 16), 16)); + } + } + } + let chunks: Vec<&[u8]> = data.chunks(16).collect(); + Arc::new( + arrow_array::FixedSizeBinaryArray::try_from_iter(chunks.into_iter()) + .map_err(|e| anyhow::anyhow!("Failed to build UUID array: {}", e))?, + ) + } + COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => { + let (precision, scale) = match schema_field.data_type() { + arrow_schema::DataType::Decimal128(p, s) => (*p, *s), + dt => return Err(anyhow::anyhow!("Expected Decimal128, got {:?}", dt)), + }; + let mut values = Vec::::with_capacity(total); + for slice in slices { + match desc.column_type { + COLUMN_TYPE_DECIMAL_INT32 => { + let src = slice.data_ptr as *const i32; + if slice.sel_ptr.is_null() { + values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize) as i128); + } + } + } + COLUMN_TYPE_DECIMAL_INT64 => { + let src = slice.data_ptr as *const i64; + if slice.sel_ptr.is_null() { + values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize) as i128); + } + } + } + _ => { + // DECIMAL_INT128: i128 layout matches Julia Int128 + let src = slice.data_ptr as *const i128; + if slice.sel_ptr.is_null() { + values.extend_from_slice(std::slice::from_raw_parts(src, slice.len)); + } else { + for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { + values.push(*src.add((idx - 1) as usize)); + } + } + } + } + } + Arc::new( + PrimitiveArray::::new(ScalarBuffer::from(values), null_buf) + .with_precision_and_scale(precision, scale) + .map_err(|e| anyhow::anyhow!("Decimal precision/scale: {}", e))?, + ) + } + _ => return Err(anyhow::anyhow!("Unknown column type: {}", desc.column_type)), + }; + Ok(array) +} + // Write columns directly to the Parquet writer. // Accepts an array of ColumnDescriptors and builds a RecordBatch from them, // then writes to the underlying Parquet writer. @@ -267,7 +536,6 @@ export_runtime_op!( async { let (writer_ref, cols) = result_tuple; - // Get the writer's schema (stored when writer was created) let arrow_schema = writer_ref.arrow_schema.clone(); // Validate column count matches schema @@ -279,30 +547,32 @@ export_runtime_op!( )); } - // Get the writer - let iceberg_writer = writer_ref - .writer - .as_mut() - .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?; - - // Build Arrow arrays from column descriptors + // Build Arrow arrays from column descriptors. + // These arrays are zero-copy: they hold raw pointers into Julia-owned buffers. let mut arrays: Vec = Vec::with_capacity(cols.len()); - for (i, desc) in cols.iter().enumerate() { let schema_field = arrow_schema.field(i); let array = unsafe { build_arrow_array(desc, schema_field)? }; arrays.push(array); } - // Create record batch using the table's Arrow schema (with proper field IDs) let batch = RecordBatch::try_new(arrow_schema, arrays) .map_err(|e| anyhow::anyhow!("Failed to create RecordBatch: {}", e))?; - // Write the batch - iceberg_writer - .write(batch) + // Arrow arrays are zero-copy (pointing into Julia buffers) — we must not return + // until Rust has released them. Use a per-batch oneshot: the drain task signals + // after encoding, at which point the Arrow arrays have been dropped. + let (ack_tx, ack_rx) = tokio::sync::oneshot::channel::<()>(); + writer_ref + .batch_tx + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))? + .send((batch, Some(ack_tx))) + .await + .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?; + ack_rx .await - .map_err(|e| anyhow::anyhow!("Failed to write batch: {}", e))?; + .map_err(|_| anyhow::anyhow!("Ack channel closed (drain task may have failed)"))?; Ok::<(), anyhow::Error>(()) }, diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index 8e29037..85b1ee4 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -33,13 +33,14 @@ export IcebergTimestampNs, IcebergTimestamptzNs export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append -export DataFileWriter, free_writer!, close_writer, write_columns +export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers! export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD -export ColumnDescriptor, ColumnType +export ColumnDescriptor, ColumnBatch, ColumnType export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64 export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128 export julia_type_to_column_type +export GatheredColumn, GatheredBatch, add_slice!, add_string_slice!, write_scattered_columns_sync # Always use the JLL library - override via Preferences if needed for local development # To use a local build, set the preference: diff --git a/src/writer.jl b/src/writer.jl index 563d01a..6abab6e 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -36,16 +36,24 @@ Configuration options for the DataFileWriter. # Fields - `prefix::String`: Prefix for output file names (default: "data") -- `target_file_size_bytes::Int`: Target size for rolling to a new file (default: 512 MB) -- `compression::CompressionCodec`: Compression codec for Parquet files (default: UNCOMPRESSED) +- `target_file_size_bytes::Int`: Target file size before rolling to a new file (default: 512 MB) +- `compression::CompressionCodec`: Parquet compression codec (default: UNCOMPRESSED) +- `dictionary_enabled::Bool`: Enable Parquet dictionary encoding globally (default: true) +- `max_row_group_size::Int`: Maximum rows per Parquet row group, 0 = parquet default (1 048 576) +- `data_page_size::Int`: Target uncompressed data page size in bytes, 0 = parquet default (1 048 576) +- `write_batch_size::Int`: Rows encoded per column chunk within a row group, 0 = parquet default (1024). + Increasing this (e.g. 65536) reduces encoding overhead for large row groups. +- `plain_encoding::Bool`: Force PLAIN encoding for all columns, bypassing the DELTA_BINARY_PACKED + default that parquet-rs uses for INT64/INT32 under PARQUET_2_0 (default: false). # Example ```julia -# Override defaults: use smaller file size and ZSTD compression config = WriterConfig( prefix = "my_data", - target_file_size_bytes = 128 * 1024 * 1024, # 128 MB (default is 512 MB) - compression = ZSTD + compression = ZSTD, + dictionary_enabled = false, # disable dict encoding for benchmarking + plain_encoding = true, # use PLAIN to avoid DELTA_BINARY_PACKED overhead + write_batch_size = 65536, # larger batches for better throughput ) writer = DataFileWriter(table, config) ``` @@ -54,6 +62,24 @@ writer = DataFileWriter(table, config) prefix::String = "data" target_file_size_bytes::Int = DEFAULT_TARGET_FILE_SIZE_BYTES compression::CompressionCodec = UNCOMPRESSED + dictionary_enabled::Bool = true + max_row_group_size::Int = 0 + data_page_size::Int = 0 + write_batch_size::Int = 0 + plain_encoding::Bool = false + statistics_enabled::Bool = true +end + +# C-layout struct matching ParquetWriterPropertiesFFI in Rust. +# Fields are ordered largest-to-smallest to match repr(C) padding rules. +struct ParquetWriterPropertiesFFI + max_row_group_size::Int64 + data_page_size::Int64 + write_batch_size::Int64 + compression_codec::Int32 + dictionary_enabled::Bool + use_plain_encoding::Bool + statistics_enabled::Bool end """ @@ -120,6 +146,21 @@ function get_column_metadata(table::Table)::Dict{Symbol, Vector{Pair{String, Str return colmeta end +""" + set_encode_workers!(n::Int) + +Set the number of threads in the global Parquet encode worker pool. + +Must be called before creating any `DataFileWriter`; has no effect once the pool +has been initialized (i.e. after the first writer is created). Defaults to +`Sys.CPU_THREADS` if not set. +""" +function set_encode_workers!(n::Int) + n > 0 || throw(ArgumentError("n must be positive, got $n")) + @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Cvoid + return nothing +end + """ DataFileWriter(table::Table, config::WriterConfig) -> DataFileWriter DataFileWriter(table::Table; prefix="data", target_file_size_bytes=512MB, compression=UNCOMPRESSED) -> DataFileWriter @@ -157,13 +198,22 @@ free_writer!(writer) """ function DataFileWriter(table::Table, config::WriterConfig) response = DataFileWriterResponse() - - async_ccall(response, config.prefix) do handle + parquet_props = Ref(ParquetWriterPropertiesFFI( + Int64(config.max_row_group_size), + Int64(config.data_page_size), + Int64(config.write_batch_size), + Int32(config.compression), + config.dictionary_enabled, + config.plain_encoding, + config.statistics_enabled, + )) + + async_ccall(response, config.prefix, parquet_props) do handle @ccall rust_lib.iceberg_writer_new( table::Table, config.prefix::Cstring, config.target_file_size_bytes::Int64, - Int32(config.compression)::Int32, + parquet_props::Ref{ParquetWriterPropertiesFFI}, response::Ref{DataFileWriterResponse}, handle::Ptr{Cvoid} )::Cint @@ -446,6 +496,47 @@ end # Column-based writing (zero-copy from Julia) # ========================================================================================== +""" + SliceRef + +FFI reference to a single slice of source column data for the scattered-gather writer. + +- `data_ptr`: pointer to source data array (T[]) or string pointers (Ptr{UInt8}[]) +- `lengths_ptr`: for string columns, pointer to lengths array; null for other types +- `validity_ptr`: pointer to validity bitmap (BitVector.chunks); null if all rows valid +- `sel_ptr`: pointer to selection index array (1-based Julia indices); null for sequential access +- `len`: number of rows in this slice + +All fields are 8 bytes — total struct size is 40 bytes with no padding. +""" +struct SliceRef + data_ptr::Ptr{Cvoid} + lengths_ptr::Ptr{Int64} + validity_ptr::Ptr{UInt8} + sel_ptr::Ptr{Int64} + len::Csize_t +end + +""" + GatheredColumnDescriptor + +FFI descriptor for a column to be gathered from multiple SliceRefs. +Pass an array of these to `write_scattered_columns`. + +- `slices_ptr`: pointer to array of SliceRef structs +- `num_slices`: number of SliceRef entries +- `total_rows`: sum of all slice lengths +- `column_type`: ColumnType enum value +- `is_nullable`: whether the column may contain null values +""" +struct GatheredColumnDescriptor + slices_ptr::Ptr{SliceRef} + num_slices::Csize_t + total_rows::Csize_t + column_type::Int32 + is_nullable::Bool +end + """ ColumnType @@ -632,6 +723,47 @@ function Base.push!( return batch end +""" + push!(batch::ColumnBatch, data::Vector{String}, str_ptrs::Vector{Ptr{UInt8}}, str_lens::Vector{Int64}; validity=nothing, length=nothing, column_type=nothing) + +Add a string column to the batch using pre-allocated pointer/length buffers. +The caller is responsible for filling `str_ptrs` and `str_lens` before calling this. +Avoids allocating new pointer/length arrays on every write. +""" +function Base.push!( + batch::ColumnBatch, + data::Vector{String}, + str_ptrs::Vector{Ptr{UInt8}}, + str_lens::Vector{Int64}; + validity::Union{Nothing, BitVector}=nothing, + length::Union{Nothing, Int}=nothing, + column_type::Union{Nothing, ColumnType}=nothing, +) + num_rows = length === nothing ? Base.length(str_ptrs) : length + is_nullable = validity !== nothing + col_type = column_type === nothing ? COLUMN_TYPE_STRING : column_type + + push!(batch.arrays_to_preserve, data, str_ptrs, str_lens) + + validity_ptr = if is_nullable + push!(batch.arrays_to_preserve, validity) + Ptr{UInt8}(pointer(validity.chunks)) + else + Ptr{UInt8}(C_NULL) + end + + desc = ColumnDescriptor( + Ptr{Cvoid}(pointer(str_ptrs)), + pointer(str_lens), + validity_ptr, + Csize_t(num_rows), + Int32(col_type), + is_nullable + ) + push!(batch.descriptors, desc) + return batch +end + """ push!(batch::ColumnBatch, data::Vector{T}; validity=nothing, length=nothing, column_type=nothing) where T @@ -767,3 +899,225 @@ write_columns(writer, batch) function write_columns(writer::DataFileWriter, batch::ColumnBatch) write_columns(writer, batch.descriptors, batch.arrays_to_preserve) end + +# ========================================================================================== +# High-level gathered-column API +# ========================================================================================== + +""" + GatheredColumn + +Accumulates one or more source slices for a single column. Rust gathers the data +directly from source buffers when the batch is written, avoiding a Julia-side staging +copy for numeric columns. + +Typical usage: + +```julia +col = GatheredColumn(COLUMN_TYPE_INT64) +add_slice!(col, src_array) # sequential: all rows +add_slice!(col, src_array2; sel=sel_indices) # scattered: rows at sel_indices +add_slice!(col, src_array3; validity=valid_bv) # nullable slice +``` + +String columns are not supported on the scattered path; use `ColumnBatch` instead. +""" +mutable struct GatheredColumn + slices::Vector{SliceRef} + total_rows::Int + column_type::ColumnType + is_nullable::Bool + preserve::Vector{Any} # source arrays kept alive until write +end + +GatheredColumn(column_type::ColumnType; nullable::Bool=false) = + GatheredColumn(SliceRef[], 0, column_type, nullable, Any[]) + +""" + add_slice!(col::GatheredColumn, data::AbstractVector{T}; + sel=nothing, validity=nothing) + +Append a slice of `data` to `col`. + +- `sel`: optional `Vector{Int64}` of 1-based row indices into `data` to select. + If omitted, all rows of `data` are used sequentially. +- `validity`: optional `BitVector` (length = number of selected rows, `true` = valid). + Providing this marks the column as nullable. +""" +function add_slice!( + col::GatheredColumn, + data::AbstractVector{T}; + sel::Union{Nothing, Vector{Int64}} = nothing, + validity::Union{Nothing, BitVector} = nothing, +) where T + len = sel === nothing ? length(data) : length(sel) + + sel_ptr = if sel !== nothing + push!(col.preserve, sel) + pointer(sel) + else + Ptr{Int64}(C_NULL) + end + + validity_ptr = if validity !== nothing + col.is_nullable = true + push!(col.preserve, validity) + Ptr{UInt8}(pointer(validity.chunks)) + else + Ptr{UInt8}(C_NULL) + end + + push!(col.preserve, data) + push!(col.slices, SliceRef( + Ptr{Cvoid}(pointer(data)), + Ptr{Int64}(C_NULL), # lengths_ptr unused for non-string types + validity_ptr, + sel_ptr, + Csize_t(len), + )) + col.total_rows += len + return col +end + +""" + add_string_slice!(col::GatheredColumn, str_ptrs, str_lens; validity=nothing) + +Append a string slice to `col`. `str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to +UTF-8 string data and `str_lens` is a `Vector{Int64}` of corresponding byte lengths. +The caller is responsible for keeping the pointed-to string bytes alive. +""" +function add_string_slice!( + col::GatheredColumn, + str_ptrs::Vector{Ptr{UInt8}}, + str_lens::Vector{Int64}; + validity::Union{Nothing, BitVector} = nothing, +) + len = length(str_ptrs) + + validity_ptr = if validity !== nothing + col.is_nullable = true + push!(col.preserve, validity) + Ptr{UInt8}(pointer(validity.chunks)) + else + Ptr{UInt8}(C_NULL) + end + + push!(col.preserve, str_ptrs, str_lens) + push!(col.slices, SliceRef( + Ptr{Cvoid}(pointer(str_ptrs)), + pointer(str_lens), + validity_ptr, + Ptr{Int64}(C_NULL), + Csize_t(len), + )) + col.total_rows += len + return col +end + +""" + GatheredBatch + +Collects a `GatheredColumn` per output column, then writes all of them in one call. + +```julia +batch = GatheredBatch() +push!(batch, col_int64) +push!(batch, col_float64) +write_scattered_columns_sync(writer, batch) +``` + +You can also push a single-slice column inline without building a `GatheredColumn` +explicitly: + +```julia +batch = GatheredBatch() +push!(batch, src_ints, COLUMN_TYPE_INT64) +push!(batch, src_floats, COLUMN_TYPE_FLOAT64; sel=indices, validity=valid_bv) +write_scattered_columns_sync(writer, batch) +``` +""" +mutable struct GatheredBatch + columns::Vector{GatheredColumn} +end + +GatheredBatch() = GatheredBatch(GatheredColumn[]) + +""" + push!(batch::GatheredBatch, col::GatheredColumn) + +Append an already-built `GatheredColumn` to the batch. +""" +Base.push!(batch::GatheredBatch, col::GatheredColumn) = (push!(batch.columns, col); batch) + +""" + push!(batch::GatheredBatch, data::AbstractVector, column_type::ColumnType; + sel=nothing, validity=nothing, nullable=false) + +Convenience: create a single-slice `GatheredColumn` from `data` and append it. +""" +function Base.push!( + batch::GatheredBatch, + data::AbstractVector, + column_type::ColumnType; + sel::Union{Nothing, Vector{Int64}} = nothing, + validity::Union{Nothing, BitVector} = nothing, + nullable::Bool = validity !== nothing, +) + col = GatheredColumn(column_type; nullable) + add_slice!(col, data; sel, validity) + push!(batch.columns, col) + return batch +end + +""" + write_scattered_columns_sync(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve]) + +Gather column data from Julia memory synchronously, then encode asynchronously. + +Gathers all column data from Julia memory in the calling thread using a plain blocking +`ccall`. Encode runs asynchronously in the global worker pool. + +`extra_preserve` (optional) is an additional collection of objects whose memory must +stay alive during the gather (e.g. source string arrays for zero-copy string columns). + +The source data pointed to by the `GatheredBatch` slices and `extra_preserve` must be +valid for the duration of this call. After the call returns, all Julia pointers have +been consumed and the source data may be safely released. +""" +function write_scattered_columns_sync( + writer::DataFileWriter, + batch::GatheredBatch, + extra_preserve = nothing, +) + isempty(batch.columns) && throw(IcebergException("GatheredBatch has no columns")) + writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) + + all_slice_arrays = Vector{Vector{SliceRef}}(undef, length(batch.columns)) + descriptors = Vector{GatheredColumnDescriptor}(undef, length(batch.columns)) + preserve = Any[] + + for (i, col) in enumerate(batch.columns) + slices = col.slices + all_slice_arrays[i] = slices + append!(preserve, col.preserve) + push!(preserve, slices) + descriptors[i] = GatheredColumnDescriptor( + pointer(slices), + Csize_t(length(slices)), + Csize_t(col.total_rows), + Int32(col.column_type), + col.is_nullable, + ) + end + extra_preserve !== nothing && append!(preserve, extra_preserve) + + ret = GC.@preserve preserve all_slice_arrays descriptors begin + @ccall rust_lib.iceberg_writer_write_scattered_columns_sync( + writer.ptr::Ptr{Cvoid}, + pointer(descriptors)::Ptr{GatheredColumnDescriptor}, + length(descriptors)::Csize_t, + )::Int32 + end + ret == 0 || throw(IcebergException("write_scattered_columns_sync: gather failed (see writer close for details)")) + return nothing +end From 26176147ed034993d540d80d415491b2eb02d871 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 16:57:46 +0200 Subject: [PATCH 05/25] Add tests --- iceberg_rust_ffi/src/writer.rs | 188 +++++++++++--- iceberg_rust_ffi/src/writer_columns.rs | 326 ++++--------------------- src/writer.jl | 23 +- test/writer_tests.jl | 155 ++++++++++++ 4 files changed, 367 insertions(+), 325 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index bfe95b0..38aae30 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -7,9 +7,9 @@ /// in order. Drain tasks are lightweight async forwarders (no CPU work). use std::ffi::{c_char, c_void}; use std::io::Cursor; +use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; -use std::panic::{catch_unwind, AssertUnwindSafe}; use std::thread; use arrow_array::RecordBatch; @@ -26,7 +26,7 @@ use iceberg::writer::file_writer::ParquetWriterBuilder; use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; use parquet::basic::{Compression, Encoding}; use parquet::file::properties::{EnabledStatistics, WriterProperties}; -use tokio::sync::{mpsc, oneshot}; +use tokio::sync::mpsc; use tokio::task::JoinHandle; /// Compression codec values (must match Julia's CompressionCodec enum) @@ -96,7 +96,9 @@ use crate::response::IcebergBoxedResponse; use crate::table::IcebergTable; use crate::transaction::IcebergDataFiles; use crate::util::parse_c_string; -use crate::writer_columns::{build_arrow_array_scattered, GatheredColumnDescriptor}; +use crate::writer_columns::{ + build_arrow_array_scattered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef, +}; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, }; @@ -106,15 +108,11 @@ type ConcreteDataFileWriter = DataFileWriter; /// Batch item sent through the per-writer drain channel. -/// `ack` is `Some` only for the zero-copy path (`write_columns`), where Arrow arrays -/// point directly into Julia memory — the pool worker sends on it after encoding so -/// Julia knows those buffers are safe to reuse. -pub(crate) type BatchItem = (RecordBatch, Option>); +pub(crate) type BatchItem = RecordBatch; /// Encode task submitted to the global worker pool. struct EncodeTask { batch: RecordBatch, - ack_opt: Option>, state: Arc, } @@ -201,13 +199,9 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { let handle_enc = handle.clone(); let encode_result = catch_unwind(AssertUnwindSafe(move || { - let t0 = std::time::Instant::now(); - // Use unwrap_or_else to recover from a poisoned mutex - // (which happens when a previous encode panicked mid-write). let result: anyhow::Result<()> = { - let mut guard = task.state.writer - .lock() - .unwrap_or_else(|e| e.into_inner()); + let mut guard = + task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); match guard.as_mut() { Some(w) => handle_enc .block_on(w.write(task.batch)) @@ -215,12 +209,6 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { None => Err(anyhow::anyhow!("writer already closed")), } }; - // ACK inside the closure: if we panic before reaching this, - // the Sender is dropped and ack_rx returns RecvError, which - // write_columns propagates as an error to Julia — correct behavior. - if let Some(ack) = task.ack_opt { - let _ = ack.send(()); - } result })); @@ -230,9 +218,7 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")), }; if let Some(e) = err { - let mut slot = state.error - .lock() - .unwrap_or_else(|e| e.into_inner()); + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); if slot.is_none() { *slot = Some(e); } @@ -316,7 +302,11 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync( let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; if col_descs.len() != arrow_schema.fields().len() { - let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); if slot.is_none() { *slot = Some(anyhow::anyhow!( "Column count mismatch: got {} but schema has {}", @@ -329,7 +319,6 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync( // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread). // After this block, all Julia pointers have been consumed — safe to release. - let t0 = std::time::Instant::now(); let batch = match (|| -> Result { let mut arrays = Vec::with_capacity(num_columns); for (i, desc) in col_descs.iter().enumerate() { @@ -340,21 +329,153 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync( })() { Ok(b) => b, Err(e) => { - let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(e); } + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } return -1; } }; // Increment pending before submit so iceberg_writer_close always accounts for this batch. - writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { batch, ack_opt: None, state: writer_ref.writer_state.clone() }; + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { + batch, + state: writer_ref.writer_state.clone(), + }; match pool.task_tx.blocking_send(task) { Ok(()) => 0, Err(_) => { // Pool channel closed — undo the pending increment. - let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel); + let prev = writer_ref + .writer_state + .pending + .fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + writer_ref.writer_state.done_notify.notify_one(); + } + eprintln!("[iceberg:sync] pool channel closed"); + -1 + } + } +} + +/// Synchronous write of flat column data: copies each column from Julia memory into +/// Rust-owned Arrow arrays in the calling thread, then submits to the global encode +/// pool asynchronously. +/// +/// Each `ColumnDescriptor` is treated as a single sequential slice (no scatter/gather). +/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). +#[no_mangle] +pub extern "C" fn iceberg_writer_write_columns_sync( + writer: *mut IcebergDataFileWriter, + columns: *const ColumnDescriptor, + num_columns: usize, +) -> i32 { + if writer.is_null() || columns.is_null() || num_columns == 0 { + return -1; + } + + let writer_ref = unsafe { &*writer }; + + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + return -1; + } + }; + + let arrow_schema = writer_ref.arrow_schema.clone(); + let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; + + if col_descs.len() != arrow_schema.fields().len() { + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(anyhow::anyhow!( + "Column count mismatch: got {} but schema has {}", + col_descs.len(), + arrow_schema.fields().len() + )); + } + return -1; + } + + // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor. + // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order. + let slices: Vec = col_descs + .iter() + .map(|d| SliceRef { + data_ptr: d.data_ptr, + lengths_ptr: d.lengths_ptr, + validity_ptr: d.validity_ptr, + sel_ptr: std::ptr::null(), + len: d.num_rows, + }) + .collect(); + + let gathered: Vec = col_descs + .iter() + .zip(slices.iter()) + .map(|(d, s)| GatheredColumnDescriptor { + slices: s as *const SliceRef, + num_slices: 1, + total_rows: d.num_rows, + column_type: d.column_type, + is_nullable: d.is_nullable, + }) + .collect(); + + let batch = match (|| -> Result { + let mut arrays = Vec::with_capacity(num_columns); + for (i, desc) in gathered.iter().enumerate() { + arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? }); + } + RecordBatch::try_new(arrow_schema, arrays) + .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) + })() { + Ok(b) => b, + Err(e) => { + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } + return -1; + } + }; + + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { + batch, + state: writer_ref.writer_state.clone(), + }; + + match pool.task_tx.blocking_send(task) { + Ok(()) => 0, + Err(_) => { + let prev = writer_ref + .writer_state + .pending + .fetch_sub(1, Ordering::AcqRel); if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); } @@ -473,9 +594,9 @@ export_runtime_op!( let state = writer_state.clone(); let pool_tx = pool.task_tx.clone(); tokio::task::spawn(async move { - while let Some((batch, ack_opt)) = batch_rx.recv().await { + while let Some(batch) = batch_rx.recv().await { state.pending.fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { batch, ack_opt, state: state.clone() }; + let task = EncodeTask { batch, state: state.clone() }; if pool_tx.send(task).await.is_err() { eprintln!("[iceberg:drain] pool channel closed"); break; @@ -540,8 +661,7 @@ export_runtime_op!( while let Some(batch_result) = reader.next() { let batch = batch_result .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?; - // IPC data already copied into ipc_bytes above — no ACK needed. - tx.send((batch, None)).await + tx.send(batch).await .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?; } diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index 0f61849..6bf4c67 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -4,7 +4,6 @@ /// avoiding the overhead of Arrow IPC serialization. Julia passes raw column pointers /// and metadata, and Rust builds Arrow arrays directly from them. use std::ffi::c_void; -use std::ptr::NonNull; use std::sync::Arc; use arrow_array::{ @@ -12,31 +11,9 @@ use arrow_array::{ Date32Type, Decimal128Type, Float32Type, Float64Type, Int32Type, Int64Type, TimestampMicrosecondType, }, - ArrayRef, BooleanArray, PrimitiveArray, RecordBatch, StringArray, -}; -use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer}; - -/// No-op allocator: Arrow buffers created from Julia memory must not be freed by Rust. -/// Julia owns the memory until the per-batch oneshot ACK is received (write_columns path). -struct NoopAllocation; - -/// Build a zero-copy `ScalarBuffer` pointing directly into Julia-owned memory. -/// Safety: `ptr` must be non-null, valid, and alive until the buffer is dropped. -unsafe fn zero_copy_scalar_buffer( - ptr: *const T, - num_rows: usize, -) -> ScalarBuffer { - let byte_len = num_rows * std::mem::size_of::(); - let nn_ptr = NonNull::new_unchecked(ptr as *mut u8); - let buf = Buffer::from_custom_allocation(nn_ptr, byte_len, Arc::new(NoopAllocation)); - ScalarBuffer::new(buf, 0, num_rows) -} -use arrow_schema::DataType; - -use crate::writer::IcebergDataFileWriter; -use object_store_ffi::{ - export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, + ArrayRef, BooleanArray, PrimitiveArray, StringArray, }; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer}; /// Column type codes (must match Julia's ColumnType enum) pub const COLUMN_TYPE_INT32: i32 = 0; @@ -81,168 +58,6 @@ pub struct ColumnDescriptor { unsafe impl Send for ColumnDescriptor {} unsafe impl Sync for ColumnDescriptor {} -/// Build an Arrow array from a ColumnDescriptor and its corresponding schema field. -/// The schema field is used to extract precision/scale for decimal columns. -unsafe fn build_arrow_array( - desc: &ColumnDescriptor, - schema_field: &arrow_schema::Field, -) -> Result { - let null_buffer = if desc.is_nullable && !desc.validity_ptr.is_null() { - // Julia's BitVector uses the same little-endian bit-packed layout as Arrow. - // Point directly into Julia's buffer — no copy needed. - let num_bytes = (desc.num_rows + 7) / 8; - let nn_ptr = NonNull::new_unchecked(desc.validity_ptr as *mut u8); - let buf = Buffer::from_custom_allocation(nn_ptr, num_bytes, Arc::new(NoopAllocation)); - Some(NullBuffer::new(BooleanBuffer::new(buf, 0, desc.num_rows))) - } else { - None - }; - - let array: ArrayRef = match desc.column_type { - COLUMN_TYPE_INT32 => { - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows); - Arc::new(PrimitiveArray::::new(buffer, null_buffer)) - } - COLUMN_TYPE_INT64 => { - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); - Arc::new(PrimitiveArray::::new(buffer, null_buffer)) - } - COLUMN_TYPE_FLOAT32 => { - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f32, desc.num_rows); - Arc::new(PrimitiveArray::::new(buffer, null_buffer)) - } - COLUMN_TYPE_FLOAT64 => { - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f64, desc.num_rows); - Arc::new(PrimitiveArray::::new(buffer, null_buffer)) - } - COLUMN_TYPE_DATE => { - // Date is stored as Int32 (days since epoch) - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows); - Arc::new(PrimitiveArray::::new(buffer, null_buffer)) - } - COLUMN_TYPE_TIMESTAMP => { - // Timestamp without timezone (Iceberg `timestamp`) - // Stored as Int64 microseconds since epoch - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); - Arc::new(PrimitiveArray::::new( - buffer, - null_buffer, - )) - } - COLUMN_TYPE_TIMESTAMPTZ => { - // Timestamp with UTC timezone (Iceberg `timestamptz`) - // Stored as Int64 microseconds since epoch, with timezone metadata - let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows); - Arc::new( - PrimitiveArray::::new(buffer, null_buffer) - .with_timezone("UTC"), - ) - } - COLUMN_TYPE_BOOLEAN => { - let data = std::slice::from_raw_parts(desc.data_ptr as *const u8, desc.num_rows); - // Convert bytes to boolean buffer - let mut bits = vec![0u8; (desc.num_rows + 7) / 8]; - for (i, &val) in data.iter().enumerate() { - if val != 0 { - bits[i / 8] |= 1 << (i % 8); - } - } - let values = BooleanBuffer::new(Buffer::from(bits), 0, desc.num_rows); - Arc::new(BooleanArray::new(values, null_buffer)) - } - COLUMN_TYPE_STRING => { - // String data passed from Julia: - // - data_ptr: pointer to array of string pointers (each pointing to UTF-8 bytes) - // - lengths_ptr: pointer to array of string lengths (Int64) - // Note: While we avoid copying on the Julia side (just passing pointers), - // Arrow's StringArray copies the data into its own contiguous buffer below. - if desc.lengths_ptr.is_null() { - return Err(anyhow::anyhow!("String column requires lengths")); - } - let str_ptrs = - std::slice::from_raw_parts(desc.data_ptr as *const *const u8, desc.num_rows); - let str_lens = std::slice::from_raw_parts(desc.lengths_ptr, desc.num_rows); - - // Build string references, then Arrow copies them into its internal buffer - let mut strings: Vec> = Vec::with_capacity(desc.num_rows); - for i in 0..desc.num_rows { - let is_null: bool = if let Some(ref nb) = null_buffer { - nb.is_null(i) - } else { - false - }; - if is_null { - strings.push(None); - } else { - let ptr = str_ptrs[i]; - let len = str_lens[i] as usize; - let bytes = std::slice::from_raw_parts(ptr, len); - let s = std::str::from_utf8(bytes) - .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in string column: {}", e))?; - strings.push(Some(s)); - } - } - Arc::new(StringArray::from(strings)) - } - COLUMN_TYPE_UUID => { - // UUID is stored as 16 bytes (UInt128 in Julia) - // Store as fixed-size binary (16 bytes per value) - let data = std::slice::from_raw_parts(desc.data_ptr as *const u8, desc.num_rows * 16); - - // Build the array using the builder or from_iter_values - let values: Vec<&[u8]> = data.chunks(16).collect(); - Arc::new( - arrow_array::FixedSizeBinaryArray::try_from_iter(values.into_iter()) - .map_err(|e| anyhow::anyhow!("Failed to create UUID array: {}", e))?, - ) - } - COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => { - // All decimal variants map to Arrow Decimal128. Precision and scale come from - // the schema field (set when the Iceberg table was created). - let (precision, scale) = match schema_field.data_type() { - DataType::Decimal128(p, s) => (*p, *s), - dt => { - return Err(anyhow::anyhow!( - "Expected Decimal128 schema field for decimal column, got {:?}", - dt - )) - } - }; - - // All decimal variants produce an Arrow Decimal128 (i128 backing). - // INT32/INT64 variants need widening — materialize to avoid a cast loop - // in the zero-copy path; INT128 shares the same layout and is zero-copy. - let buffer: ScalarBuffer = match desc.column_type { - COLUMN_TYPE_DECIMAL_INT32 => { - let data = - std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows); - ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::>()) - } - COLUMN_TYPE_DECIMAL_INT64 => { - let data = - std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows); - ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::>()) - } - _ => { - // COLUMN_TYPE_DECIMAL_INT128: Julia Int128 and Rust i128 share the same - // 16-byte little-endian layout on all supported platforms — zero-copy. - zero_copy_scalar_buffer(desc.data_ptr as *const i128, desc.num_rows) - } - }; - Arc::new( - PrimitiveArray::::new(buffer, null_buffer) - .with_precision_and_scale(precision, scale) - .map_err(|e| anyhow::anyhow!("Failed to set decimal precision/scale: {}", e))?, - ) - } - _ => { - return Err(anyhow::anyhow!("Unknown column type: {}", desc.column_type)); - } - }; - - Ok(array) -} - // ============================================================================= // Scattered-gather writer: pass raw source pointers + selection indices to Rust, // which gathers the data directly into Arrow arrays — eliminating the Julia-side @@ -286,7 +101,10 @@ unsafe impl Sync for GatheredColumnDescriptor {} /// Build the Arrow null buffer by scanning validity bits across all slices. /// Returns `None` if every slice has a null `validity_ptr` (all rows valid). -unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) -> Option { +unsafe fn build_null_buffer_scattered( + slices: &[SliceRef], + total_rows: usize, +) -> Option { if !slices.iter().any(|s| !s.validity_ptr.is_null()) { return None; } @@ -315,7 +133,11 @@ unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) -> } out += slice.len; } - Some(NullBuffer::new(BooleanBuffer::new(Buffer::from(bits), 0, total_rows))) + Some(NullBuffer::new(BooleanBuffer::new( + Buffer::from(bits), + 0, + total_rows, + ))) } /// Gather all slices for a column into an Arrow array. @@ -346,8 +168,10 @@ pub(crate) unsafe fn build_arrow_array_scattered( } } } - Arc::new(PrimitiveArray::<$ArrowType>::new(ScalarBuffer::from(values), null_buf)) - as ArrayRef + Arc::new(PrimitiveArray::<$ArrowType>::new( + ScalarBuffer::from(values), + null_buf, + )) as ArrayRef }}; } @@ -387,8 +211,11 @@ pub(crate) unsafe fn build_arrow_array_scattered( } } Arc::new( - PrimitiveArray::::new(ScalarBuffer::from(values), null_buf) - .with_timezone("UTC"), + PrimitiveArray::::new( + ScalarBuffer::from(values), + null_buf, + ) + .with_timezone("UTC"), ) } COLUMN_TYPE_BOOLEAN => { @@ -399,10 +226,15 @@ pub(crate) unsafe fn build_arrow_array_scattered( if slice.sel_ptr.is_null() { let data = std::slice::from_raw_parts(src, slice.len); for (i, &v) in data.iter().enumerate() { - if v != 0 { bits[(out + i) / 8] |= 1 << ((out + i) % 8); } + if v != 0 { + bits[(out + i) / 8] |= 1 << ((out + i) % 8); + } } } else { - for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len).iter().enumerate() { + for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len) + .iter() + .enumerate() + { if *src.add((idx - 1) as usize) != 0 { bits[(out + i) / 8] |= 1 << ((out + i) % 8); } @@ -410,7 +242,10 @@ pub(crate) unsafe fn build_arrow_array_scattered( } out += slice.len; } - Arc::new(BooleanArray::new(BooleanBuffer::new(Buffer::from(bits), 0, total), null_buf)) + Arc::new(BooleanArray::new( + BooleanBuffer::new(Buffer::from(bits), 0, total), + null_buf, + )) } COLUMN_TYPE_STRING => { // Strings are always pre-staged in Julia; data_ptr = *const *const u8, @@ -420,7 +255,8 @@ pub(crate) unsafe fn build_arrow_array_scattered( if slice.lengths_ptr.is_null() { return Err(anyhow::anyhow!("String column requires lengths_ptr")); } - let ptrs = std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len); + let ptrs = + std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len); let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len); for i in 0..slice.len { let is_null = if !slice.validity_ptr.is_null() { @@ -431,8 +267,11 @@ pub(crate) unsafe fn build_arrow_array_scattered( if is_null { all_strings.push(None); } else { - let s = std::str::from_utf8(std::slice::from_raw_parts(ptrs[i], lens[i] as usize)) - .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?; + let s = std::str::from_utf8(std::slice::from_raw_parts( + ptrs[i], + lens[i] as usize, + )) + .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?; all_strings.push(Some(s)); } } @@ -447,7 +286,10 @@ pub(crate) unsafe fn build_arrow_array_scattered( data.extend_from_slice(std::slice::from_raw_parts(src, slice.len * 16)); } else { for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { - data.extend_from_slice(std::slice::from_raw_parts(src.add((idx - 1) as usize * 16), 16)); + data.extend_from_slice(std::slice::from_raw_parts( + src.add((idx - 1) as usize * 16), + 16, + )); } } } @@ -468,7 +310,11 @@ pub(crate) unsafe fn build_arrow_array_scattered( COLUMN_TYPE_DECIMAL_INT32 => { let src = slice.data_ptr as *const i32; if slice.sel_ptr.is_null() { - values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128)); + values.extend( + std::slice::from_raw_parts(src, slice.len) + .iter() + .map(|&v| v as i128), + ); } else { for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { values.push(*src.add((idx - 1) as usize) as i128); @@ -478,7 +324,11 @@ pub(crate) unsafe fn build_arrow_array_scattered( COLUMN_TYPE_DECIMAL_INT64 => { let src = slice.data_ptr as *const i64; if slice.sel_ptr.is_null() { - values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128)); + values.extend( + std::slice::from_raw_parts(src, slice.len) + .iter() + .map(|&v| v as i128), + ); } else { for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) { values.push(*src.add((idx - 1) as usize) as i128); @@ -508,75 +358,3 @@ pub(crate) unsafe fn build_arrow_array_scattered( }; Ok(array) } - -// Write columns directly to the Parquet writer. -// Accepts an array of ColumnDescriptors and builds a RecordBatch from them, -// then writes to the underlying Parquet writer. -// The caller must ensure all pointers are valid and point to appropriately sized data. -export_runtime_op!( - iceberg_writer_write_columns, - crate::IcebergResponse, - || { - if writer.is_null() { - return Err(anyhow::anyhow!("Null writer pointer provided")); - } - if columns.is_null() || num_columns == 0 { - return Err(anyhow::anyhow!("No columns provided")); - } - - // Copy column descriptors for safe use across await - let cols: Vec = unsafe { - std::slice::from_raw_parts(columns, num_columns).to_vec() - }; - - let writer_ref = unsafe { &mut *writer }; - Ok((writer_ref, cols)) - }, - result_tuple, - async { - let (writer_ref, cols) = result_tuple; - - let arrow_schema = writer_ref.arrow_schema.clone(); - - // Validate column count matches schema - if cols.len() != arrow_schema.fields().len() { - return Err(anyhow::anyhow!( - "Column count mismatch: got {} columns but schema has {} fields", - cols.len(), - arrow_schema.fields().len() - )); - } - - // Build Arrow arrays from column descriptors. - // These arrays are zero-copy: they hold raw pointers into Julia-owned buffers. - let mut arrays: Vec = Vec::with_capacity(cols.len()); - for (i, desc) in cols.iter().enumerate() { - let schema_field = arrow_schema.field(i); - let array = unsafe { build_arrow_array(desc, schema_field)? }; - arrays.push(array); - } - - let batch = RecordBatch::try_new(arrow_schema, arrays) - .map_err(|e| anyhow::anyhow!("Failed to create RecordBatch: {}", e))?; - - // Arrow arrays are zero-copy (pointing into Julia buffers) — we must not return - // until Rust has released them. Use a per-batch oneshot: the drain task signals - // after encoding, at which point the Arrow arrays have been dropped. - let (ack_tx, ack_rx) = tokio::sync::oneshot::channel::<()>(); - writer_ref - .batch_tx - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))? - .send((batch, Some(ack_tx))) - .await - .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?; - ack_rx - .await - .map_err(|_| anyhow::anyhow!("Ack channel closed (drain task may have failed)"))?; - - Ok::<(), anyhow::Error>(()) - }, - writer: *mut IcebergDataFileWriter, - columns: *const ColumnDescriptor, - num_columns: usize -); diff --git a/src/writer.jl b/src/writer.jl index 6abab6e..4e41776 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -844,29 +844,18 @@ write_columns(writer, [desc], (data, validity)) # Arrays preserved during call ``` """ function write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor}, arrays_to_preserve) - if writer.ptr == C_NULL - throw(IcebergException("Writer has been freed")) - end - - if isempty(columns) - throw(IcebergException("No columns provided")) - end - - response = Response{Cvoid}(-1, nothing, C_NULL, C_NULL) + writer.ptr == C_NULL && throw(IcebergException("Writer has been freed")) + isempty(columns) && throw(IcebergException("No columns provided")) - # Pass arrays_to_preserve to async_ccall so GC.@preserve keeps them alive - async_ccall(response, columns, arrays_to_preserve) do handle - @ccall rust_lib.iceberg_writer_write_columns( + ret = GC.@preserve columns arrays_to_preserve begin + @ccall rust_lib.iceberg_writer_write_columns_sync( writer.ptr::Ptr{Cvoid}, pointer(columns)::Ptr{ColumnDescriptor}, length(columns)::Csize_t, - response::Ref{Response{Cvoid}}, - handle::Ptr{Cvoid} - )::Cint + )::Int32 end - @throw_on_error(response, "write_columns", IcebergException) - + ret == 0 || throw(IcebergException("write_columns failed (see writer close for details)")) return nothing end diff --git a/test/writer_tests.jl b/test/writer_tests.jl index f8b1bef..b8ad8a3 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1126,3 +1126,158 @@ end println("\n✅ write_columns decimal nullable tests completed!") end + +@testset "Writer write_scattered_columns_sync API" begin + println("Testing write_scattered_columns_sync (gathered-column) API...") + + catalog_uri = get_catalog_uri() + props = get_catalog_properties() + + catalog = nothing + table = C_NULL + data_files = nothing + test_namespace = nothing + table_name = nothing + + try + catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) + @test catalog !== nothing + + test_namespace = ["test_gathered_$(round(Int, time() * 1000))"] + RustyIceberg.create_namespace(catalog, test_namespace) + + # Schema: id (non-nullable long), score (nullable double), tag (nullable string) + schema = Schema([ + Field(Int32(1), "id", IcebergLong(); required=true), + Field(Int32(2), "score", IcebergDouble(); required=false), + Field(Int32(3), "tag", IcebergString(); required=false), + ]) + + table_name = "gathered_test_$(round(Int, time() * 1000))" + table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema) + @test table != C_NULL + println("✅ Table created") + + # --- Data layout (4 rows) --- + # id: [1, 2, 3, 4] — non-nullable, single sequential slice + # score: [1.1, null, 3.3, null] — nullable; two sequential slices, each with validity + # tag: ["alpha", null, "gamma", null] — nullable string via add_string_slice! + + data_files = RustyIceberg.with_data_file_writer(table) do writer + batch = RustyIceberg.GatheredBatch() + + # id: single sequential slice, no nulls + id_data = Int64[1, 2, 3, 4] + id_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_INT64) + RustyIceberg.add_slice!(id_col, id_data) + push!(batch, id_col) + println("✅ id column built (sequential, non-nullable)") + + # score: two sequential slices, each with validity masks + # Slice 1 — src = [1.1, 9.9], validity = [true, false] → rows 0 (1.1) and 1 (null) + # Slice 2 — src = [3.3, 8.8] via selection [1] + identity [8.8] (just sequential here) + # Use scattered access for slice 2: src=[99.9, 3.3, 88.8], sel=[2] → picks 3.3 + score_src1 = Float64[1.1, 9.9] + score_valid1 = BitVector([true, false]) + + score_src2 = Float64[99.9, 3.3, 88.8] + score_sel2 = Int64[2] # picks index 2 → 3.3 (1-based) + score_valid2 = BitVector([true]) + + score_src3 = Float64[7.7, 8.8] + score_valid3 = BitVector([false]) # null for row 3 + + score_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_FLOAT64; nullable=true) + RustyIceberg.add_slice!(score_col, score_src1; validity=score_valid1) + RustyIceberg.add_slice!(score_col, score_src2; sel=score_sel2, validity=score_valid2) + RustyIceberg.add_slice!(score_col, score_src3; sel=Int64[1], validity=score_valid3) + push!(batch, score_col) + println("✅ score column built (scattered + nullable)") + + # tag: string column via add_string_slice! + # Row 0: "alpha", row 1: null, row 2: "gamma", row 3: null + tag_strings = ["alpha", "", "gamma", ""] # empty at null positions + tag_ptrs = Vector{Ptr{UInt8}}(undef, 4) + tag_lens = Vector{Int64}(undef, 4) + tag_valid = BitVector([true, false, true, false]) + for i in 1:4 + if tag_valid[i] + tag_ptrs[i] = pointer(tag_strings[i]) + tag_lens[i] = ncodeunits(tag_strings[i]) + else + tag_ptrs[i] = Ptr{UInt8}(C_NULL) + tag_lens[i] = 0 + end + end + + tag_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_STRING; nullable=true) + RustyIceberg.add_string_slice!(tag_col, tag_ptrs, tag_lens; validity=tag_valid) + # Preserve source strings so pointers stay valid until write completes + push!(tag_col.preserve, tag_strings) + push!(batch, tag_col) + println("✅ tag column built (string via add_string_slice!)") + + RustyIceberg.write_scattered_columns_sync(writer, batch) + println("✅ Batch written via write_scattered_columns_sync") + end + @test data_files !== nothing && data_files.ptr != C_NULL + println("✅ Writer closed") + + updated_table = RustyIceberg.with_transaction(table, catalog) do tx + RustyIceberg.with_fast_append(tx) do action + RustyIceberg.add_data_files(action, data_files) + end + end + @test updated_table != C_NULL + println("✅ Transaction committed") + + tbl = read_table_data(updated_table) + @test tbl !== nothing + @test length(tbl.id) == 4 + println("✅ Read $(length(tbl.id)) rows") + + perm = sortperm(tbl.id) + sorted_ids = tbl.id[perm] + sorted_scores = tbl.score[perm] + sorted_tags = tbl.tag[perm] + + # Verify id column (non-nullable, sequential) + @test sorted_ids == Int64[1, 2, 3, 4] + println("✅ id values correct") + + # Verify score column (nullable, scattered slices) + @test !ismissing(sorted_scores[1]) && sorted_scores[1] ≈ 1.1 + @test ismissing(sorted_scores[2]) + @test !ismissing(sorted_scores[3]) && sorted_scores[3] ≈ 3.3 + @test ismissing(sorted_scores[4]) + println("✅ score values correct (including nulls)") + + # Verify tag column (nullable string) + @test !ismissing(sorted_tags[1]) && sorted_tags[1] == "alpha" + @test ismissing(sorted_tags[2]) + @test !ismissing(sorted_tags[3]) && sorted_tags[3] == "gamma" + @test ismissing(sorted_tags[4]) + println("✅ tag values correct (including nulls)") + + RustyIceberg.free_table(updated_table) + + finally + if data_files !== nothing && data_files.ptr != C_NULL + RustyIceberg.free_data_files!(data_files) + end + if table != C_NULL + RustyIceberg.free_table(table) + end + if table_name !== nothing && test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_table(catalog, test_namespace, table_name) + end + if test_namespace !== nothing && catalog !== nothing + RustyIceberg.drop_namespace(catalog, test_namespace) + end + if catalog !== nothing + RustyIceberg.free_catalog!(catalog) + end + end + + println("\n✅ write_scattered_columns_sync API tests completed!") +end From a2c016964ab11f36ad4fd92f8327b6a6415c2ff9 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 16:59:09 +0200 Subject: [PATCH 06/25] Bump version --- iceberg_rust_ffi/Cargo.lock | 2 +- iceberg_rust_ffi/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock index 8070ddd..c2cd772 100644 --- a/iceberg_rust_ffi/Cargo.lock +++ b/iceberg_rust_ffi/Cargo.lock @@ -1638,7 +1638,7 @@ dependencies = [ [[package]] name = "iceberg_rust_ffi" -version = "0.7.17" +version = "0.7.19" dependencies = [ "anyhow", "arrow-array", diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml index 7b2cd92..f76f726 100644 --- a/iceberg_rust_ffi/Cargo.toml +++ b/iceberg_rust_ffi/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "iceberg_rust_ffi" -version = "0.7.17" +version = "0.7.19" edition = "2021" [lib] From 218f426acd225b75803faef63950f19b6b39a5c6 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:18:09 +0200 Subject: [PATCH 07/25] Fix test --- iceberg_rust_ffi/src/writer.rs | 168 ++++++++++--------------- iceberg_rust_ffi/src/writer_columns.rs | 10 +- src/writer.jl | 21 +--- 3 files changed, 74 insertions(+), 125 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 38aae30..fb04b84 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -4,7 +4,7 @@ /// across all writers. Per-writer ordering is guaranteed by the per-writer /// `Arc>` inside WriterState: only one pool thread encodes /// a given writer at a time, and the FIFO global queue ensures batches are submitted -/// in order. Drain tasks are lightweight async forwarders (no CPU work). +/// in order. use std::ffi::{c_char, c_void}; use std::io::Cursor; use std::panic::{catch_unwind, AssertUnwindSafe}; @@ -26,8 +26,6 @@ use iceberg::writer::file_writer::ParquetWriterBuilder; use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; use parquet::basic::{Compression, Encoding}; use parquet::file::properties::{EnabledStatistics, WriterProperties}; -use tokio::sync::mpsc; -use tokio::task::JoinHandle; /// Compression codec values (must match Julia's CompressionCodec enum) const COMPRESSION_UNCOMPRESSED: i32 = 0; @@ -107,9 +105,6 @@ use object_store_ffi::{ type ConcreteDataFileWriter = DataFileWriter; -/// Batch item sent through the per-writer drain channel. -pub(crate) type BatchItem = RecordBatch; - /// Encode task submitted to the global worker pool. struct EncodeTask { batch: RecordBatch, @@ -240,18 +235,12 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { /// Opaque writer handle for FFI. /// -/// Writing is pipelined: Julia gathers a RecordBatch, sends it to the bounded per-writer -/// channel (capacity 1), and returns immediately. A lightweight async drain task forwards -/// batches to the global encode pool. Pool workers (N = available_parallelism) encode -/// Parquet concurrently across all active writers. +/// Writing is pipelined: Julia gathers a RecordBatch and submits it directly to the +/// global encode pool, then returns immediately. Pool workers (N = available_parallelism) +/// encode Parquet concurrently across all active writers. pub struct IcebergDataFileWriter { /// Arrow schema for this table, used by write_columns to create RecordBatches. pub(crate) arrow_schema: ArrowSchemaRef, - /// Sender side of the per-writer batch queue (bounded, capacity 1). - /// Dropped in iceberg_writer_close to signal EOF to the drain task. - pub(crate) batch_tx: Option>, - /// Lightweight async drain task: forwards batches from batch_tx to the global pool. - pub(crate) drain_handle: Option>, /// Shared state: owns the ConcreteDataFileWriter, tracks pending count and errors. pub(crate) writer_state: Arc, } @@ -485,16 +474,12 @@ pub extern "C" fn iceberg_writer_write_columns_sync( } } -/// Free a writer. Aborts the drain task and poisons the writer state so any in-flight -/// pool tasks fail gracefully (error/cancel path). +/// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully. #[no_mangle] pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { if !writer.is_null() { unsafe { let boxed = Box::from_raw(writer); - if let Some(handle) = boxed.drain_handle { - handle.abort(); - } // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error // rather than writing to a partially-freed writer. let _ = boxed.writer_state.writer.lock().unwrap().take(); @@ -504,8 +489,7 @@ pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) { // Create a new DataFileWriter from a table with configuration options. // -// Spawns a lightweight async drain task per writer. The global encode pool -// (N = available_parallelism threads) is initialized on the first call. +// The global encode pool (N = available_parallelism threads) is initialized on the first call. export_runtime_op!( iceberg_writer_new, IcebergDataFileWriterResponse, @@ -575,7 +559,7 @@ export_runtime_op!( ); // Initialize global pool (no-op if already running). - let pool = get_or_init_encode_pool(); + get_or_init_encode_pool(); let writer_state = Arc::new(WriterState { writer: Mutex::new(Some(concrete_writer)), @@ -584,31 +568,8 @@ export_runtime_op!( error: Mutex::new(None), }); - // Bounded per-writer channel (capacity 1): Julia can be at most one batch ahead - // of the pool. This provides backpressure: send blocks if the drain task hasn't - // forwarded the previous batch yet. - let (batch_tx, mut batch_rx) = mpsc::channel::(1); - - // Lightweight drain task: forwards batches to the global pool asynchronously. - let drain_handle: JoinHandle<()> = { - let state = writer_state.clone(); - let pool_tx = pool.task_tx.clone(); - tokio::task::spawn(async move { - while let Some(batch) = batch_rx.recv().await { - state.pending.fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { batch, state: state.clone() }; - if pool_tx.send(task).await.is_err() { - eprintln!("[iceberg:drain] pool channel closed"); - break; - } - } - }) - }; - Ok::(IcebergDataFileWriter { arrow_schema, - batch_tx: Some(batch_tx), - drain_handle: Some(drain_handle), writer_state, }) }, @@ -618,65 +579,71 @@ export_runtime_op!( parquet_props: *const ParquetWriterPropertiesFFI ); -// Write Arrow IPC data to the writer. -// -// Deserializes the IPC stream, then sends each RecordBatch to the drain task's channel. -// Returns immediately after queuing — encoding happens asynchronously in the global pool. -export_runtime_op!( - iceberg_writer_write, - crate::IcebergResponse, - || { - if writer.is_null() { - return Err(anyhow::anyhow!("Null writer pointer provided")); - } - if arrow_ipc_data.is_null() { - return Err(anyhow::anyhow!("Null arrow_ipc_data pointer provided")); +/// Write Arrow IPC data synchronously: copy IPC bytes from Julia, deserialize the stream, +/// and submit each RecordBatch to the global encode pool. +/// +/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). +#[no_mangle] +pub extern "C" fn iceberg_writer_write_sync( + writer: *mut IcebergDataFileWriter, + arrow_ipc_data: *const u8, + arrow_ipc_len: usize, +) -> i32 { + if writer.is_null() || arrow_ipc_data.is_null() || arrow_ipc_len == 0 { + return -1; + } + + let writer_ref = unsafe { &*writer }; + + let pool = match GLOBAL_ENCODE_POOL.get() { + Some(p) => p, + None => { + eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + return -1; } - if arrow_ipc_len == 0 { - return Err(anyhow::anyhow!("Arrow IPC data length is zero")); + }; + + let ipc_bytes = unsafe { + std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() + }; + + let cursor = Cursor::new(ipc_bytes); + let reader = match StreamReader::try_new(cursor, None) { + Ok(r) => r, + Err(e) => { + let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); } + return -1; } + }; - // Copy the IPC data into a Vec for safe use across await points - let ipc_bytes = unsafe { - std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() + for batch_result in reader { + let batch = match batch_result { + Ok(b) => b, + Err(e) => { + let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); } + return -1; + } }; - - let writer_ref = unsafe { &mut *writer }; - Ok((writer_ref, ipc_bytes)) - }, - result_tuple, - async { - let (writer_ref, ipc_bytes) = result_tuple; - - let tx = writer_ref - .batch_tx - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?; - - // Deserialize Arrow IPC to RecordBatches and enqueue each one - let cursor = Cursor::new(ipc_bytes); - let mut reader = StreamReader::try_new(cursor, None) - .map_err(|e| anyhow::anyhow!("Failed to create Arrow IPC reader: {}", e))?; - - while let Some(batch_result) = reader.next() { - let batch = batch_result - .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?; - tx.send(batch).await - .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?; + writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { batch, state: writer_ref.writer_state.clone() }; + if pool.task_tx.blocking_send(task).is_err() { + let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); } + let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { *slot = Some(anyhow::anyhow!("Encode pool closed")); } + return -1; } + } - Ok::<(), anyhow::Error>(()) - }, - writer: *mut IcebergDataFileWriter, - arrow_ipc_data: *const u8, - arrow_ipc_len: usize -); + 0 +} // Close the writer and return the produced DataFiles. // -// Drops the batch sender (signals EOF to the drain task), waits for the drain task to -// finish forwarding all batches, waits for all pool encodes to complete, then finalizes -// the Parquet file and returns the DataFiles metadata. +// Waits for all pending pool encodes to complete, then finalizes the Parquet file +// and returns the DataFiles metadata. export_runtime_op!( iceberg_writer_close, IcebergWriterCloseResponse, @@ -689,15 +656,6 @@ export_runtime_op!( }, writer_ref, async { - // Drop the sender — signals the drain task that no more batches are coming - drop(writer_ref.batch_tx.take()); - - // Wait for the drain task to finish forwarding all batches to the pool - if let Some(handle) = writer_ref.drain_handle.take() { - handle.await - .map_err(|e| anyhow::anyhow!("Drain task panicked: {}", e))?; - } - // Wait for all pending pool encodes to complete. // Uses a timeout to guard against a dead worker thread (e.g. panic outside // catch_unwind) that would otherwise leave pending > 0 forever. diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index 6bf4c67..b099e3b 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -123,11 +123,11 @@ unsafe fn build_null_buffer_scattered( bits[(out + i) / 8] |= b << ((out + i) % 8); } } else { - // Scattered: gather validity bits via selection indices (1-based) - let sel = std::slice::from_raw_parts(slice.sel_ptr, slice.len); - for (i, &idx) in sel.iter().enumerate() { - let src = (idx - 1) as usize; - let b = (*slice.validity_ptr.add(src / 8) >> (src % 8)) & 1; + // Scattered: validity is per output row (0..len-1), same as sequential. + // The selection governs which *data* elements are picked, not which + // validity bits — the validity bitmap has one bit per output row. + for i in 0..slice.len { + let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1; bits[(out + i) / 8] |= b << ((out + i) % 8); } } diff --git a/src/writer.jl b/src/writer.jl index 4e41776..c9d50a5 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -415,23 +415,14 @@ end # Internal helper to write raw IPC bytes to the Rust writer function _write_ipc_bytes(writer::DataFileWriter, ipc_bytes::Vector{UInt8}) - ipc_data = pointer(ipc_bytes) - ipc_len = length(ipc_bytes) - - response = Response{Cvoid}(-1, nothing, C_NULL, C_NULL) - - async_ccall(response, ipc_bytes) do handle - @ccall rust_lib.iceberg_writer_write( + ret = GC.@preserve ipc_bytes begin + @ccall rust_lib.iceberg_writer_write_sync( writer.ptr::Ptr{Cvoid}, - ipc_data::Ptr{UInt8}, - ipc_len::Csize_t, - response::Ref{Response{Cvoid}}, - handle::Ptr{Cvoid} - )::Cint + pointer(ipc_bytes)::Ptr{UInt8}, + length(ipc_bytes)::Csize_t, + )::Int32 end - - @throw_on_error(response, "write", IcebergException) - + ret == 0 || throw(IcebergException("write failed (see writer close for details)")) return nothing end From a1a78babb2ed3788380778baa58409af3ad8c54f Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:20:31 +0200 Subject: [PATCH 08/25] format --- iceberg_rust_ffi/src/writer.rs | 53 +++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index fb04b84..8dff45f 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -603,16 +603,20 @@ pub extern "C" fn iceberg_writer_write_sync( } }; - let ipc_bytes = unsafe { - std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() - }; + let ipc_bytes = unsafe { std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() }; let cursor = Cursor::new(ipc_bytes); let reader = match StreamReader::try_new(cursor, None) { Ok(r) => r, Err(e) => { - let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); } + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); + } return -1; } }; @@ -621,18 +625,41 @@ pub extern "C" fn iceberg_writer_write_sync( let batch = match batch_result { Ok(b) => b, Err(e) => { - let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); } + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); + } return -1; } }; - writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { batch, state: writer_ref.writer_state.clone() }; + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { + batch, + state: writer_ref.writer_state.clone(), + }; if pool.task_tx.blocking_send(task).is_err() { - let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); } - let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { *slot = Some(anyhow::anyhow!("Encode pool closed")); } + let prev = writer_ref + .writer_state + .pending + .fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + writer_ref.writer_state.done_notify.notify_one(); + } + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(anyhow::anyhow!("Encode pool closed")); + } return -1; } } From 3e46312ebce9a3be74fe12dea2f009e555282d62 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:23:30 +0200 Subject: [PATCH 09/25] Rename fn --- src/RustyIceberg.jl | 2 +- src/writer.jl | 6 +++--- test/writer_tests.jl | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl index a9a3385..41551f6 100644 --- a/src/RustyIceberg.jl +++ b/src/RustyIceberg.jl @@ -40,7 +40,7 @@ export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FL export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128 export julia_type_to_column_type -export GatheredColumn, GatheredBatch, add_slice!, add_string_slice!, write_scattered_columns_sync +export GatheredColumn, GatheredBatch, add_slice!, add_string_slice! # Always use the JLL library - override via Preferences if needed for local development # To use a local build, set the preference: diff --git a/src/writer.jl b/src/writer.jl index c9d50a5..522f902 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -1050,7 +1050,7 @@ function Base.push!( end """ - write_scattered_columns_sync(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve]) + write_columns(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve]) Gather column data from Julia memory synchronously, then encode asynchronously. @@ -1064,7 +1064,7 @@ The source data pointed to by the `GatheredBatch` slices and `extra_preserve` mu valid for the duration of this call. After the call returns, all Julia pointers have been consumed and the source data may be safely released. """ -function write_scattered_columns_sync( +function write_columns( writer::DataFileWriter, batch::GatheredBatch, extra_preserve = nothing, @@ -1098,6 +1098,6 @@ function write_scattered_columns_sync( length(descriptors)::Csize_t, )::Int32 end - ret == 0 || throw(IcebergException("write_scattered_columns_sync: gather failed (see writer close for details)")) + ret == 0 || throw(IcebergException("write_columns (gathered): gather failed (see writer close for details)")) return nothing end diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 8d263d1..80c9380 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1149,8 +1149,8 @@ end println("\n✅ write_columns decimal nullable tests completed!") end -@testset "Writer write_scattered_columns_sync API" begin - println("Testing write_scattered_columns_sync (gathered-column) API...") +@testset "Writer write_columns (GatheredBatch) API" begin + println("Testing write_columns with GatheredBatch (gathered-column) API...") catalog_uri = get_catalog_uri() props = get_catalog_properties() @@ -1239,8 +1239,8 @@ end push!(batch, tag_col) println("✅ tag column built (string via add_string_slice!)") - RustyIceberg.write_scattered_columns_sync(writer, batch) - println("✅ Batch written via write_scattered_columns_sync") + RustyIceberg.write_columns(writer, batch) + println("✅ Batch written via write_columns (GatheredBatch)") end @test data_files !== nothing && data_files.ptr != C_NULL println("✅ Writer closed") From b2309f2f689d20faf2a300a28e2a163fce64018c Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:23:34 +0200 Subject: [PATCH 10/25] . --- test/writer_tests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 80c9380..77f8501 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1301,5 +1301,5 @@ end end end - println("\n✅ write_scattered_columns_sync API tests completed!") + println("\n✅ write_columns (GatheredBatch) API tests completed!") end From 8dbd1e6c25021fbfe4bef6cf1bfd7d1abb365fd0 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:24:46 +0200 Subject: [PATCH 11/25] . --- src/writer.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 522f902..2f85603 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -1003,7 +1003,7 @@ Collects a `GatheredColumn` per output column, then writes all of them in one ca batch = GatheredBatch() push!(batch, col_int64) push!(batch, col_float64) -write_scattered_columns_sync(writer, batch) +write_columns(writer, batch) ``` You can also push a single-slice column inline without building a `GatheredColumn` @@ -1013,7 +1013,7 @@ explicitly: batch = GatheredBatch() push!(batch, src_ints, COLUMN_TYPE_INT64) push!(batch, src_floats, COLUMN_TYPE_FLOAT64; sel=indices, validity=valid_bv) -write_scattered_columns_sync(writer, batch) +write_columns(writer, batch) ``` """ mutable struct GatheredBatch From 3d43275a5e32531a2bb60fe3c1b500e016187247 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 17:27:20 +0200 Subject: [PATCH 12/25] Fix naming --- iceberg_rust_ffi/src/writer.rs | 18 +++++++----------- src/writer.jl | 4 ++-- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 8dff45f..9ccf5c0 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -254,21 +254,17 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse; -/// Synchronous scatter-gather write: gathers all column data from Julia memory into -/// Arrow arrays in the calling thread, submits the RecordBatch to the global encode -/// pool, then returns immediately (encode is still async). +/// Gather column data from Julia memory into Arrow arrays in the calling thread, then +/// submit the RecordBatch to the global encode pool asynchronously. /// -/// Unlike `iceberg_writer_write_scattered_columns` (which requires a Tokio async context -/// and uses a callback), this function is designed for plain C `ccall` from Julia: -/// - Julia keeps source arrays alive via `GC.@preserve` for the duration of this call. -/// - After this function returns, all Julia pointers have been consumed; Julia may safely -/// release the source data. -/// - Encode is still asynchronous in the global pool; call `iceberg_writer_close` to -/// wait for all pending encodes. +/// Julia keeps source arrays alive via `GC.@preserve` for the duration of this call. +/// After this function returns, all Julia pointers have been consumed and Julia may safely +/// release the source data. Encode is still asynchronous in the global pool; call +/// `iceberg_writer_close` to wait for all pending encodes. /// /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). #[no_mangle] -pub extern "C" fn iceberg_writer_write_scattered_columns_sync( +pub extern "C" fn iceberg_writer_write_gathered_columns( writer: *mut IcebergDataFileWriter, columns: *const GatheredColumnDescriptor, num_columns: usize, diff --git a/src/writer.jl b/src/writer.jl index 2f85603..b41d5bc 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -512,7 +512,7 @@ end GatheredColumnDescriptor FFI descriptor for a column to be gathered from multiple SliceRefs. -Pass an array of these to `write_scattered_columns`. +Pass an array of these to `write_columns`. - `slices_ptr`: pointer to array of SliceRef structs - `num_slices`: number of SliceRef entries @@ -1092,7 +1092,7 @@ function write_columns( extra_preserve !== nothing && append!(preserve, extra_preserve) ret = GC.@preserve preserve all_slice_arrays descriptors begin - @ccall rust_lib.iceberg_writer_write_scattered_columns_sync( + @ccall rust_lib.iceberg_writer_write_gathered_columns( writer.ptr::Ptr{Cvoid}, pointer(descriptors)::Ptr{GatheredColumnDescriptor}, length(descriptors)::Csize_t, From b2ae1972bd7ea26e247107353f13fca2a4dd72ab Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 18:01:02 +0200 Subject: [PATCH 13/25] Add test --- test/writer_tests.jl | 120 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 77f8501..1d0aa30 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1303,3 +1303,123 @@ end println("\n✅ write_columns (GatheredBatch) API tests completed!") end + +@testset "Writer WriterConfig parquet properties" begin + println("Testing WriterConfig parquet writer properties...") + + catalog_uri = get_catalog_uri() + props = get_catalog_properties() + + catalog = nothing + try + catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props) + @test catalog !== nothing + println("✅ Catalog created successfully") + + call_count = Ref(0) + + # Write n_rows with the given config, commit, read back, return the NamedTuple. + # Creates and cleans up its own namespace/table. + function write_read_config(config::RustyIceberg.WriterConfig, n_rows::Int=5) + call_count[] += 1 + uid = "$(round(Int, time() * 1000))_$(call_count[])" + ns = ["test_wrcfg_$uid"] + tn = "wrcfg_$uid" + schema = Schema([ + Field(Int32(1), "id", IcebergLong(); required=true), + Field(Int32(2), "value", IcebergDouble(); required=false), + ]) + table = C_NULL + updated_table = C_NULL + data_files = nothing + try + table = RustyIceberg.create_table(catalog, ns, tn, schema) + data_files = RustyIceberg.with_data_file_writer(table, config) do writer + write(writer, ( + id = Int64.(1:n_rows), + value = Float64.(1:n_rows) .* 1.1, + )) + end + updated_table = RustyIceberg.with_transaction(table, catalog) do tx + RustyIceberg.with_fast_append(tx) do action + RustyIceberg.add_data_files(action, data_files) + end + end + return read_table_data(updated_table) + finally + if updated_table != C_NULL + RustyIceberg.free_table(updated_table) + end + if data_files !== nothing && data_files.ptr != C_NULL + RustyIceberg.free_data_files!(data_files) + end + if table != C_NULL + RustyIceberg.free_table(table) + RustyIceberg.drop_table(catalog, ns, tn) + RustyIceberg.drop_namespace(catalog, ns) + end + end + end + + # --- Compression codecs --- + for codec in [RustyIceberg.SNAPPY, RustyIceberg.GZIP, RustyIceberg.LZ4, RustyIceberg.ZSTD] + tbl = write_read_config(RustyIceberg.WriterConfig(compression=codec)) + @test !isnothing(tbl) + @test length(tbl.id) == 5 + perm = sortperm(tbl.id) + @test tbl.id[perm] == Int64[1, 2, 3, 4, 5] + @test tbl.value[perm] ≈ Float64[1, 2, 3, 4, 5] .* 1.1 + println("✅ Compression $codec: data correct") + end + + # --- plain_encoding=true (bypasses DELTA_BINARY_PACKED for INT64/INT32) --- + tbl = write_read_config(RustyIceberg.WriterConfig(plain_encoding=true)) + @test !isnothing(tbl) && length(tbl.id) == 5 + @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5] + println("✅ plain_encoding=true: data correct") + + # --- dictionary_enabled=false --- + tbl = write_read_config(RustyIceberg.WriterConfig(dictionary_enabled=false)) + @test !isnothing(tbl) && length(tbl.id) == 5 + @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5] + println("✅ dictionary_enabled=false: data correct") + + # --- statistics_enabled=false --- + tbl = write_read_config(RustyIceberg.WriterConfig(statistics_enabled=false)) + @test !isnothing(tbl) && length(tbl.id) == 5 + @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5] + println("✅ statistics_enabled=false: data correct") + + # --- max_row_group_size=3 with 10 rows → forces ≥4 row groups --- + tbl = write_read_config(RustyIceberg.WriterConfig(max_row_group_size=3), 10) + @test !isnothing(tbl) && length(tbl.id) == 10 + @test tbl.id[sortperm(tbl.id)] == Int64.(1:10) + println("✅ max_row_group_size=3 (10 rows, multiple row groups): all rows intact") + + # --- write_batch_size=2 (rows encoded per column chunk within a row group) --- + tbl = write_read_config(RustyIceberg.WriterConfig(write_batch_size=2)) + @test !isnothing(tbl) && length(tbl.id) == 5 + @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5] + println("✅ write_batch_size=2: data correct") + + # --- Combined: ZSTD + plain_encoding + no dict + large write_batch_size --- + config = RustyIceberg.WriterConfig( + compression = RustyIceberg.ZSTD, + plain_encoding = true, + dictionary_enabled = false, + write_batch_size = 65536, + ) + tbl = write_read_config(config, 8) + @test !isnothing(tbl) && length(tbl.id) == 8 + @test tbl.id[sortperm(tbl.id)] == Int64.(1:8) + println("✅ Combined (ZSTD + plain + no dict + large batch_size): data correct") + + finally + if catalog !== nothing + RustyIceberg.free_catalog!(catalog) + println("✅ Catalog cleaned up") + end + end + + println("\n✅ WriterConfig parquet properties tests completed!") +end From 66880e815edc2feecbc82f99e6d2c4d8fb395bca Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 18:05:50 +0200 Subject: [PATCH 14/25] Rename fn --- iceberg_rust_ffi/src/writer.rs | 6 +++--- iceberg_rust_ffi/src/writer_columns.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 9ccf5c0..d00700c 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -95,7 +95,7 @@ use crate::table::IcebergTable; use crate::transaction::IcebergDataFiles; use crate::util::parse_c_string; use crate::writer_columns::{ - build_arrow_array_scattered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef, + build_arrow_array_gathered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef, }; use object_store_ffi::{ export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT, @@ -307,7 +307,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( let batch = match (|| -> Result { let mut arrays = Vec::with_capacity(num_columns); for (i, desc) in col_descs.iter().enumerate() { - arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? }); + arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? }); } RecordBatch::try_new(arrow_schema, arrays) .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) @@ -426,7 +426,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync( let batch = match (|| -> Result { let mut arrays = Vec::with_capacity(num_columns); for (i, desc) in gathered.iter().enumerate() { - arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? }); + arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? }); } RecordBatch::try_new(arrow_schema, arrays) .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index b099e3b..6f3f3d5 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -141,7 +141,7 @@ unsafe fn build_null_buffer_scattered( } /// Gather all slices for a column into an Arrow array. -pub(crate) unsafe fn build_arrow_array_scattered( +pub(crate) unsafe fn build_arrow_array_gathered( desc: &GatheredColumnDescriptor, schema_field: &arrow_schema::Field, ) -> Result { From d3fe00ef80f38ec39cb1c8ee58e1ca19c82faa28 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 18:28:28 +0200 Subject: [PATCH 15/25] . --- test/writer_tests.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 1d0aa30..78bb05e 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1333,6 +1333,7 @@ end updated_table = C_NULL data_files = nothing try + RustyIceberg.create_namespace(catalog, ns) table = RustyIceberg.create_table(catalog, ns, tn, schema) data_files = RustyIceberg.with_data_file_writer(table, config) do writer write(writer, ( From 8ce862c91b478dac5e26ae29c49f726b417dcf4e Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 19:12:12 +0200 Subject: [PATCH 16/25] Disallow changing pool size --- iceberg_rust_ffi/src/writer.rs | 9 +++++++-- src/writer.jl | 11 +++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index d00700c..a45cb02 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -145,12 +145,17 @@ static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); /// Set the number of encode worker threads in the global pool. -/// Must be called before creating any writers; has no effect afterwards. +/// Must be called before any writer is created. Returns 0 on success, 1 if the pool is +/// already initialized (call ignored). #[no_mangle] -pub extern "C" fn iceberg_set_encode_workers(n: i32) { +pub extern "C" fn iceberg_set_encode_workers(n: i32) -> i32 { + if GLOBAL_ENCODE_POOL.get().is_some() { + return 1; + } if n > 0 { ENCODE_WORKERS.store(n as usize, Ordering::Relaxed); } + 0 } /// Initialize the global encode pool on first call. diff --git a/src/writer.jl b/src/writer.jl index b41d5bc..0675314 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -151,13 +151,16 @@ end Set the number of threads in the global Parquet encode worker pool. -Must be called before creating any `DataFileWriter`; has no effect once the pool -has been initialized (i.e. after the first writer is created). Defaults to -`Sys.CPU_THREADS` if not set. +Must be called before the first `DataFileWriter` is created (i.e. before the pool is +initialized). Throws if the pool is already running. Defaults to `Sys.CPU_THREADS` +if not set. """ function set_encode_workers!(n::Int) n > 0 || throw(ArgumentError("n must be positive, got $n")) - @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Cvoid + ret = @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Int32 + ret == 0 || throw(IcebergException( + "set_encode_workers! must be called before creating any DataFileWriter" + )) return nothing end From bec8fa564c2937f7f66599df7cdce146c690d11b Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 19:18:52 +0200 Subject: [PATCH 17/25] Update comment --- iceberg_rust_ffi/src/writer_columns.rs | 7 +++++-- src/writer.jl | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index 6f3f3d5..f223bbb 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -248,8 +248,11 @@ pub(crate) unsafe fn build_arrow_array_gathered( )) } COLUMN_TYPE_STRING => { - // Strings are always pre-staged in Julia; data_ptr = *const *const u8, - // lengths_ptr = *const i64, sel_ptr is always null (identity). + // String columns do not support selection vectors. Julia strings are + // heap-allocated with non-contiguous addresses, so the caller must build + // str_ptrs/str_lens arrays up-front — any row selection is already applied + // before add_string_slice! is called. sel_ptr is therefore always null here. + // data_ptr = *const *const u8, lengths_ptr = *const i64. let mut all_strings: Vec> = Vec::with_capacity(total); for slice in slices { if slice.lengths_ptr.is_null() { diff --git a/src/writer.jl b/src/writer.jl index 0675314..4ecf46e 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -903,7 +903,10 @@ add_slice!(col, src_array2; sel=sel_indices) # scattered: rows at sel_indic add_slice!(col, src_array3; validity=valid_bv) # nullable slice ``` -String columns are not supported on the scattered path; use `ColumnBatch` instead. +For string columns use `add_string_slice!` instead of `add_slice!`. Selection vectors +are not supported for strings: Julia strings are non-contiguous, so the caller must +build `str_ptrs`/`str_lens` arrays up-front — any row selection is applied on the Julia +side before calling `add_string_slice!`. """ mutable struct GatheredColumn slices::Vector{SliceRef} From f535c117ecbf6c01814d28390d287247de3036c2 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Tue, 28 Apr 2026 19:24:43 +0200 Subject: [PATCH 18/25] Add high-level string overload for add_string_slice! Accepts Vector{String} with optional validity BitVector, handling pointer extraction and GC preservation internally. The low-level ptr/len overload is retained for performance-sensitive callers with pre-allocated buffers. Co-Authored-By: Claude Sonnet 4.6 --- src/writer.jl | 48 +++++++++++++++++++++++++++++++++++++++++--- test/writer_tests.jl | 24 ++++++---------------- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 4ecf46e..6df49db 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -901,6 +901,9 @@ col = GatheredColumn(COLUMN_TYPE_INT64) add_slice!(col, src_array) # sequential: all rows add_slice!(col, src_array2; sel=sel_indices) # scattered: rows at sel_indices add_slice!(col, src_array3; validity=valid_bv) # nullable slice + +str_col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true) +add_string_slice!(str_col, ["a", "", "c"]; validity=BitVector([true, false, true])) ``` For string columns use `add_string_slice!` instead of `add_slice!`. Selection vectors @@ -965,12 +968,51 @@ function add_slice!( return col end +""" + add_string_slice!(col::GatheredColumn, strings::Vector{String}; validity=nothing) + +Append a string slice to `col` from a plain `Vector{String}`. + +- `validity`: optional `BitVector` (`true` = valid, `false` = null). Marking a row null + does not require a placeholder in `strings`, but the vector must still be the same length. + +```julia +col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true) +add_string_slice!(col, ["hello", "", "world"]; validity=BitVector([true, false, true])) +``` + +For performance-critical paths where pointer/length arrays are pre-allocated, use the +lower-level `add_string_slice!(col, str_ptrs, str_lens; validity)` overload directly. +""" +function add_string_slice!( + col::GatheredColumn, + strings::Vector{String}; + validity::Union{Nothing, BitVector} = nothing, +) + n = length(strings) + is_nullable = validity !== nothing + str_ptrs = Vector{Ptr{UInt8}}(undef, n) + str_lens = Vector{Int64}(undef, n) + for i in 1:n + if is_nullable && !validity[i] + str_ptrs[i] = Ptr{UInt8}(C_NULL) + str_lens[i] = 0 + else + str_ptrs[i] = pointer(strings[i]) + str_lens[i] = ncodeunits(strings[i]) + end + end + push!(col.preserve, strings) # keep String objects alive so pointers remain valid + return add_string_slice!(col, str_ptrs, str_lens; validity) +end + """ add_string_slice!(col::GatheredColumn, str_ptrs, str_lens; validity=nothing) -Append a string slice to `col`. `str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to -UTF-8 string data and `str_lens` is a `Vector{Int64}` of corresponding byte lengths. -The caller is responsible for keeping the pointed-to string bytes alive. +Low-level overload: append a string slice from pre-built pointer/length arrays. +`str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to UTF-8 string data and `str_lens` +is a `Vector{Int64}` of corresponding byte lengths. The caller is responsible for keeping +the pointed-to string bytes alive until `write_columns` returns. """ function add_string_slice!( col::GatheredColumn, diff --git a/test/writer_tests.jl b/test/writer_tests.jl index 78bb05e..1c15dd7 100644 --- a/test/writer_tests.jl +++ b/test/writer_tests.jl @@ -1216,26 +1216,14 @@ end push!(batch, score_col) println("✅ score column built (scattered + nullable)") - # tag: string column via add_string_slice! + # tag: string column via the high-level add_string_slice! overload # Row 0: "alpha", row 1: null, row 2: "gamma", row 3: null - tag_strings = ["alpha", "", "gamma", ""] # empty at null positions - tag_ptrs = Vector{Ptr{UInt8}}(undef, 4) - tag_lens = Vector{Int64}(undef, 4) - tag_valid = BitVector([true, false, true, false]) - for i in 1:4 - if tag_valid[i] - tag_ptrs[i] = pointer(tag_strings[i]) - tag_lens[i] = ncodeunits(tag_strings[i]) - else - tag_ptrs[i] = Ptr{UInt8}(C_NULL) - tag_lens[i] = 0 - end - end - tag_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_STRING; nullable=true) - RustyIceberg.add_string_slice!(tag_col, tag_ptrs, tag_lens; validity=tag_valid) - # Preserve source strings so pointers stay valid until write completes - push!(tag_col.preserve, tag_strings) + RustyIceberg.add_string_slice!( + tag_col, + ["alpha", "", "gamma", ""]; + validity=BitVector([true, false, true, false]) + ) push!(batch, tag_col) println("✅ tag column built (string via add_string_slice!)") From e242f2d5138000e8e6d967fb48622424377194d3 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 29 Apr 2026 17:27:59 +0200 Subject: [PATCH 19/25] Optimise string column building: skip UTF-8 validation, avoid intermediate Vec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build StringArray directly with OffsetBuffer + values Buffer in a single pass, replacing Vec> + StringArray::from. Use new_unchecked to skip Arrow's UTF-8 re-validation — Julia strings are guaranteed valid UTF-8. For 20M x 32-byte strings this eliminates ~320 MB of intermediate Vec> storage and ~640 MB of redundant UTF-8 validation reads per column. Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/src/writer_columns.rs | 44 +++++++++++++++++--------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index f223bbb..fbdae91 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -13,7 +13,7 @@ use arrow_array::{ }, ArrayRef, BooleanArray, PrimitiveArray, StringArray, }; -use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer}; +use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; /// Column type codes (must match Julia's ColumnType enum) pub const COLUMN_TYPE_INT32: i32 = 0; @@ -253,7 +253,19 @@ pub(crate) unsafe fn build_arrow_array_gathered( // str_ptrs/str_lens arrays up-front — any row selection is already applied // before add_string_slice! is called. sel_ptr is therefore always null here. // data_ptr = *const *const u8, lengths_ptr = *const i64. - let mut all_strings: Vec> = Vec::with_capacity(total); + // + // Build the Arrow StringArray directly: one pass copies string bytes into a + // contiguous values buffer and tracks cumulative offsets. This avoids the + // intermediate Vec> and skips UTF-8 validation — Julia strings + // are guaranteed valid UTF-8. + let null_buf = if desc.is_nullable { + build_null_buffer_scattered(slices, total) + } else { + None + }; + let mut offsets = Vec::::with_capacity(total + 1); + offsets.push(0i32); + let mut values = Vec::::new(); for slice in slices { if slice.lengths_ptr.is_null() { return Err(anyhow::anyhow!("String column requires lengths_ptr")); @@ -262,24 +274,26 @@ pub(crate) unsafe fn build_arrow_array_gathered( std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len); let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len); for i in 0..slice.len { - let is_null = if !slice.validity_ptr.is_null() { - (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 == 0 - } else { - false - }; - if is_null { - all_strings.push(None); - } else { - let s = std::str::from_utf8(std::slice::from_raw_parts( + let is_null = !slice.validity_ptr.is_null() + && ((*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1) == 0; + if !is_null { + values.extend_from_slice(std::slice::from_raw_parts( ptrs[i], lens[i] as usize, - )) - .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?; - all_strings.push(Some(s)); + )); } + offsets.push(values.len() as i32); } } - Arc::new(StringArray::from(all_strings)) + // SAFETY: offsets are monotonically non-decreasing by construction; values + // bytes come from Julia String objects (valid UTF-8) kept alive in col.preserve. + Arc::new(unsafe { + StringArray::new_unchecked( + OffsetBuffer::new(ScalarBuffer::from(offsets)), + Buffer::from_vec(values), + null_buf, + ) + }) } COLUMN_TYPE_UUID => { let mut data: Vec = Vec::with_capacity(total * 16); From 3e8bcc3e6165b9d58581e5b1785b77739bb4f227 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 13:51:43 +0200 Subject: [PATCH 20/25] Address PR review comments - Extract encode worker loop body into encode_worker_loop() - Retain panic message: downcast Box to &str / String before formatting - Add iceberg_take_gather_error() FFI + thread-local to surface gather errors immediately in Julia exceptions rather than deferring to writer close - Clarify lengths_ptr doc: array of byte lengths per string - Merge identical sequential/scattered validity branches in build_null_buffer_scattered - Add explanatory comments: bitvector merging, re-alignment, all-valid bit-set Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/src/writer.rs | 149 ++++++++++++++++--------- iceberg_rust_ffi/src/writer_columns.rs | 27 ++--- src/writer.jl | 12 +- 3 files changed, 121 insertions(+), 67 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index a45cb02..ec59b1b 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -5,7 +5,9 @@ /// `Arc>` inside WriterState: only one pool thread encodes /// a given writer at a time, and the FIFO global queue ensures batches are submitted /// in order. -use std::ffi::{c_char, c_void}; +use std::any::Any; +use std::cell::RefCell; +use std::ffi::{c_char, c_void, CString}; use std::io::Cursor; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -140,6 +142,95 @@ struct GlobalWorkerPool { static GLOBAL_ENCODE_POOL: OnceLock = OnceLock::new(); +// Thread-local storage for the most recent synchronous gather error. +// Set by iceberg_writer_write_gathered_columns on failure; consumed by iceberg_take_gather_error. +thread_local! { + static LAST_GATHER_ERROR: RefCell> = const { RefCell::new(None) }; +} + +fn store_gather_error(e: &anyhow::Error) { + let msg = format!("{:#}", e); + LAST_GATHER_ERROR.with(|cell| { + *cell.borrow_mut() = CString::new(msg).ok(); + }); +} + +/// Returns a heap-allocated C string with the most recent gather error on this thread, +/// or NULL if none. Must be called on the same thread as the failed write call, immediately +/// after it returns. The caller must free the returned string with `iceberg_destroy_cstring`. +#[no_mangle] +pub extern "C" fn iceberg_take_gather_error() -> *mut c_char { + LAST_GATHER_ERROR.with(|cell| { + cell.borrow_mut() + .take() + .map(|s| s.into_raw()) + .unwrap_or(std::ptr::null_mut()) + }) +} + +/// Formats a Rust panic payload into an anyhow error, preserving the message where possible. +fn format_panic_error(panic: Box) -> anyhow::Error { + let msg = if let Some(s) = panic.downcast_ref::<&str>() { + format!("encode worker panicked: {}", s) + } else if let Some(s) = panic.downcast_ref::() { + format!("encode worker panicked: {}", s) + } else { + "encode worker panicked (no string payload)".to_string() + }; + anyhow::anyhow!(msg) +} + +/// Body of each encode worker thread: receives tasks from the shared channel and encodes them. +fn encode_worker_loop( + task_rx: Arc>>, + handle: tokio::runtime::Handle, +) { + loop { + // Acquire the shared receiver lock, then wait for a task. + // The lock is released as soon as recv() returns, so workers are not serialized + // during encoding — only during task pickup. + let task = { + let mut rx = handle.block_on(task_rx.lock()); + match handle.block_on(rx.recv()) { + Some(t) => t, + None => break, // sender dropped → pool shutting down + } + }; + + // Clone state before moving task into the closure so we can always decrement + // pending even if the closure panics. + let state = task.state.clone(); + let handle_enc = handle.clone(); + let encode_result = catch_unwind(AssertUnwindSafe(move || { + let mut guard = task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); + match guard.as_mut() { + Some(w) => handle_enc + .block_on(w.write(task.batch)) + .map_err(|e| anyhow::anyhow!("write batch: {}", e)), + None => Err(anyhow::anyhow!("writer already closed")), + } + })); + + let err = match encode_result { + Ok(Ok(())) => None, + Ok(Err(e)) => Some(e), + Err(panic) => Some(format_panic_error(panic)), + }; + if let Some(e) = err { + let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } + } + + // Always decrement pending; notify close() if this was the last task. + let prev = state.pending.fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + state.done_notify.notify_one(); + } + } +} + /// Desired encode worker count. 0 means "use available_parallelism". /// Must be set before the first iceberg_writer_new call. static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0); @@ -180,57 +271,7 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool { let handle = handle.clone(); thread::Builder::new() .name(format!("iceberg-encode-{}", i)) - .spawn(move || { - loop { - // Acquire the shared receiver lock, then wait for a task. - // The lock is released as soon as recv() returns, so workers - // are not serialized during encoding — only during task pickup. - let task = { - let mut rx = handle.block_on(task_rx.lock()); - match handle.block_on(rx.recv()) { - Some(t) => t, - None => break, // sender dropped → pool shutting down - } - }; - - // Clone state before moving task into the closure so we can - // always decrement pending even if the closure panics. - let state = task.state.clone(); - - let handle_enc = handle.clone(); - let encode_result = catch_unwind(AssertUnwindSafe(move || { - let result: anyhow::Result<()> = { - let mut guard = - task.state.writer.lock().unwrap_or_else(|e| e.into_inner()); - match guard.as_mut() { - Some(w) => handle_enc - .block_on(w.write(task.batch)) - .map_err(|e| anyhow::anyhow!("write batch: {}", e)), - None => Err(anyhow::anyhow!("writer already closed")), - } - }; - result - })); - - let err = match encode_result { - Ok(Ok(())) => None, - Ok(Err(e)) => Some(e), - Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")), - }; - if let Some(e) = err { - let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); - } - } - - // Always decrement pending; notify close() if this was the last task. - let prev = state.pending.fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - state.done_notify.notify_one(); - } - } - }) + .spawn(move || encode_worker_loop(task_rx, handle)) .expect("failed to spawn iceberg encode worker"); } @@ -319,6 +360,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( })() { Ok(b) => b, Err(e) => { + store_gather_error(&e); let mut slot = writer_ref .writer_state .error @@ -352,7 +394,8 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); } - eprintln!("[iceberg:sync] pool channel closed"); + let e = anyhow::anyhow!("encode pool channel closed unexpectedly"); + store_gather_error(&e); -1 } } diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index fbdae91..ca9369f 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -68,7 +68,7 @@ unsafe impl Sync for ColumnDescriptor {} /// `sel_ptr = null` → sequential (identity) access: read data[0..len]. /// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices). /// `validity_ptr = null` → all rows valid (non-nullable or known all-valid slice). -/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[]. +/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[] of byte lengths per string. /// Fields are all 8 bytes — no padding, total 40 bytes. #[repr(C)] #[derive(Clone, Copy)] @@ -99,8 +99,16 @@ pub struct GatheredColumnDescriptor { unsafe impl Send for GatheredColumnDescriptor {} unsafe impl Sync for GatheredColumnDescriptor {} -/// Build the Arrow null buffer by scanning validity bits across all slices. -/// Returns `None` if every slice has a null `validity_ptr` (all rows valid). +/// Merges the per-slice validity bitmaps from all slices into a single output bitmap. +/// +/// Each slice contributes `slice.len` output rows. Slices with a null `validity_ptr` are +/// all-valid. Slices with a bitmap may be misaligned relative to the output (each slice +/// starts at a different `out` offset), so bits are copied one at a time with a shift. +/// The selection vector (`sel_ptr`) governs which *source data* elements to read; the +/// validity bitmap is always indexed by output row position, so sequential and scattered +/// slices are treated identically here. +/// +/// Returns `None` if every slice is all-valid (no null buffer needed). unsafe fn build_null_buffer_scattered( slices: &[SliceRef], total_rows: usize, @@ -112,20 +120,13 @@ unsafe fn build_null_buffer_scattered( let mut out = 0usize; for slice in slices { if slice.validity_ptr.is_null() { - // All valid — set bits 1 + // All rows in this slice are valid — set one bit per output row. for i in 0..slice.len { bits[(out + i) / 8] |= 1u8 << ((out + i) % 8); } - } else if slice.sel_ptr.is_null() { - // Sequential: copy bits with possible alignment shift - for i in 0..slice.len { - let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1; - bits[(out + i) / 8] |= b << ((out + i) % 8); - } } else { - // Scattered: validity is per output row (0..len-1), same as sequential. - // The selection governs which *data* elements are picked, not which - // validity bits — the validity bitmap has one bit per output row. + // Copy validity bits from the slice's bitmap into the output bitmap, + // re-aligning from source bit position i to output bit position (out + i). for i in 0..slice.len { let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1; bits[(out + i) / 8] |= b << ((out + i) % 8); diff --git a/src/writer.jl b/src/writer.jl index 6df49db..b0f0b1a 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -1146,6 +1146,16 @@ function write_columns( length(descriptors)::Csize_t, )::Int32 end - ret == 0 || throw(IcebergException("write_columns (gathered): gather failed (see writer close for details)")) + if ret != 0 + err_ptr = @ccall rust_lib.iceberg_take_gather_error()::Ptr{Cchar} + msg = if err_ptr != C_NULL + s = unsafe_string(err_ptr) + @ccall rust_lib.iceberg_destroy_cstring(err_ptr::Ptr{Cchar})::Cint + s + else + "gather failed (see writer close for details)" + end + throw(IcebergException("write_columns (gathered): $(msg)")) + end return nothing end From 7a8947a9a864019c32ba5a7660294aa52e0c939c Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 13:54:31 +0200 Subject: [PATCH 21/25] =?UTF-8?q?Rename=20build=5Fnull=5Fbuffer=5Fscattere?= =?UTF-8?q?d=20=E2=86=92=20build=5Fnull=5Fbuffer=5Fgathered?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- iceberg_rust_ffi/src/writer_columns.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs index ca9369f..7e1665a 100644 --- a/iceberg_rust_ffi/src/writer_columns.rs +++ b/iceberg_rust_ffi/src/writer_columns.rs @@ -109,10 +109,7 @@ unsafe impl Sync for GatheredColumnDescriptor {} /// slices are treated identically here. /// /// Returns `None` if every slice is all-valid (no null buffer needed). -unsafe fn build_null_buffer_scattered( - slices: &[SliceRef], - total_rows: usize, -) -> Option { +unsafe fn build_null_buffer_gathered(slices: &[SliceRef], total_rows: usize) -> Option { if !slices.iter().any(|s| !s.validity_ptr.is_null()) { return None; } @@ -149,7 +146,7 @@ pub(crate) unsafe fn build_arrow_array_gathered( let slices = std::slice::from_raw_parts(desc.slices, desc.num_slices); let total = desc.total_rows; let null_buf = if desc.is_nullable { - build_null_buffer_scattered(slices, total) + build_null_buffer_gathered(slices, total) } else { None }; @@ -260,7 +257,7 @@ pub(crate) unsafe fn build_arrow_array_gathered( // intermediate Vec> and skips UTF-8 validation — Julia strings // are guaranteed valid UTF-8. let null_buf = if desc.is_nullable { - build_null_buffer_scattered(slices, total) + build_null_buffer_gathered(slices, total) } else { None }; From 7aa1836624d00daa575c309c96fda421e14c5f94 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 14:48:42 +0200 Subject: [PATCH 22/25] Remove code dup --- iceberg_rust_ffi/src/writer.rs | 239 +++++++++++++-------------------- 1 file changed, 96 insertions(+), 143 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index ec59b1b..d6e67a7 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -300,6 +300,70 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse; +/// Store an error in the writer state (first error wins). +fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) { + let mut slot = writer_ref + .writer_state + .error + .lock() + .unwrap_or_else(|e| e.into_inner()); + if slot.is_none() { + *slot = Some(e); + } +} + +/// Build a `RecordBatch` from a slice of `GatheredColumnDescriptor`s. +/// +/// # Safety +/// All pointers inside each `GatheredColumnDescriptor` must be valid for the duration of +/// this call (callers hold `GC.@preserve` or equivalent). +unsafe fn build_record_batch( + arrow_schema: ArrowSchemaRef, + col_descs: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let iter = col_descs.into_iter(); + let mut arrays = Vec::with_capacity(iter.len()); + for (i, desc) in iter.enumerate() { + arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? }); + } + RecordBatch::try_new(arrow_schema, arrays).map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) +} + +/// Submit a `RecordBatch` to the global encode pool. +/// +/// Increments the writer's pending count before sending and rolls it back on channel failure. +fn submit_batch( + writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, + batch: RecordBatch, +) -> Result<(), anyhow::Error> { + writer_ref + .writer_state + .pending + .fetch_add(1, Ordering::AcqRel); + let task = EncodeTask { + batch, + state: writer_ref.writer_state.clone(), + }; + match pool.task_tx.blocking_send(task) { + Ok(()) => Ok(()), + Err(_) => { + let prev = writer_ref + .writer_state + .pending + .fetch_sub(1, Ordering::AcqRel); + if prev == 1 { + writer_ref.writer_state.done_notify.notify_one(); + } + Err(anyhow::anyhow!("encode pool channel closed unexpectedly")) + } + } +} + /// Gather column data from Julia memory into Arrow arrays in the calling thread, then /// submit the RecordBatch to the global encode pool asynchronously. /// @@ -324,7 +388,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( let pool = match GLOBAL_ENCODE_POOL.get() { Some(p) => p, None => { - eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); return -1; } }; @@ -333,72 +397,34 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; if col_descs.len() != arrow_schema.fields().len() { - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!( + store_writer_error( + writer_ref, + anyhow::anyhow!( "Column count mismatch: got {} but schema has {}", col_descs.len(), arrow_schema.fields().len() - )); - } + ), + ); return -1; } // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread). - // After this block, all Julia pointers have been consumed — safe to release. - let batch = match (|| -> Result { - let mut arrays = Vec::with_capacity(num_columns); - for (i, desc) in col_descs.iter().enumerate() { - arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? }); - } - RecordBatch::try_new(arrow_schema, arrays) - .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) - })() { + // After this point all Julia pointers have been consumed — safe to release. + let batch = match unsafe { build_record_batch(arrow_schema, col_descs.iter().copied()) } { Ok(b) => b, Err(e) => { store_gather_error(&e); - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); - } + store_writer_error(writer_ref, e); return -1; } }; - // Increment pending before submit so iceberg_writer_close always accounts for this batch. - writer_ref - .writer_state - .pending - .fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { - batch, - state: writer_ref.writer_state.clone(), - }; - - match pool.task_tx.blocking_send(task) { - Ok(()) => 0, - Err(_) => { - // Pool channel closed — undo the pending increment. - let prev = writer_ref - .writer_state - .pending - .fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - writer_ref.writer_state.done_notify.notify_one(); - } - let e = anyhow::anyhow!("encode pool channel closed unexpectedly"); - store_gather_error(&e); - -1 - } + if let Err(e) = submit_batch(writer_ref, pool, batch) { + store_gather_error(&e); + store_writer_error(writer_ref, e); + return -1; } + 0 } /// Synchronous write of flat column data: copies each column from Julia memory into @@ -422,7 +448,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync( let pool = match GLOBAL_ENCODE_POOL.get() { Some(p) => p, None => { - eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first"); + eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first"); return -1; } }; @@ -431,18 +457,14 @@ pub extern "C" fn iceberg_writer_write_columns_sync( let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; if col_descs.len() != arrow_schema.fields().len() { - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!( + store_writer_error( + writer_ref, + anyhow::anyhow!( "Column count mismatch: got {} but schema has {}", col_descs.len(), arrow_schema.fields().len() - )); - } + ), + ); return -1; } @@ -459,7 +481,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync( }) .collect(); - let gathered: Vec = col_descs + let gathered = col_descs .iter() .zip(slices.iter()) .map(|(d, s)| GatheredColumnDescriptor { @@ -468,54 +490,21 @@ pub extern "C" fn iceberg_writer_write_columns_sync( total_rows: d.num_rows, column_type: d.column_type, is_nullable: d.is_nullable, - }) - .collect(); + }); - let batch = match (|| -> Result { - let mut arrays = Vec::with_capacity(num_columns); - for (i, desc) in gathered.iter().enumerate() { - arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? }); - } - RecordBatch::try_new(arrow_schema, arrays) - .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e)) - })() { + let batch = match unsafe { build_record_batch(arrow_schema, gathered) } { Ok(b) => b, Err(e) => { - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(e); - } + store_writer_error(writer_ref, e); return -1; } }; - writer_ref - .writer_state - .pending - .fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { - batch, - state: writer_ref.writer_state.clone(), - }; - - match pool.task_tx.blocking_send(task) { - Ok(()) => 0, - Err(_) => { - let prev = writer_ref - .writer_state - .pending - .fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - writer_ref.writer_state.done_notify.notify_one(); - } - eprintln!("[iceberg:sync] pool channel closed"); - -1 - } + if let Err(e) = submit_batch(writer_ref, pool, batch) { + store_writer_error(writer_ref, e); + return -1; } + 0 } /// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully. @@ -653,14 +642,7 @@ pub extern "C" fn iceberg_writer_write_sync( let reader = match StreamReader::try_new(cursor, None) { Ok(r) => r, Err(e) => { - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); - } + store_writer_error(writer_ref, anyhow::anyhow!("IPC reader: {}", e)); return -1; } }; @@ -669,41 +651,12 @@ pub extern "C" fn iceberg_writer_write_sync( let batch = match batch_result { Ok(b) => b, Err(e) => { - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); - } + store_writer_error(writer_ref, anyhow::anyhow!("IPC batch: {}", e)); return -1; } }; - writer_ref - .writer_state - .pending - .fetch_add(1, Ordering::AcqRel); - let task = EncodeTask { - batch, - state: writer_ref.writer_state.clone(), - }; - if pool.task_tx.blocking_send(task).is_err() { - let prev = writer_ref - .writer_state - .pending - .fetch_sub(1, Ordering::AcqRel); - if prev == 1 { - writer_ref.writer_state.done_notify.notify_one(); - } - let mut slot = writer_ref - .writer_state - .error - .lock() - .unwrap_or_else(|e| e.into_inner()); - if slot.is_none() { - *slot = Some(anyhow::anyhow!("Encode pool closed")); - } + if let Err(e) = submit_batch(writer_ref, pool, batch) { + store_writer_error(writer_ref, e); return -1; } } From b820563946a6b5a60d221c936e183d9742796709 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 14:50:15 +0200 Subject: [PATCH 23/25] Rename fn --- iceberg_rust_ffi/src/writer.rs | 4 ++-- src/writer.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index d6e67a7..0763a1f 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -434,7 +434,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( /// Each `ColumnDescriptor` is treated as a single sequential slice (no scatter/gather). /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). #[no_mangle] -pub extern "C" fn iceberg_writer_write_columns_sync( +pub extern "C" fn iceberg_writer_write_columns( writer: *mut IcebergDataFileWriter, columns: *const ColumnDescriptor, num_columns: usize, @@ -617,7 +617,7 @@ export_runtime_op!( /// /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close). #[no_mangle] -pub extern "C" fn iceberg_writer_write_sync( +pub extern "C" fn iceberg_writer_write( writer: *mut IcebergDataFileWriter, arrow_ipc_data: *const u8, arrow_ipc_len: usize, diff --git a/src/writer.jl b/src/writer.jl index b0f0b1a..901144a 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -419,7 +419,7 @@ end # Internal helper to write raw IPC bytes to the Rust writer function _write_ipc_bytes(writer::DataFileWriter, ipc_bytes::Vector{UInt8}) ret = GC.@preserve ipc_bytes begin - @ccall rust_lib.iceberg_writer_write_sync( + @ccall rust_lib.iceberg_writer_write( writer.ptr::Ptr{Cvoid}, pointer(ipc_bytes)::Ptr{UInt8}, length(ipc_bytes)::Csize_t, @@ -842,7 +842,7 @@ function write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor} isempty(columns) && throw(IcebergException("No columns provided")) ret = GC.@preserve columns arrays_to_preserve begin - @ccall rust_lib.iceberg_writer_write_columns_sync( + @ccall rust_lib.iceberg_writer_write_columns( writer.ptr::Ptr{Cvoid}, pointer(columns)::Ptr{ColumnDescriptor}, length(columns)::Csize_t, From c1dc6921d1aa005f76dddd4db321104be952c688 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 14:57:34 +0200 Subject: [PATCH 24/25] . --- iceberg_rust_ffi/src/writer.rs | 91 ++++++++++++++-------------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index 0763a1f..ce3351f 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -364,6 +364,30 @@ fn submit_batch( } } +/// Shared core for write functions: validates column count, builds a `RecordBatch`, and +/// submits it to the encode pool. +unsafe fn write_gathered_inner( + writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, + arrow_schema: ArrowSchemaRef, + num_columns: usize, + col_descs: I, +) -> Result<(), anyhow::Error> +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + if num_columns != arrow_schema.fields().len() { + return Err(anyhow::anyhow!( + "Column count mismatch: got {} but schema has {}", + num_columns, + arrow_schema.fields().len() + )); + } + let batch = unsafe { build_record_batch(arrow_schema, col_descs) }?; + submit_batch(writer_ref, pool, batch) +} + /// Gather column data from Julia memory into Arrow arrays in the calling thread, then /// submit the RecordBatch to the global encode pool asynchronously. /// @@ -382,9 +406,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( if writer.is_null() || columns.is_null() || num_columns == 0 { return -1; } - let writer_ref = unsafe { &*writer }; - let pool = match GLOBAL_ENCODE_POOL.get() { Some(p) => p, None => { @@ -392,34 +414,17 @@ pub extern "C" fn iceberg_writer_write_gathered_columns( return -1; } }; - let arrow_schema = writer_ref.arrow_schema.clone(); let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - - if col_descs.len() != arrow_schema.fields().len() { - store_writer_error( + if let Err(e) = unsafe { + write_gathered_inner( writer_ref, - anyhow::anyhow!( - "Column count mismatch: got {} but schema has {}", - col_descs.len(), - arrow_schema.fields().len() - ), - ); - return -1; - } - - // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread). - // After this point all Julia pointers have been consumed — safe to release. - let batch = match unsafe { build_record_batch(arrow_schema, col_descs.iter().copied()) } { - Ok(b) => b, - Err(e) => { - store_gather_error(&e); - store_writer_error(writer_ref, e); - return -1; - } - }; - - if let Err(e) = submit_batch(writer_ref, pool, batch) { + pool, + arrow_schema, + num_columns, + col_descs.iter().copied(), + ) + } { store_gather_error(&e); store_writer_error(writer_ref, e); return -1; @@ -442,9 +447,7 @@ pub extern "C" fn iceberg_writer_write_columns( if writer.is_null() || columns.is_null() || num_columns == 0 { return -1; } - let writer_ref = unsafe { &*writer }; - let pool = match GLOBAL_ENCODE_POOL.get() { Some(p) => p, None => { @@ -452,24 +455,12 @@ pub extern "C" fn iceberg_writer_write_columns( return -1; } }; - let arrow_schema = writer_ref.arrow_schema.clone(); let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - - if col_descs.len() != arrow_schema.fields().len() { - store_writer_error( - writer_ref, - anyhow::anyhow!( - "Column count mismatch: got {} but schema has {}", - col_descs.len(), - arrow_schema.fields().len() - ), - ); - return -1; - } - // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor. // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order. + // `slices` must outlive `gathered` since GatheredColumnDescriptor.slices is a raw + // pointer into this Vec's allocation. let slices: Vec = col_descs .iter() .map(|d| SliceRef { @@ -480,7 +471,6 @@ pub extern "C" fn iceberg_writer_write_columns( len: d.num_rows, }) .collect(); - let gathered = col_descs .iter() .zip(slices.iter()) @@ -491,16 +481,9 @@ pub extern "C" fn iceberg_writer_write_columns( column_type: d.column_type, is_nullable: d.is_nullable, }); - - let batch = match unsafe { build_record_batch(arrow_schema, gathered) } { - Ok(b) => b, - Err(e) => { - store_writer_error(writer_ref, e); - return -1; - } - }; - - if let Err(e) = submit_batch(writer_ref, pool, batch) { + if let Err(e) = + unsafe { write_gathered_inner(writer_ref, pool, arrow_schema, num_columns, gathered) } + { store_writer_error(writer_ref, e); return -1; } From 5d525a20fd9a5fd1b7955ebf5f73b309e378de99 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Thu, 30 Apr 2026 15:00:22 +0200 Subject: [PATCH 25/25] . --- iceberg_rust_ffi/src/writer.rs | 75 +++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 29 deletions(-) diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs index ce3351f..19b7e51 100644 --- a/iceberg_rust_ffi/src/writer.rs +++ b/iceberg_rust_ffi/src/writer.rs @@ -364,8 +364,8 @@ fn submit_batch( } } -/// Shared core for write functions: validates column count, builds a `RecordBatch`, and -/// submits it to the encode pool. +/// Validates column count, builds a `RecordBatch` from pre-built gathered descriptors, +/// and submits it to the encode pool. unsafe fn write_gathered_inner( writer_ref: &IcebergDataFileWriter, pool: &GlobalWorkerPool, @@ -388,6 +388,49 @@ where submit_batch(writer_ref, pool, batch) } +/// Validates column count, builds a `RecordBatch` from flat `ColumnDescriptor`s (each +/// treated as a single sequential slice), and submits it to the encode pool. +/// +/// Each `SliceRef` is constructed on the stack and used within the same loop iteration, +/// so no heap allocation is needed for the descriptor conversion. +unsafe fn write_columns_inner( + writer_ref: &IcebergDataFileWriter, + pool: &GlobalWorkerPool, + arrow_schema: ArrowSchemaRef, + col_descs: &[ColumnDescriptor], +) -> Result<(), anyhow::Error> { + if col_descs.len() != arrow_schema.fields().len() { + return Err(anyhow::anyhow!( + "Column count mismatch: got {} but schema has {}", + col_descs.len(), + arrow_schema.fields().len() + )); + } + let mut arrays = Vec::with_capacity(col_descs.len()); + for (i, d) in col_descs.iter().enumerate() { + // SliceRef lives on the stack for exactly this iteration; the raw pointer + // is consumed by build_arrow_array_gathered before the next iteration begins. + let slice = SliceRef { + data_ptr: d.data_ptr, + lengths_ptr: d.lengths_ptr, + validity_ptr: d.validity_ptr, + sel_ptr: std::ptr::null(), + len: d.num_rows, + }; + let desc = GatheredColumnDescriptor { + slices: &slice as *const SliceRef, + num_slices: 1, + total_rows: d.num_rows, + column_type: d.column_type, + is_nullable: d.is_nullable, + }; + arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? }); + } + let batch = RecordBatch::try_new(arrow_schema, arrays) + .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?; + submit_batch(writer_ref, pool, batch) +} + /// Gather column data from Julia memory into Arrow arrays in the calling thread, then /// submit the RecordBatch to the global encode pool asynchronously. /// @@ -457,33 +500,7 @@ pub extern "C" fn iceberg_writer_write_columns( }; let arrow_schema = writer_ref.arrow_schema.clone(); let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) }; - // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor. - // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order. - // `slices` must outlive `gathered` since GatheredColumnDescriptor.slices is a raw - // pointer into this Vec's allocation. - let slices: Vec = col_descs - .iter() - .map(|d| SliceRef { - data_ptr: d.data_ptr, - lengths_ptr: d.lengths_ptr, - validity_ptr: d.validity_ptr, - sel_ptr: std::ptr::null(), - len: d.num_rows, - }) - .collect(); - let gathered = col_descs - .iter() - .zip(slices.iter()) - .map(|(d, s)| GatheredColumnDescriptor { - slices: s as *const SliceRef, - num_slices: 1, - total_rows: d.num_rows, - column_type: d.column_type, - is_nullable: d.is_nullable, - }); - if let Err(e) = - unsafe { write_gathered_inner(writer_ref, pool, arrow_schema, num_columns, gathered) } - { + if let Err(e) = unsafe { write_columns_inner(writer_ref, pool, arrow_schema, col_descs) } { store_writer_error(writer_ref, e); return -1; }