From e4e690dc08b27783b4ff44c8cdfa217af3741568 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 21 Apr 2026 19:45:13 +0200
Subject: [PATCH 01/25] Bump iceberg-rust to 418213731e91, version 0.7.17

Update iceberg and iceberg-catalog-rest to rev
418213731e91544f5eb31a3efa459e88f599030e, which includes the fix for
incremental scans with from=None silently dropping EXISTING manifest
entries from expired snapshots.

Also pulls in arrow/parquet v58.1.0 via the updated lock file.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 iceberg_rust_ffi/Cargo.lock | 380 ++++++++++++++++++++++++++++++------
 iceberg_rust_ffi/Cargo.toml |   6 +-
 2 files changed, 321 insertions(+), 65 deletions(-)

diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock
index 8f6d067..c36ad78 100644
--- a/iceberg_rust_ffi/Cargo.lock
+++ b/iceberg_rust_ffi/Cargo.lock
@@ -8,6 +8,41 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
 
+[[package]]
+name = "aead"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0"
+dependencies = [
+ "crypto-common",
+ "generic-array",
+]
+
+[[package]]
+name = "aes"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
+dependencies = [
+ "cfg-if",
+ "cipher",
+ "cpufeatures",
+]
+
+[[package]]
+name = "aes-gcm"
+version = "0.10.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1"
+dependencies = [
+ "aead",
+ "aes",
+ "cipher",
+ "ctr",
+ "ghash",
+ "subtle",
+]
+
 [[package]]
 name = "ahash"
 version = "0.8.12"
@@ -94,14 +129,14 @@ checksum = "3d62b7694a562cdf5a74227903507c56ab2cc8bdd1f781ed5cb4cf9c9f810bfc"
 
 [[package]]
 name = "arrow-arith"
-version = "57.3.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b"
+checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59"
 dependencies = [
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
  "chrono",
  "num-traits",
 ]
@@ -113,9 +148,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef"
 dependencies = [
  "ahash",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
+ "arrow-buffer 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-schema 57.3.0",
+ "chrono",
+ "half",
+ "hashbrown 0.16.1",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-array"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d"
+dependencies = [
+ "ahash",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
  "chrono",
  "half",
  "hashbrown 0.16.1",
@@ -136,18 +189,51 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "arrow-buffer"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4"
+dependencies = [
+ "bytes",
+ "half",
+ "num-bigint",
+ "num-traits",
+]
+
 [[package]]
 name = "arrow-cast"
 version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5"
 dependencies = [
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-ord",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-ord 57.3.0",
+ "arrow-schema 57.3.0",
+ "arrow-select 57.3.0",
+ "atoi",
+ "base64",
+ "chrono",
+ "half",
+ "lexical-core",
+ "num-traits",
+ "ryu",
+]
+
+[[package]]
+name = "arrow-cast"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d"
+dependencies = [
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-ord 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
  "atoi",
  "base64",
  "chrono",
@@ -163,8 +249,21 @@ version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304"
 dependencies = [
- "arrow-buffer",
- "arrow-schema",
+ "arrow-buffer 57.3.0",
+ "arrow-schema 57.3.0",
+ "half",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-data"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4"
+dependencies = [
+ "arrow-buffer 58.1.0",
+ "arrow-schema 58.1.0",
  "half",
  "num-integer",
  "num-traits",
@@ -176,11 +275,25 @@ version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1"
 dependencies = [
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-schema 57.3.0",
+ "arrow-select 57.3.0",
+ "flatbuffers",
+]
+
+[[package]]
+name = "arrow-ipc"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0"
+dependencies = [
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
  "flatbuffers",
 ]
 
@@ -190,11 +303,24 @@ version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b"
 dependencies = [
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-schema 57.3.0",
+ "arrow-select 57.3.0",
+]
+
+[[package]]
+name = "arrow-ord"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5"
+dependencies = [
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
 ]
 
 [[package]]
@@ -203,6 +329,12 @@ version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
 
+[[package]]
+name = "arrow-schema"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743"
+
 [[package]]
 name = "arrow-select"
 version = "57.3.0"
@@ -210,24 +342,38 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b"
 dependencies = [
  "ahash",
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-schema 57.3.0",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-select"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe"
+dependencies = [
+ "ahash",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
  "num-traits",
 ]
 
 [[package]]
 name = "arrow-string"
-version = "57.3.0"
+version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8"
+checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e"
 dependencies = [
- "arrow-array",
- "arrow-buffer",
- "arrow-data",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
  "memchr",
  "num-traits",
  "regex",
@@ -487,6 +633,16 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "cipher"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
+dependencies = [
+ "crypto-common",
+ "inout",
+]
+
 [[package]]
 name = "cmake"
 version = "0.1.57"
@@ -645,9 +801,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
 dependencies = [
  "generic-array",
+ "rand_core 0.6.4",
  "typenum",
 ]
 
+[[package]]
+name = "ctr"
+version = "0.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -1163,6 +1329,16 @@ dependencies = [
  "wasip3",
 ]
 
+[[package]]
+name = "ghash"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1"
+dependencies = [
+ "opaque-debug",
+ "polyval",
+]
+
 [[package]]
 name = "gloo-timers"
 version = "0.3.0"
@@ -1498,18 +1674,19 @@ dependencies = [
 [[package]]
 name = "iceberg"
 version = "0.9.0"
-source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=ff28348dff9e72ca7188a330508c65b637a57c53#ff28348dff9e72ca7188a330508c65b637a57c53"
+source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=418213731e91544f5eb31a3efa459e88f599030e#418213731e91544f5eb31a3efa459e88f599030e"
 dependencies = [
+ "aes-gcm",
  "anyhow",
  "apache-avro",
  "array-init",
  "arrow-arith",
- "arrow-array",
- "arrow-buffer",
- "arrow-cast",
- "arrow-ord",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-cast 58.1.0",
+ "arrow-ord 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
  "arrow-string",
  "as-any",
  "async-trait",
@@ -1530,7 +1707,7 @@ dependencies = [
  "once_cell",
  "opendal",
  "ordered-float 4.6.0",
- "parquet",
+ "parquet 58.1.0",
  "rand 0.8.5",
  "reqsign",
  "reqwest",
@@ -1547,13 +1724,14 @@ dependencies = [
  "typetag",
  "url",
  "uuid",
+ "zeroize",
  "zstd",
 ]
 
 [[package]]
 name = "iceberg-catalog-rest"
 version = "0.9.0"
-source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=ff28348dff9e72ca7188a330508c65b637a57c53#ff28348dff9e72ca7188a330508c65b637a57c53"
+source = "git+https://github.com/RelationalAI/iceberg-rust.git?rev=418213731e91544f5eb31a3efa459e88f599030e#418213731e91544f5eb31a3efa459e88f599030e"
 dependencies = [
  "async-trait",
  "chrono",
@@ -1565,20 +1743,19 @@ dependencies = [
  "serde_derive",
  "serde_json",
  "tokio",
- "tracing",
  "typed-builder",
  "uuid",
 ]
 
 [[package]]
 name = "iceberg_rust_ffi"
-version = "0.7.16"
+version = "0.7.17"
 dependencies = [
  "anyhow",
- "arrow-array",
- "arrow-buffer",
- "arrow-ipc",
- "arrow-schema",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-ipc 57.3.0",
+ "arrow-schema 57.3.0",
  "async-trait",
  "futures",
  "home",
@@ -1587,7 +1764,7 @@ dependencies = [
  "libc",
  "object_store_ffi",
  "once_cell",
- "parquet",
+ "parquet 57.3.0",
  "serde_json",
  "tempfile",
  "tokio",
@@ -1732,6 +1909,15 @@ dependencies = [
  "serde_core",
 ]
 
+[[package]]
+name = "inout"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "instant"
 version = "0.1.13"
@@ -2008,6 +2194,15 @@ dependencies = [
  "twox-hash",
 ]
 
+[[package]]
+name = "lz4_flex"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a"
+dependencies = [
+ "twox-hash",
+]
+
 [[package]]
 name = "matchers"
 version = "0.2.0"
@@ -2275,6 +2470,12 @@ dependencies = [
  "portable-atomic",
 ]
 
+[[package]]
+name = "opaque-debug"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
+
 [[package]]
 name = "opendal"
 version = "0.55.0"
@@ -2422,13 +2623,46 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb"
 dependencies = [
  "ahash",
- "arrow-array",
- "arrow-buffer",
- "arrow-cast",
- "arrow-data",
- "arrow-ipc",
- "arrow-schema",
- "arrow-select",
+ "arrow-array 57.3.0",
+ "arrow-buffer 57.3.0",
+ "arrow-cast 57.3.0",
+ "arrow-data 57.3.0",
+ "arrow-ipc 57.3.0",
+ "arrow-schema 57.3.0",
+ "arrow-select 57.3.0",
+ "base64",
+ "brotli",
+ "bytes",
+ "chrono",
+ "flate2",
+ "half",
+ "hashbrown 0.16.1",
+ "lz4_flex 0.12.0",
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+ "paste",
+ "seq-macro",
+ "simdutf8",
+ "snap",
+ "thrift",
+ "twox-hash",
+ "zstd",
+]
+
+[[package]]
+name = "parquet"
+version = "58.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877"
+dependencies = [
+ "ahash",
+ "arrow-array 58.1.0",
+ "arrow-buffer 58.1.0",
+ "arrow-data 58.1.0",
+ "arrow-ipc 58.1.0",
+ "arrow-schema 58.1.0",
+ "arrow-select 58.1.0",
  "base64",
  "brotli",
  "bytes",
@@ -2437,7 +2671,7 @@ dependencies = [
  "futures",
  "half",
  "hashbrown 0.16.1",
- "lz4_flex",
+ "lz4_flex 0.13.0",
  "num-bigint",
  "num-integer",
  "num-traits",
@@ -2501,6 +2735,18 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
+[[package]]
+name = "polyval"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "opaque-debug",
+ "universal-hash",
+]
+
 [[package]]
 name = "portable-atomic"
 version = "1.13.1"
@@ -3807,6 +4053,16 @@ version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
 
+[[package]]
+name = "universal-hash"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea"
+dependencies = [
+ "crypto-common",
+ "subtle",
+]
+
 [[package]]
 name = "untrusted"
 version = "0.9.0"
diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml
index c238db6..3d3dfb6 100644
--- a/iceberg_rust_ffi/Cargo.toml
+++ b/iceberg_rust_ffi/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "iceberg_rust_ffi"
-version = "0.7.16"
+version = "0.7.17"
 edition = "2021"
 
 [lib]
@@ -12,8 +12,8 @@ default = ["julia"]
 julia = []
 
 [dependencies]
-iceberg = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "ff28348dff9e72ca7188a330508c65b637a57c53", features = ["storage-azdls"] }
-iceberg-catalog-rest = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "ff28348dff9e72ca7188a330508c65b637a57c53" }
+iceberg = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "418213731e91544f5eb31a3efa459e88f599030e", features = ["storage-azdls"] }
+iceberg-catalog-rest = { git = "https://github.com/RelationalAI/iceberg-rust.git", rev = "418213731e91544f5eb31a3efa459e88f599030e" }
 object_store_ffi = { git = "https://github.com/RelationalAI/object_store_ffi", rev = "79b08071c7a1642532b5891253280861eca9e44e", default-features = false }
 tokio = { version = "1.0", features = ["full"] }
 futures = "0.3"

From 8c730b29cb3c375d26b587a3a37b5bbeaef64377 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 21 Apr 2026 19:47:16 +0200
Subject: [PATCH 02/25] Bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 5620c51..c0ec1e9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "RustyIceberg"
 uuid = "390bdf5b-b624-43dc-a846-0ef7a3405804"
-version = "0.6.9"
+version = "0.7.0"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

From 845fd0c05bde6bfbc93f0826e743d137ab3f1cc3 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 21 Apr 2026 20:06:37 +0200
Subject: [PATCH 03/25] Bump arrow/parquet deps to 58.1 to match iceberg-rust

iceberg-rust rev 418213731e91 depends on arrow/parquet v58.1.0;
keeping the FFI crate on 57.x caused two incompatible versions of
RecordBatch to be linked, failing to compile. Align all arrow-*
and parquet pins to "58.1".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 iceberg_rust_ffi/Cargo.lock | 260 ++++++++----------------------------
 iceberg_rust_ffi/Cargo.toml |  10 +-
 2 files changed, 58 insertions(+), 212 deletions(-)

diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock
index c36ad78..8070ddd 100644
--- a/iceberg_rust_ffi/Cargo.lock
+++ b/iceberg_rust_ffi/Cargo.lock
@@ -133,32 +133,14 @@ version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59"
 dependencies = [
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
  "chrono",
  "num-traits",
 ]
 
-[[package]]
-name = "arrow-array"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef"
-dependencies = [
- "ahash",
- "arrow-buffer 57.3.0",
- "arrow-data 57.3.0",
- "arrow-schema 57.3.0",
- "chrono",
- "half",
- "hashbrown 0.16.1",
- "num-complex",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "arrow-array"
 version = "58.1.0"
@@ -166,9 +148,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d"
 dependencies = [
  "ahash",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
  "chrono",
  "half",
  "hashbrown 0.16.1",
@@ -177,18 +159,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "arrow-buffer"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2"
-dependencies = [
- "bytes",
- "half",
- "num-bigint",
- "num-traits",
-]
-
 [[package]]
 name = "arrow-buffer"
 version = "58.1.0"
@@ -201,39 +171,18 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "arrow-cast"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5"
-dependencies = [
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-data 57.3.0",
- "arrow-ord 57.3.0",
- "arrow-schema 57.3.0",
- "arrow-select 57.3.0",
- "atoi",
- "base64",
- "chrono",
- "half",
- "lexical-core",
- "num-traits",
- "ryu",
-]
-
 [[package]]
 name = "arrow-cast"
 version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d"
 dependencies = [
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-ord 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-ord",
+ "arrow-schema",
+ "arrow-select",
  "atoi",
  "base64",
  "chrono",
@@ -243,112 +192,52 @@ dependencies = [
  "ryu",
 ]
 
-[[package]]
-name = "arrow-data"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304"
-dependencies = [
- "arrow-buffer 57.3.0",
- "arrow-schema 57.3.0",
- "half",
- "num-integer",
- "num-traits",
-]
-
 [[package]]
 name = "arrow-data"
 version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4"
 dependencies = [
- "arrow-buffer 58.1.0",
- "arrow-schema 58.1.0",
+ "arrow-buffer",
+ "arrow-schema",
  "half",
  "num-integer",
  "num-traits",
 ]
 
-[[package]]
-name = "arrow-ipc"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1"
-dependencies = [
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-data 57.3.0",
- "arrow-schema 57.3.0",
- "arrow-select 57.3.0",
- "flatbuffers",
-]
-
 [[package]]
 name = "arrow-ipc"
 version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0"
 dependencies = [
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
  "flatbuffers",
 ]
 
-[[package]]
-name = "arrow-ord"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b"
-dependencies = [
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-data 57.3.0",
- "arrow-schema 57.3.0",
- "arrow-select 57.3.0",
-]
-
 [[package]]
 name = "arrow-ord"
 version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5"
 dependencies = [
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
 ]
 
-[[package]]
-name = "arrow-schema"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
-
 [[package]]
 name = "arrow-schema"
 version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743"
 
-[[package]]
-name = "arrow-select"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b"
-dependencies = [
- "ahash",
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-data 57.3.0",
- "arrow-schema 57.3.0",
- "num-traits",
-]
-
 [[package]]
 name = "arrow-select"
 version = "58.1.0"
@@ -356,10 +245,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe"
 dependencies = [
  "ahash",
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
  "num-traits",
 ]
 
@@ -369,11 +258,11 @@ version = "58.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e"
 dependencies = [
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
  "memchr",
  "num-traits",
  "regex",
@@ -1681,12 +1570,12 @@ dependencies = [
  "apache-avro",
  "array-init",
  "arrow-arith",
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-cast 58.1.0",
- "arrow-ord 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-cast",
+ "arrow-ord",
+ "arrow-schema",
+ "arrow-select",
  "arrow-string",
  "as-any",
  "async-trait",
@@ -1707,7 +1596,7 @@ dependencies = [
  "once_cell",
  "opendal",
  "ordered-float 4.6.0",
- "parquet 58.1.0",
+ "parquet",
  "rand 0.8.5",
  "reqsign",
  "reqwest",
@@ -1752,10 +1641,10 @@ name = "iceberg_rust_ffi"
 version = "0.7.17"
 dependencies = [
  "anyhow",
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-ipc 57.3.0",
- "arrow-schema 57.3.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-ipc",
+ "arrow-schema",
  "async-trait",
  "futures",
  "home",
@@ -1764,7 +1653,7 @@ dependencies = [
  "libc",
  "object_store_ffi",
  "once_cell",
- "parquet 57.3.0",
+ "parquet",
  "serde_json",
  "tempfile",
  "tokio",
@@ -2185,15 +2074,6 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
 
-[[package]]
-name = "lz4_flex"
-version = "0.12.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e"
-dependencies = [
- "twox-hash",
-]
-
 [[package]]
 name = "lz4_flex"
 version = "0.13.0"
@@ -2616,40 +2496,6 @@ dependencies = [
  "windows-link",
 ]
 
-[[package]]
-name = "parquet"
-version = "57.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb"
-dependencies = [
- "ahash",
- "arrow-array 57.3.0",
- "arrow-buffer 57.3.0",
- "arrow-cast 57.3.0",
- "arrow-data 57.3.0",
- "arrow-ipc 57.3.0",
- "arrow-schema 57.3.0",
- "arrow-select 57.3.0",
- "base64",
- "brotli",
- "bytes",
- "chrono",
- "flate2",
- "half",
- "hashbrown 0.16.1",
- "lz4_flex 0.12.0",
- "num-bigint",
- "num-integer",
- "num-traits",
- "paste",
- "seq-macro",
- "simdutf8",
- "snap",
- "thrift",
- "twox-hash",
- "zstd",
-]
-
 [[package]]
 name = "parquet"
 version = "58.1.0"
@@ -2657,12 +2503,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877"
 dependencies = [
  "ahash",
- "arrow-array 58.1.0",
- "arrow-buffer 58.1.0",
- "arrow-data 58.1.0",
- "arrow-ipc 58.1.0",
- "arrow-schema 58.1.0",
- "arrow-select 58.1.0",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-ipc",
+ "arrow-schema",
+ "arrow-select",
  "base64",
  "brotli",
  "bytes",
@@ -2671,7 +2517,7 @@ dependencies = [
  "futures",
  "half",
  "hashbrown 0.16.1",
- "lz4_flex 0.13.0",
+ "lz4_flex",
  "num-bigint",
  "num-integer",
  "num-traits",
diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml
index 3d3dfb6..7b2cd92 100644
--- a/iceberg_rust_ffi/Cargo.toml
+++ b/iceberg_rust_ffi/Cargo.toml
@@ -19,11 +19,11 @@ tokio = { version = "1.0", features = ["full"] }
 futures = "0.3"
 libc = "0.2"
 anyhow = "1.0"
-arrow-array = "57.1"
-arrow-buffer = "57.1"
-arrow-schema = "57.1"
-arrow-ipc = "57.1"
-parquet = "57.1"
+arrow-array = "58.1"
+arrow-buffer = "58.1"
+arrow-schema = "58.1"
+arrow-ipc = "58.1"
+parquet = "58.1"
 tracing-subscriber = "0.3"
 tracing = "0.1"
 once_cell = "1.19"

From 029fe570ae1cec4fb12e5d4302f803b6a7060cd0 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 16:42:46 +0200
Subject: [PATCH 04/25] Add optimizations

---
 iceberg_rust_ffi/src/writer.rs         | 485 +++++++++++++++++++++----
 iceberg_rust_ffi/src/writer_columns.rs | 368 ++++++++++++++++---
 src/RustyIceberg.jl                    |   5 +-
 src/writer.jl                          | 370 ++++++++++++++++++-
 4 files changed, 1107 insertions(+), 121 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index efc194a..bfe95b0 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -1,12 +1,18 @@
 /// Writer support for iceberg_rust_ffi
 ///
-/// This module provides FFI bindings for the iceberg-rust Writer API,
-/// enabling Julia to write Arrow data to Parquet files and produce DataFiles
-/// for use with the Transaction API.
+/// Encoding is handled by a global pool of N=available_parallelism OS threads shared
+/// across all writers. Per-writer ordering is guaranteed by the per-writer
+/// `Arc<Mutex<ConcreteDataFileWriter>>` inside WriterState: only one pool thread encodes
+/// a given writer at a time, and the FIFO global queue ensures batches are submitted
+/// in order. Drain tasks are lightweight async forwarders (no CPU work).
 use std::ffi::{c_char, c_void};
 use std::io::Cursor;
-use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex, OnceLock};
+use std::panic::{catch_unwind, AssertUnwindSafe};
+use std::thread;
 
+use arrow_array::RecordBatch;
 use arrow_ipc::reader::StreamReader;
 use arrow_schema::SchemaRef as ArrowSchemaRef;
 use iceberg::arrow::schema_to_arrow_schema;
@@ -18,8 +24,10 @@ use iceberg::writer::file_writer::location_generator::{
 use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder;
 use iceberg::writer::file_writer::ParquetWriterBuilder;
 use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
-use parquet::basic::Compression;
-use parquet::file::properties::WriterProperties;
+use parquet::basic::{Compression, Encoding};
+use parquet::file::properties::{EnabledStatistics, WriterProperties};
+use tokio::sync::{mpsc, oneshot};
+use tokio::task::JoinHandle;
 
 /// Compression codec values (must match Julia's CompressionCodec enum)
 const COMPRESSION_UNCOMPRESSED: i32 = 0;
@@ -28,7 +36,6 @@ const COMPRESSION_GZIP: i32 = 2;
 const COMPRESSION_LZ4: i32 = 3;
 const COMPRESSION_ZSTD: i32 = 4;
 
-/// Convert FFI compression code to parquet Compression
 fn compression_from_code(code: i32) -> Compression {
     match code {
         COMPRESSION_UNCOMPRESSED => Compression::UNCOMPRESSED,
@@ -36,7 +43,52 @@ fn compression_from_code(code: i32) -> Compression {
         COMPRESSION_GZIP => Compression::GZIP(Default::default()),
         COMPRESSION_LZ4 => Compression::LZ4,
         COMPRESSION_ZSTD => Compression::ZSTD(Default::default()),
-        _ => Compression::SNAPPY, // Default to SNAPPY for unknown values
+        _ => Compression::SNAPPY,
+    }
+}
+
+/// Parquet writer properties passed from Julia (must match Julia's ParquetWriterProperties layout).
+/// Fields are ordered largest-to-smallest to avoid gaps in the repr(C) layout.
+/// Size fields use 0 to mean "use parquet default".
+#[repr(C)]
+pub struct ParquetWriterPropertiesFFI {
+    /// Maximum number of rows per row group (0 = parquet default: 1 048 576)
+    pub max_row_group_size: i64,
+    /// Target uncompressed data page size in bytes (0 = parquet default: 1 048 576)
+    pub data_page_size: i64,
+    /// Number of rows encoded per column chunk within a row group (0 = parquet default: 1024)
+    pub write_batch_size: i64,
+    /// Compression codec (see COMPRESSION_* constants)
+    pub compression_codec: i32,
+    /// Whether to enable dictionary encoding globally
+    pub dictionary_enabled: bool,
+    /// Force PLAIN encoding for all columns, bypassing DELTA_BINARY_PACKED default for INT64/INT32
+    pub use_plain_encoding: bool,
+    /// Collect per-page and per-row-group min/max statistics (default true; set false to skip)
+    pub statistics_enabled: bool,
+}
+
+impl ParquetWriterPropertiesFFI {
+    fn to_writer_properties(&self) -> WriterProperties {
+        let mut builder = WriterProperties::builder()
+            .set_compression(compression_from_code(self.compression_codec))
+            .set_dictionary_enabled(self.dictionary_enabled);
+        if self.max_row_group_size > 0 {
+            builder = builder.set_max_row_group_row_count(Some(self.max_row_group_size as usize));
+        }
+        if self.data_page_size > 0 {
+            builder = builder.set_data_page_size_limit(self.data_page_size as usize);
+        }
+        if self.write_batch_size > 0 {
+            builder = builder.set_write_batch_size(self.write_batch_size as usize);
+        }
+        if self.use_plain_encoding {
+            builder = builder.set_encoding(Encoding::PLAIN);
+        }
+        if !self.statistics_enabled {
+            builder = builder.set_statistics_enabled(EnabledStatistics::None);
+        }
+        builder.build()
     }
 }
 
@@ -44,6 +96,7 @@ use crate::response::IcebergBoxedResponse;
 use crate::table::IcebergTable;
 use crate::transaction::IcebergDataFiles;
 use crate::util::parse_c_string;
+use crate::writer_columns::{build_arrow_array_scattered, GatheredColumnDescriptor};
 use object_store_ffi::{
     export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
 };
@@ -52,12 +105,169 @@ use object_store_ffi::{
 type ConcreteDataFileWriter =
     DataFileWriter<ParquetWriterBuilder, DefaultLocationGenerator, DefaultFileNameGenerator>;
 
-/// Opaque writer handle for FFI
-/// Holds the DataFileWriter which can write RecordBatches and produce DataFiles
+/// Batch item sent through the per-writer drain channel.
+/// `ack` is `Some` only for the zero-copy path (`write_columns`), where Arrow arrays
+/// point directly into Julia memory — the pool worker sends on it after encoding so
+/// Julia knows those buffers are safe to reuse.
+pub(crate) type BatchItem = (RecordBatch, Option<oneshot::Sender<()>>);
+
+/// Encode task submitted to the global worker pool.
+struct EncodeTask {
+    batch: RecordBatch,
+    ack_opt: Option<oneshot::Sender<()>>,
+    state: Arc<WriterState>,
+}
+
+// Safety: RecordBatch is Send; WriterState fields are Send.
+unsafe impl Send for EncodeTask {}
+
+/// Shared mutable state for one IcebergDataFileWriter.
+/// Owned by the IcebergDataFileWriter and shared with pool workers via Arc.
+pub(crate) struct WriterState {
+    /// The underlying Parquet writer. Protected by a Mutex so pool workers can access it
+    /// concurrently (though at most one worker encodes a given writer at a time due to the
+    /// bounded per-writer channel). Set to None when the writer is closed or freed.
+    writer: Mutex<Option<ConcreteDataFileWriter>>,
+    /// Number of encode tasks submitted to the pool but not yet completed.
+    pending: AtomicUsize,
+    /// Notified when `pending` drops to zero, so iceberg_writer_close can wait efficiently.
+    done_notify: tokio::sync::Notify,
+    /// First encode error encountered by a pool worker, if any.
+    error: Mutex<Option<anyhow::Error>>,
+}
+
+// Safety: ConcreteDataFileWriter is Send (verified by its use in spawn_blocking previously).
+unsafe impl Send for WriterState {}
+unsafe impl Sync for WriterState {}
+
+/// Global pool of N=available_parallelism encode worker threads shared across all writers.
+struct GlobalWorkerPool {
+    task_tx: tokio::sync::mpsc::Sender<EncodeTask>,
+}
+
+static GLOBAL_ENCODE_POOL: OnceLock<GlobalWorkerPool> = OnceLock::new();
+
+/// Desired encode worker count. 0 means "use available_parallelism".
+/// Must be set before the first iceberg_writer_new call.
+static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0);
+
+/// Set the number of encode worker threads in the global pool.
+/// Must be called before creating any writers; has no effect afterwards.
+#[no_mangle]
+pub extern "C" fn iceberg_set_encode_workers(n: i32) {
+    if n > 0 {
+        ENCODE_WORKERS.store(n as usize, Ordering::Relaxed);
+    }
+}
+
+/// Initialize the global encode pool on first call.
+/// Must be called from within a Tokio runtime (iceberg_writer_new satisfies this).
+fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
+    GLOBAL_ENCODE_POOL.get_or_init(|| {
+        let configured = ENCODE_WORKERS.load(Ordering::Relaxed);
+        let n = if configured > 0 {
+            configured
+        } else {
+            // available_parallelism() only fails on unusual platforms (embedded, some sandboxes).
+            // On Linux/macOS/Windows it always succeeds, so the unwrap never fires in practice.
+            thread::available_parallelism().unwrap().get()
+        };
+        let handle = tokio::runtime::Handle::current();
+        // Buffer 2× workers — drain tasks are rarely blocked on submit.
+        let (task_tx, task_rx) = tokio::sync::mpsc::channel::<EncodeTask>(n * 2);
+        let task_rx = Arc::new(tokio::sync::Mutex::new(task_rx));
+
+        for i in 0..n {
+            let task_rx = task_rx.clone();
+            let handle = handle.clone();
+            thread::Builder::new()
+                .name(format!("iceberg-encode-{}", i))
+                .spawn(move || {
+                    loop {
+                        // Acquire the shared receiver lock, then wait for a task.
+                        // The lock is released as soon as recv() returns, so workers
+                        // are not serialized during encoding — only during task pickup.
+                        let task = {
+                            let mut rx = handle.block_on(task_rx.lock());
+                            match handle.block_on(rx.recv()) {
+                                Some(t) => t,
+                                None => break, // sender dropped → pool shutting down
+                            }
+                        };
+
+                        // Clone state before moving task into the closure so we can
+                        // always decrement pending even if the closure panics.
+                        let state = task.state.clone();
+
+                        let handle_enc = handle.clone();
+                        let encode_result = catch_unwind(AssertUnwindSafe(move || {
+                            let t0 = std::time::Instant::now();
+                            // Use unwrap_or_else to recover from a poisoned mutex
+                            // (which happens when a previous encode panicked mid-write).
+                            let result: anyhow::Result<()> = {
+                                let mut guard = task.state.writer
+                                    .lock()
+                                    .unwrap_or_else(|e| e.into_inner());
+                                match guard.as_mut() {
+                                    Some(w) => handle_enc
+                                        .block_on(w.write(task.batch))
+                                        .map_err(|e| anyhow::anyhow!("write batch: {}", e)),
+                                    None => Err(anyhow::anyhow!("writer already closed")),
+                                }
+                            };
+                            // ACK inside the closure: if we panic before reaching this,
+                            // the Sender is dropped and ack_rx returns RecvError, which
+                            // write_columns propagates as an error to Julia — correct behavior.
+                            if let Some(ack) = task.ack_opt {
+                                let _ = ack.send(());
+                            }
+                            result
+                        }));
+
+                        let err = match encode_result {
+                            Ok(Ok(())) => None,
+                            Ok(Err(e)) => Some(e),
+                            Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")),
+                        };
+                        if let Some(e) = err {
+                            let mut slot = state.error
+                                .lock()
+                                .unwrap_or_else(|e| e.into_inner());
+                            if slot.is_none() {
+                                *slot = Some(e);
+                            }
+                        }
+
+                        // Always decrement pending; notify close() if this was the last task.
+                        let prev = state.pending.fetch_sub(1, Ordering::AcqRel);
+                        if prev == 1 {
+                            state.done_notify.notify_one();
+                        }
+                    }
+                })
+                .expect("failed to spawn iceberg encode worker");
+        }
+
+        GlobalWorkerPool { task_tx }
+    })
+}
+
+/// Opaque writer handle for FFI.
+///
+/// Writing is pipelined: Julia gathers a RecordBatch, sends it to the bounded per-writer
+/// channel (capacity 1), and returns immediately. A lightweight async drain task forwards
+/// batches to the global encode pool. Pool workers (N = available_parallelism) encode
+/// Parquet concurrently across all active writers.
 pub struct IcebergDataFileWriter {
-    pub(crate) writer: Option<ConcreteDataFileWriter>,
-    /// The Arrow schema for this table, used by write_columns to create RecordBatches
+    /// Arrow schema for this table, used by write_columns to create RecordBatches.
     pub(crate) arrow_schema: ArrowSchemaRef,
+    /// Sender side of the per-writer batch queue (bounded, capacity 1).
+    /// Dropped in iceberg_writer_close to signal EOF to the drain task.
+    pub(crate) batch_tx: Option<mpsc::Sender<BatchItem>>,
+    /// Lightweight async drain task: forwards batches from batch_tx to the global pool.
+    pub(crate) drain_handle: Option<JoinHandle<()>>,
+    /// Shared state: owns the ConcreteDataFileWriter, tracks pending count and errors.
+    pub(crate) writer_state: Arc<WriterState>,
 }
 
 unsafe impl Send for IcebergDataFileWriter {}
@@ -69,25 +279,112 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse<IcebergDataFileWri
 /// Type alias for data files response (returns IcebergDataFiles handle)
 pub type IcebergWriterCloseResponse = IcebergBoxedResponse<IcebergDataFiles>;
 
-/// Free a writer
+/// Synchronous scatter-gather write: gathers all column data from Julia memory into
+/// Arrow arrays in the calling thread, submits the RecordBatch to the global encode
+/// pool, then returns immediately (encode is still async).
+///
+/// Unlike `iceberg_writer_write_scattered_columns` (which requires a Tokio async context
+/// and uses a callback), this function is designed for plain C `ccall` from Julia:
+/// - Julia keeps source arrays alive via `GC.@preserve` for the duration of this call.
+/// - After this function returns, all Julia pointers have been consumed; Julia may safely
+///   release the source data.
+/// - Encode is still asynchronous in the global pool; call `iceberg_writer_close` to
+///   wait for all pending encodes.
+///
+/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
+#[no_mangle]
+pub extern "C" fn iceberg_writer_write_scattered_columns_sync(
+    writer: *mut IcebergDataFileWriter,
+    columns: *const GatheredColumnDescriptor,
+    num_columns: usize,
+) -> i32 {
+    if writer.is_null() || columns.is_null() || num_columns == 0 {
+        return -1;
+    }
+
+    let writer_ref = unsafe { &*writer };
+
+    let pool = match GLOBAL_ENCODE_POOL.get() {
+        Some(p) => p,
+        None => {
+            eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first");
+            return -1;
+        }
+    };
+
+    let arrow_schema = writer_ref.arrow_schema.clone();
+    let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
+
+    if col_descs.len() != arrow_schema.fields().len() {
+        let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+        if slot.is_none() {
+            *slot = Some(anyhow::anyhow!(
+                "Column count mismatch: got {} but schema has {}",
+                col_descs.len(),
+                arrow_schema.fields().len()
+            ));
+        }
+        return -1;
+    }
+
+    // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread).
+    // After this block, all Julia pointers have been consumed — safe to release.
+    let t0 = std::time::Instant::now();
+    let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
+        let mut arrays = Vec::with_capacity(num_columns);
+        for (i, desc) in col_descs.iter().enumerate() {
+            arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? });
+        }
+        RecordBatch::try_new(arrow_schema, arrays)
+            .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
+    })() {
+        Ok(b) => b,
+        Err(e) => {
+            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() { *slot = Some(e); }
+            return -1;
+        }
+    };
+
+    // Increment pending before submit so iceberg_writer_close always accounts for this batch.
+    writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel);
+    let task = EncodeTask { batch, ack_opt: None, state: writer_ref.writer_state.clone() };
+
+    match pool.task_tx.blocking_send(task) {
+        Ok(()) => 0,
+        Err(_) => {
+            // Pool channel closed — undo the pending increment.
+            let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel);
+            if prev == 1 {
+                writer_ref.writer_state.done_notify.notify_one();
+            }
+            eprintln!("[iceberg:sync] pool channel closed");
+            -1
+        }
+    }
+}
+
+/// Free a writer. Aborts the drain task and poisons the writer state so any in-flight
+/// pool tasks fail gracefully (error/cancel path).
 #[no_mangle]
 pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) {
     if !writer.is_null() {
         unsafe {
-            let _ = Box::from_raw(writer);
+            let boxed = Box::from_raw(writer);
+            if let Some(handle) = boxed.drain_handle {
+                handle.abort();
+            }
+            // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error
+            // rather than writing to a partially-freed writer.
+            let _ = boxed.writer_state.writer.lock().unwrap().take();
         }
     }
 }
 
-// Create a new DataFileWriter from a table with configuration options
+// Create a new DataFileWriter from a table with configuration options.
 //
-// This creates the full writer chain:
-// ParquetWriterBuilder -> RollingFileWriterBuilder -> DataFileWriterBuilder -> build()
-//
-// Configuration options:
-// - `prefix`: File name prefix (e.g., "data" produces files like "data-xxx.parquet")
-// - `target_file_size_bytes`: Target size for rolling to a new file (0 = use default 512 MB)
-// - `compression_codec`: Compression codec (0=UNCOMPRESSED, 1=SNAPPY, 2=GZIP, 3=LZ4, 4=ZSTD)
+// Spawns a lightweight async drain task per writer. The global encode pool
+// (N = available_parallelism threads) is initialized on the first call.
 export_runtime_op!(
     iceberg_writer_new,
     IcebergDataFileWriterResponse,
@@ -95,14 +392,18 @@ export_runtime_op!(
         if table.is_null() {
             return Err(anyhow::anyhow!("Null table pointer provided"));
         }
+        if parquet_props.is_null() {
+            return Err(anyhow::anyhow!("Null parquet_props pointer provided"));
+        }
 
         let prefix_str = parse_c_string(prefix, "prefix")?;
         let table_ref = unsafe { &*table };
-        Ok((table_ref, prefix_str, target_file_size_bytes, compression_codec))
+        let props = unsafe { &*parquet_props };
+        Ok((table_ref, prefix_str, target_file_size_bytes, props.to_writer_properties()))
     },
     result_tuple,
     async {
-        let (table_ref, prefix_str, target_file_size_bytes, compression_codec) = result_tuple;
+        let (table_ref, prefix_str, target_file_size_bytes, writer_props) = result_tuple;
         let table = &table_ref.table;
 
         // Create LocationGenerator from table metadata
@@ -116,12 +417,6 @@ export_runtime_op!(
             DataFileFormat::Parquet,
         );
 
-        // Create WriterProperties with compression
-        let compression = compression_from_code(compression_codec);
-        let writer_props = WriterProperties::builder()
-            .set_compression(compression)
-            .build();
-
         // Create ParquetWriterBuilder with table schema and configured properties
         let parquet_writer_builder = ParquetWriterBuilder::new(
             writer_props,
@@ -146,11 +441,8 @@ export_runtime_op!(
             )
         };
 
-        // Create DataFileWriterBuilder
-        let data_file_writer_builder = DataFileWriterBuilder::new(rolling_file_writer_builder);
-
-        // Build the writer (no partition key for basic writes)
-        let writer = data_file_writer_builder
+        // Build the concrete DataFileWriter
+        let concrete_writer = DataFileWriterBuilder::new(rolling_file_writer_builder)
             .build(None)
             .await
             .map_err(|e| anyhow::anyhow!("Failed to build data file writer: {}", e))?;
@@ -161,21 +453,54 @@ export_runtime_op!(
                 .map_err(|e| anyhow::anyhow!("Failed to convert schema to Arrow: {}", e))?
         );
 
+        // Initialize global pool (no-op if already running).
+        let pool = get_or_init_encode_pool();
+
+        let writer_state = Arc::new(WriterState {
+            writer: Mutex::new(Some(concrete_writer)),
+            pending: AtomicUsize::new(0),
+            done_notify: tokio::sync::Notify::new(),
+            error: Mutex::new(None),
+        });
+
+        // Bounded per-writer channel (capacity 1): Julia can be at most one batch ahead
+        // of the pool. This provides backpressure: send blocks if the drain task hasn't
+        // forwarded the previous batch yet.
+        let (batch_tx, mut batch_rx) = mpsc::channel::<BatchItem>(1);
+
+        // Lightweight drain task: forwards batches to the global pool asynchronously.
+        let drain_handle: JoinHandle<()> = {
+            let state = writer_state.clone();
+            let pool_tx = pool.task_tx.clone();
+            tokio::task::spawn(async move {
+                while let Some((batch, ack_opt)) = batch_rx.recv().await {
+                    state.pending.fetch_add(1, Ordering::AcqRel);
+                    let task = EncodeTask { batch, ack_opt, state: state.clone() };
+                    if pool_tx.send(task).await.is_err() {
+                        eprintln!("[iceberg:drain] pool channel closed");
+                        break;
+                    }
+                }
+            })
+        };
+
         Ok::<IcebergDataFileWriter, anyhow::Error>(IcebergDataFileWriter {
-            writer: Some(writer),
             arrow_schema,
+            batch_tx: Some(batch_tx),
+            drain_handle: Some(drain_handle),
+            writer_state,
         })
     },
     table: *mut IcebergTable,
     prefix: *const c_char,
     target_file_size_bytes: i64,
-    compression_codec: i32
+    parquet_props: *const ParquetWriterPropertiesFFI
 );
 
-// Write Arrow IPC data to the writer
+// Write Arrow IPC data to the writer.
 //
-// The data must be in Arrow IPC stream format (as produced by Arrow.tobuffer in Julia).
-// This deserializes the IPC data to RecordBatch and writes it to the Parquet file.
+// Deserializes the IPC stream, then sends each RecordBatch to the drain task's channel.
+// Returns immediately after queuing — encoding happens asynchronously in the global pool.
 export_runtime_op!(
     iceberg_writer_write,
     crate::IcebergResponse,
@@ -202,25 +527,22 @@ export_runtime_op!(
     async {
         let (writer_ref, ipc_bytes) = result_tuple;
 
-        // Deserialize Arrow IPC to RecordBatch
+        let tx = writer_ref
+            .batch_tx
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?;
+
+        // Deserialize Arrow IPC to RecordBatches and enqueue each one
         let cursor = Cursor::new(ipc_bytes);
         let mut reader = StreamReader::try_new(cursor, None)
             .map_err(|e| anyhow::anyhow!("Failed to create Arrow IPC reader: {}", e))?;
 
-        // Get the writer
-        let writer = writer_ref
-            .writer
-            .as_mut()
-            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?;
-
-        // Read and write all batches from the IPC stream
         while let Some(batch_result) = reader.next() {
             let batch = batch_result
                 .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?;
-            writer
-                .write(batch)
-                .await
-                .map_err(|e| anyhow::anyhow!("Failed to write batch: {}", e))?;
+            // IPC data already copied into ipc_bytes above — no ACK needed.
+            tx.send((batch, None)).await
+                .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?;
         }
 
         Ok::<(), anyhow::Error>(())
@@ -230,12 +552,11 @@ export_runtime_op!(
     arrow_ipc_len: usize
 );
 
-// Close the writer and return the produced DataFiles
-//
-// This flushes any remaining data, closes the Parquet file(s), and returns
-// the DataFiles metadata that can be used with fast_append! in a Transaction.
+// Close the writer and return the produced DataFiles.
 //
-// After calling this, the writer cannot be used again.
+// Drops the batch sender (signals EOF to the drain task), waits for the drain task to
+// finish forwarding all batches, waits for all pool encodes to complete, then finalizes
+// the Parquet file and returns the DataFiles metadata.
 export_runtime_op!(
     iceberg_writer_close,
     IcebergWriterCloseResponse,
@@ -248,14 +569,54 @@ export_runtime_op!(
     },
     writer_ref,
     async {
-        // Take the writer out
-        let mut writer = writer_ref
+        // Drop the sender — signals the drain task that no more batches are coming
+        drop(writer_ref.batch_tx.take());
+
+        // Wait for the drain task to finish forwarding all batches to the pool
+        if let Some(handle) = writer_ref.drain_handle.take() {
+            handle.await
+                .map_err(|e| anyhow::anyhow!("Drain task panicked: {}", e))?;
+        }
+
+        // Wait for all pending pool encodes to complete.
+        // Uses a timeout to guard against a dead worker thread (e.g. panic outside
+        // catch_unwind) that would otherwise leave pending > 0 forever.
+        // Tokio's Notify preserves the notification if notify_one() fired before
+        // notified() is polled, so the check-then-wait sequence is race-free.
+        let state = &writer_ref.writer_state;
+        let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(120);
+        loop {
+            if state.pending.load(Ordering::Acquire) == 0 {
+                break;
+            }
+            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+            if remaining.is_zero() {
+                return Err(anyhow::anyhow!(
+                    "Timed out waiting for {} encode task(s) to complete; \
+                     an encode worker may have crashed",
+                    state.pending.load(Ordering::Acquire)
+                ));
+            }
+            tokio::select! {
+                _ = state.done_notify.notified() => {}
+                _ = tokio::time::sleep(remaining) => {}
+            }
+        }
+
+        // Propagate any encode error
+        if let Some(e) = state.error.lock().unwrap().take() {
+            return Err(e);
+        }
+
+        // Take the concrete writer and finalize the Parquet file
+        let mut concrete = state
             .writer
+            .lock()
+            .unwrap()
             .take()
-            .ok_or_else(|| anyhow::anyhow!("Writer has already been closed"))?;
+            .ok_or_else(|| anyhow::anyhow!("Writer already closed"))?;
 
-        // Close the writer to get the DataFiles
-        let data_files = writer
+        let data_files = concrete
             .close()
             .await
             .map_err(|e| anyhow::anyhow!("Failed to close writer: {}", e))?;
diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index f892d0c..0f61849 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -4,6 +4,7 @@
 /// avoiding the overhead of Arrow IPC serialization. Julia passes raw column pointers
 /// and metadata, and Rust builds Arrow arrays directly from them.
 use std::ffi::c_void;
+use std::ptr::NonNull;
 use std::sync::Arc;
 
 use arrow_array::{
@@ -13,11 +14,26 @@ use arrow_array::{
     },
     ArrayRef, BooleanArray, PrimitiveArray, RecordBatch, StringArray,
 };
-use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
+use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
+
+/// No-op allocator: Arrow buffers created from Julia memory must not be freed by Rust.
+/// Julia owns the memory until the per-batch oneshot ACK is received (write_columns path).
+struct NoopAllocation;
+
+/// Build a zero-copy `ScalarBuffer<T>` pointing directly into Julia-owned memory.
+/// Safety: `ptr` must be non-null, valid, and alive until the buffer is dropped.
+unsafe fn zero_copy_scalar_buffer<T: ArrowNativeType>(
+    ptr: *const T,
+    num_rows: usize,
+) -> ScalarBuffer<T> {
+    let byte_len = num_rows * std::mem::size_of::<T>();
+    let nn_ptr = NonNull::new_unchecked(ptr as *mut u8);
+    let buf = Buffer::from_custom_allocation(nn_ptr, byte_len, Arc::new(NoopAllocation));
+    ScalarBuffer::new(buf, 0, num_rows)
+}
 use arrow_schema::DataType;
 
 use crate::writer::IcebergDataFileWriter;
-use iceberg::writer::IcebergWriter;
 use object_store_ffi::{
     export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
 };
@@ -72,52 +88,42 @@ unsafe fn build_arrow_array(
     schema_field: &arrow_schema::Field,
 ) -> Result<ArrayRef, anyhow::Error> {
     let null_buffer = if desc.is_nullable && !desc.validity_ptr.is_null() {
-        // Julia's BitVector stores bits packed in UInt64 chunks, which we can use directly
-        // since Arrow also uses little-endian bit-packed format.
-        // We just need to copy the right number of bytes.
+        // Julia's BitVector uses the same little-endian bit-packed layout as Arrow.
+        // Point directly into Julia's buffer — no copy needed.
         let num_bytes = (desc.num_rows + 7) / 8;
-        let validity_slice = std::slice::from_raw_parts(desc.validity_ptr, num_bytes);
-        Some(NullBuffer::new(BooleanBuffer::new(
-            Buffer::from(validity_slice.to_vec()),
-            0,
-            desc.num_rows,
-        )))
+        let nn_ptr = NonNull::new_unchecked(desc.validity_ptr as *mut u8);
+        let buf = Buffer::from_custom_allocation(nn_ptr, num_bytes, Arc::new(NoopAllocation));
+        Some(NullBuffer::new(BooleanBuffer::new(buf, 0, desc.num_rows)))
     } else {
         None
     };
 
     let array: ArrayRef = match desc.column_type {
         COLUMN_TYPE_INT32 => {
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows);
             Arc::new(PrimitiveArray::<Int32Type>::new(buffer, null_buffer))
         }
         COLUMN_TYPE_INT64 => {
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
             Arc::new(PrimitiveArray::<Int64Type>::new(buffer, null_buffer))
         }
         COLUMN_TYPE_FLOAT32 => {
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const f32, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f32, desc.num_rows);
             Arc::new(PrimitiveArray::<Float32Type>::new(buffer, null_buffer))
         }
         COLUMN_TYPE_FLOAT64 => {
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const f64, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f64, desc.num_rows);
             Arc::new(PrimitiveArray::<Float64Type>::new(buffer, null_buffer))
         }
         COLUMN_TYPE_DATE => {
             // Date is stored as Int32 (days since epoch)
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows);
             Arc::new(PrimitiveArray::<Date32Type>::new(buffer, null_buffer))
         }
         COLUMN_TYPE_TIMESTAMP => {
             // Timestamp without timezone (Iceberg `timestamp`)
             // Stored as Int64 microseconds since epoch
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
             Arc::new(PrimitiveArray::<TimestampMicrosecondType>::new(
                 buffer,
                 null_buffer,
@@ -126,8 +132,7 @@ unsafe fn build_arrow_array(
         COLUMN_TYPE_TIMESTAMPTZ => {
             // Timestamp with UTC timezone (Iceberg `timestamptz`)
             // Stored as Int64 microseconds since epoch, with timezone metadata
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows);
-            let buffer = ScalarBuffer::from(data.to_vec());
+            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
             Arc::new(
                 PrimitiveArray::<TimestampMicrosecondType>::new(buffer, null_buffer)
                     .with_timezone("UTC"),
@@ -204,28 +209,26 @@ unsafe fn build_arrow_array(
                 }
             };
 
-            // Convert the backing integer representation to i128 values
-            let i128_values: Vec<i128> = match desc.column_type {
+            // All decimal variants produce an Arrow Decimal128 (i128 backing).
+            // INT32/INT64 variants need widening — materialize to avoid a cast loop
+            // in the zero-copy path; INT128 shares the same layout and is zero-copy.
+            let buffer: ScalarBuffer<i128> = match desc.column_type {
                 COLUMN_TYPE_DECIMAL_INT32 => {
                     let data =
                         std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows);
-                    data.iter().map(|&v| v as i128).collect()
+                    ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::<Vec<_>>())
                 }
                 COLUMN_TYPE_DECIMAL_INT64 => {
                     let data =
                         std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows);
-                    data.iter().map(|&v| v as i128).collect()
+                    ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::<Vec<_>>())
                 }
                 _ => {
                     // COLUMN_TYPE_DECIMAL_INT128: Julia Int128 and Rust i128 share the same
-                    // 16-byte little-endian layout on all supported platforms.
-                    let data =
-                        std::slice::from_raw_parts(desc.data_ptr as *const i128, desc.num_rows);
-                    data.to_vec()
+                    // 16-byte little-endian layout on all supported platforms — zero-copy.
+                    zero_copy_scalar_buffer(desc.data_ptr as *const i128, desc.num_rows)
                 }
             };
-
-            let buffer = ScalarBuffer::<i128>::from(i128_values);
             Arc::new(
                 PrimitiveArray::<Decimal128Type>::new(buffer, null_buffer)
                     .with_precision_and_scale(precision, scale)
@@ -240,6 +243,272 @@ unsafe fn build_arrow_array(
     Ok(array)
 }
 
+// =============================================================================
+// Scattered-gather writer: pass raw source pointers + selection indices to Rust,
+// which gathers the data directly into Arrow arrays — eliminating the Julia-side
+// staging copy for non-converting numeric column types.
+// =============================================================================
+
+/// A reference to one slice of source column data.
+/// `sel_ptr = null`  → sequential (identity) access: read data[0..len].
+/// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices).
+/// `validity_ptr = null` → all rows valid (non-nullable or known all-valid slice).
+/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[].
+/// Fields are all 8 bytes — no padding, total 40 bytes.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct SliceRef {
+    pub data_ptr: *const c_void,
+    pub lengths_ptr: *const i64,
+    pub validity_ptr: *const u8,
+    pub sel_ptr: *const i64,
+    pub len: usize,
+}
+
+unsafe impl Send for SliceRef {}
+unsafe impl Sync for SliceRef {}
+
+/// Gathered column descriptor: gather `num_slices` SliceRefs into one Arrow column.
+/// `total_rows` must equal the sum of all `slice.len` values.
+/// Fields ordered largest-to-smallest; 3 bytes trailing padding → 32 bytes total.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct GatheredColumnDescriptor {
+    pub slices: *const SliceRef,
+    pub num_slices: usize,
+    pub total_rows: usize,
+    pub column_type: i32,
+    pub is_nullable: bool,
+}
+
+unsafe impl Send for GatheredColumnDescriptor {}
+unsafe impl Sync for GatheredColumnDescriptor {}
+
+/// Build the Arrow null buffer by scanning validity bits across all slices.
+/// Returns `None` if every slice has a null `validity_ptr` (all rows valid).
+unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) -> Option<NullBuffer> {
+    if !slices.iter().any(|s| !s.validity_ptr.is_null()) {
+        return None;
+    }
+    let mut bits = vec![0u8; (total_rows + 7) / 8];
+    let mut out = 0usize;
+    for slice in slices {
+        if slice.validity_ptr.is_null() {
+            // All valid — set bits 1
+            for i in 0..slice.len {
+                bits[(out + i) / 8] |= 1u8 << ((out + i) % 8);
+            }
+        } else if slice.sel_ptr.is_null() {
+            // Sequential: copy bits with possible alignment shift
+            for i in 0..slice.len {
+                let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1;
+                bits[(out + i) / 8] |= b << ((out + i) % 8);
+            }
+        } else {
+            // Scattered: gather validity bits via selection indices (1-based)
+            let sel = std::slice::from_raw_parts(slice.sel_ptr, slice.len);
+            for (i, &idx) in sel.iter().enumerate() {
+                let src = (idx - 1) as usize;
+                let b = (*slice.validity_ptr.add(src / 8) >> (src % 8)) & 1;
+                bits[(out + i) / 8] |= b << ((out + i) % 8);
+            }
+        }
+        out += slice.len;
+    }
+    Some(NullBuffer::new(BooleanBuffer::new(Buffer::from(bits), 0, total_rows)))
+}
+
+/// Gather all slices for a column into an Arrow array.
+pub(crate) unsafe fn build_arrow_array_scattered(
+    desc: &GatheredColumnDescriptor,
+    schema_field: &arrow_schema::Field,
+) -> Result<ArrayRef, anyhow::Error> {
+    let slices = std::slice::from_raw_parts(desc.slices, desc.num_slices);
+    let total = desc.total_rows;
+    let null_buf = if desc.is_nullable {
+        build_null_buffer_scattered(slices, total)
+    } else {
+        None
+    };
+
+    // Macro gathers a primitive numeric type from all slices.
+    // sel_ptr=null → sequential copy; sel_ptr!=null → indirect gather (1-based indices).
+    macro_rules! gather_primitive {
+        ($T:ty, $ArrowType:ty) => {{
+            let mut values = Vec::<$T>::with_capacity(total);
+            for slice in slices {
+                let src = slice.data_ptr as *const $T;
+                if slice.sel_ptr.is_null() {
+                    values.extend_from_slice(std::slice::from_raw_parts(src, slice.len));
+                } else {
+                    for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                        values.push(*src.add((idx - 1) as usize));
+                    }
+                }
+            }
+            Arc::new(PrimitiveArray::<$ArrowType>::new(ScalarBuffer::from(values), null_buf))
+                as ArrayRef
+        }};
+    }
+
+    let array: ArrayRef = match desc.column_type {
+        COLUMN_TYPE_INT32 => gather_primitive!(i32, Int32Type),
+        COLUMN_TYPE_INT64 => gather_primitive!(i64, Int64Type),
+        COLUMN_TYPE_FLOAT32 => gather_primitive!(f32, Float32Type),
+        COLUMN_TYPE_FLOAT64 => gather_primitive!(f64, Float64Type),
+        COLUMN_TYPE_DATE => gather_primitive!(i32, Date32Type),
+        COLUMN_TYPE_TIMESTAMP => {
+            let mut values = Vec::<i64>::with_capacity(total);
+            for slice in slices {
+                let src = slice.data_ptr as *const i64;
+                if slice.sel_ptr.is_null() {
+                    values.extend_from_slice(std::slice::from_raw_parts(src, slice.len));
+                } else {
+                    for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                        values.push(*src.add((idx - 1) as usize));
+                    }
+                }
+            }
+            Arc::new(PrimitiveArray::<TimestampMicrosecondType>::new(
+                ScalarBuffer::from(values),
+                null_buf,
+            ))
+        }
+        COLUMN_TYPE_TIMESTAMPTZ => {
+            let mut values = Vec::<i64>::with_capacity(total);
+            for slice in slices {
+                let src = slice.data_ptr as *const i64;
+                if slice.sel_ptr.is_null() {
+                    values.extend_from_slice(std::slice::from_raw_parts(src, slice.len));
+                } else {
+                    for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                        values.push(*src.add((idx - 1) as usize));
+                    }
+                }
+            }
+            Arc::new(
+                PrimitiveArray::<TimestampMicrosecondType>::new(ScalarBuffer::from(values), null_buf)
+                    .with_timezone("UTC"),
+            )
+        }
+        COLUMN_TYPE_BOOLEAN => {
+            let mut bits = vec![0u8; (total + 7) / 8];
+            let mut out = 0usize;
+            for slice in slices {
+                let src = slice.data_ptr as *const u8;
+                if slice.sel_ptr.is_null() {
+                    let data = std::slice::from_raw_parts(src, slice.len);
+                    for (i, &v) in data.iter().enumerate() {
+                        if v != 0 { bits[(out + i) / 8] |= 1 << ((out + i) % 8); }
+                    }
+                } else {
+                    for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len).iter().enumerate() {
+                        if *src.add((idx - 1) as usize) != 0 {
+                            bits[(out + i) / 8] |= 1 << ((out + i) % 8);
+                        }
+                    }
+                }
+                out += slice.len;
+            }
+            Arc::new(BooleanArray::new(BooleanBuffer::new(Buffer::from(bits), 0, total), null_buf))
+        }
+        COLUMN_TYPE_STRING => {
+            // Strings are always pre-staged in Julia; data_ptr = *const *const u8,
+            // lengths_ptr = *const i64, sel_ptr is always null (identity).
+            let mut all_strings: Vec<Option<&str>> = Vec::with_capacity(total);
+            for slice in slices {
+                if slice.lengths_ptr.is_null() {
+                    return Err(anyhow::anyhow!("String column requires lengths_ptr"));
+                }
+                let ptrs = std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len);
+                let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len);
+                for i in 0..slice.len {
+                    let is_null = if !slice.validity_ptr.is_null() {
+                        (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 == 0
+                    } else {
+                        false
+                    };
+                    if is_null {
+                        all_strings.push(None);
+                    } else {
+                        let s = std::str::from_utf8(std::slice::from_raw_parts(ptrs[i], lens[i] as usize))
+                            .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?;
+                        all_strings.push(Some(s));
+                    }
+                }
+            }
+            Arc::new(StringArray::from(all_strings))
+        }
+        COLUMN_TYPE_UUID => {
+            let mut data: Vec<u8> = Vec::with_capacity(total * 16);
+            for slice in slices {
+                let src = slice.data_ptr as *const u8;
+                if slice.sel_ptr.is_null() {
+                    data.extend_from_slice(std::slice::from_raw_parts(src, slice.len * 16));
+                } else {
+                    for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                        data.extend_from_slice(std::slice::from_raw_parts(src.add((idx - 1) as usize * 16), 16));
+                    }
+                }
+            }
+            let chunks: Vec<&[u8]> = data.chunks(16).collect();
+            Arc::new(
+                arrow_array::FixedSizeBinaryArray::try_from_iter(chunks.into_iter())
+                    .map_err(|e| anyhow::anyhow!("Failed to build UUID array: {}", e))?,
+            )
+        }
+        COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => {
+            let (precision, scale) = match schema_field.data_type() {
+                arrow_schema::DataType::Decimal128(p, s) => (*p, *s),
+                dt => return Err(anyhow::anyhow!("Expected Decimal128, got {:?}", dt)),
+            };
+            let mut values = Vec::<i128>::with_capacity(total);
+            for slice in slices {
+                match desc.column_type {
+                    COLUMN_TYPE_DECIMAL_INT32 => {
+                        let src = slice.data_ptr as *const i32;
+                        if slice.sel_ptr.is_null() {
+                            values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128));
+                        } else {
+                            for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                                values.push(*src.add((idx - 1) as usize) as i128);
+                            }
+                        }
+                    }
+                    COLUMN_TYPE_DECIMAL_INT64 => {
+                        let src = slice.data_ptr as *const i64;
+                        if slice.sel_ptr.is_null() {
+                            values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128));
+                        } else {
+                            for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                                values.push(*src.add((idx - 1) as usize) as i128);
+                            }
+                        }
+                    }
+                    _ => {
+                        // DECIMAL_INT128: i128 layout matches Julia Int128
+                        let src = slice.data_ptr as *const i128;
+                        if slice.sel_ptr.is_null() {
+                            values.extend_from_slice(std::slice::from_raw_parts(src, slice.len));
+                        } else {
+                            for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
+                                values.push(*src.add((idx - 1) as usize));
+                            }
+                        }
+                    }
+                }
+            }
+            Arc::new(
+                PrimitiveArray::<Decimal128Type>::new(ScalarBuffer::from(values), null_buf)
+                    .with_precision_and_scale(precision, scale)
+                    .map_err(|e| anyhow::anyhow!("Decimal precision/scale: {}", e))?,
+            )
+        }
+        _ => return Err(anyhow::anyhow!("Unknown column type: {}", desc.column_type)),
+    };
+    Ok(array)
+}
+
 // Write columns directly to the Parquet writer.
 // Accepts an array of ColumnDescriptors and builds a RecordBatch from them,
 // then writes to the underlying Parquet writer.
@@ -267,7 +536,6 @@ export_runtime_op!(
     async {
         let (writer_ref, cols) = result_tuple;
 
-        // Get the writer's schema (stored when writer was created)
         let arrow_schema = writer_ref.arrow_schema.clone();
 
         // Validate column count matches schema
@@ -279,30 +547,32 @@ export_runtime_op!(
             ));
         }
 
-        // Get the writer
-        let iceberg_writer = writer_ref
-            .writer
-            .as_mut()
-            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?;
-
-        // Build Arrow arrays from column descriptors
+        // Build Arrow arrays from column descriptors.
+        // These arrays are zero-copy: they hold raw pointers into Julia-owned buffers.
         let mut arrays: Vec<ArrayRef> = Vec::with_capacity(cols.len());
-
         for (i, desc) in cols.iter().enumerate() {
             let schema_field = arrow_schema.field(i);
             let array = unsafe { build_arrow_array(desc, schema_field)? };
             arrays.push(array);
         }
 
-        // Create record batch using the table's Arrow schema (with proper field IDs)
         let batch = RecordBatch::try_new(arrow_schema, arrays)
             .map_err(|e| anyhow::anyhow!("Failed to create RecordBatch: {}", e))?;
 
-        // Write the batch
-        iceberg_writer
-            .write(batch)
+        // Arrow arrays are zero-copy (pointing into Julia buffers) — we must not return
+        // until Rust has released them. Use a per-batch oneshot: the drain task signals
+        // after encoding, at which point the Arrow arrays have been dropped.
+        let (ack_tx, ack_rx) = tokio::sync::oneshot::channel::<()>();
+        writer_ref
+            .batch_tx
+            .as_ref()
+            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?
+            .send((batch, Some(ack_tx)))
+            .await
+            .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?;
+        ack_rx
             .await
-            .map_err(|e| anyhow::anyhow!("Failed to write batch: {}", e))?;
+            .map_err(|_| anyhow::anyhow!("Ack channel closed (drain task may have failed)"))?;
 
         Ok::<(), anyhow::Error>(())
     },
diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl
index 8e29037..85b1ee4 100644
--- a/src/RustyIceberg.jl
+++ b/src/RustyIceberg.jl
@@ -33,13 +33,14 @@ export IcebergTimestampNs, IcebergTimestamptzNs
 export IcebergString, IcebergUuid, IcebergBinary, IcebergDecimal
 export Transaction, DataFiles, free_transaction!, free_data_files!, commit, transaction
 export FastAppendAction, free_fast_append_action!, add_data_files, apply, with_fast_append
-export DataFileWriter, free_writer!, close_writer, write_columns
+export DataFileWriter, free_writer!, close_writer, write_columns, set_encode_workers!
 export WriterConfig, CompressionCodec, UNCOMPRESSED, SNAPPY, GZIP, LZ4, ZSTD
-export ColumnDescriptor, ColumnType
+export ColumnDescriptor, ColumnBatch, ColumnType
 export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FLOAT64
 export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID
 export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128
 export julia_type_to_column_type
+export GatheredColumn, GatheredBatch, add_slice!, add_string_slice!, write_scattered_columns_sync
 
 # Always use the JLL library - override via Preferences if needed for local development
 # To use a local build, set the preference:
diff --git a/src/writer.jl b/src/writer.jl
index 563d01a..6abab6e 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -36,16 +36,24 @@ Configuration options for the DataFileWriter.
 
 # Fields
 - `prefix::String`: Prefix for output file names (default: "data")
-- `target_file_size_bytes::Int`: Target size for rolling to a new file (default: 512 MB)
-- `compression::CompressionCodec`: Compression codec for Parquet files (default: UNCOMPRESSED)
+- `target_file_size_bytes::Int`: Target file size before rolling to a new file (default: 512 MB)
+- `compression::CompressionCodec`: Parquet compression codec (default: UNCOMPRESSED)
+- `dictionary_enabled::Bool`: Enable Parquet dictionary encoding globally (default: true)
+- `max_row_group_size::Int`: Maximum rows per Parquet row group, 0 = parquet default (1 048 576)
+- `data_page_size::Int`: Target uncompressed data page size in bytes, 0 = parquet default (1 048 576)
+- `write_batch_size::Int`: Rows encoded per column chunk within a row group, 0 = parquet default (1024).
+  Increasing this (e.g. 65536) reduces encoding overhead for large row groups.
+- `plain_encoding::Bool`: Force PLAIN encoding for all columns, bypassing the DELTA_BINARY_PACKED
+  default that parquet-rs uses for INT64/INT32 under PARQUET_2_0 (default: false).
 
 # Example
 ```julia
-# Override defaults: use smaller file size and ZSTD compression
 config = WriterConfig(
     prefix = "my_data",
-    target_file_size_bytes = 128 * 1024 * 1024,  # 128 MB (default is 512 MB)
-    compression = ZSTD
+    compression = ZSTD,
+    dictionary_enabled = false,  # disable dict encoding for benchmarking
+    plain_encoding = true,       # use PLAIN to avoid DELTA_BINARY_PACKED overhead
+    write_batch_size = 65536,    # larger batches for better throughput
 )
 writer = DataFileWriter(table, config)
 ```
@@ -54,6 +62,24 @@ writer = DataFileWriter(table, config)
     prefix::String = "data"
     target_file_size_bytes::Int = DEFAULT_TARGET_FILE_SIZE_BYTES
     compression::CompressionCodec = UNCOMPRESSED
+    dictionary_enabled::Bool = true
+    max_row_group_size::Int = 0
+    data_page_size::Int = 0
+    write_batch_size::Int = 0
+    plain_encoding::Bool = false
+    statistics_enabled::Bool = true
+end
+
+# C-layout struct matching ParquetWriterPropertiesFFI in Rust.
+# Fields are ordered largest-to-smallest to match repr(C) padding rules.
+struct ParquetWriterPropertiesFFI
+    max_row_group_size::Int64
+    data_page_size::Int64
+    write_batch_size::Int64
+    compression_codec::Int32
+    dictionary_enabled::Bool
+    use_plain_encoding::Bool
+    statistics_enabled::Bool
 end
 
 """
@@ -120,6 +146,21 @@ function get_column_metadata(table::Table)::Dict{Symbol, Vector{Pair{String, Str
     return colmeta
 end
 
+"""
+    set_encode_workers!(n::Int)
+
+Set the number of threads in the global Parquet encode worker pool.
+
+Must be called before creating any `DataFileWriter`; has no effect once the pool
+has been initialized (i.e. after the first writer is created). Defaults to
+`Sys.CPU_THREADS` if not set.
+"""
+function set_encode_workers!(n::Int)
+    n > 0 || throw(ArgumentError("n must be positive, got $n"))
+    @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Cvoid
+    return nothing
+end
+
 """
     DataFileWriter(table::Table, config::WriterConfig) -> DataFileWriter
     DataFileWriter(table::Table; prefix="data", target_file_size_bytes=512MB, compression=UNCOMPRESSED) -> DataFileWriter
@@ -157,13 +198,22 @@ free_writer!(writer)
 """
 function DataFileWriter(table::Table, config::WriterConfig)
     response = DataFileWriterResponse()
-
-    async_ccall(response, config.prefix) do handle
+    parquet_props = Ref(ParquetWriterPropertiesFFI(
+        Int64(config.max_row_group_size),
+        Int64(config.data_page_size),
+        Int64(config.write_batch_size),
+        Int32(config.compression),
+        config.dictionary_enabled,
+        config.plain_encoding,
+        config.statistics_enabled,
+    ))
+
+    async_ccall(response, config.prefix, parquet_props) do handle
         @ccall rust_lib.iceberg_writer_new(
             table::Table,
             config.prefix::Cstring,
             config.target_file_size_bytes::Int64,
-            Int32(config.compression)::Int32,
+            parquet_props::Ref{ParquetWriterPropertiesFFI},
             response::Ref{DataFileWriterResponse},
             handle::Ptr{Cvoid}
         )::Cint
@@ -446,6 +496,47 @@ end
 # Column-based writing (zero-copy from Julia)
 # ==========================================================================================
 
+"""
+    SliceRef
+
+FFI reference to a single slice of source column data for the scattered-gather writer.
+
+- `data_ptr`: pointer to source data array (T[]) or string pointers (Ptr{UInt8}[])
+- `lengths_ptr`: for string columns, pointer to lengths array; null for other types
+- `validity_ptr`: pointer to validity bitmap (BitVector.chunks); null if all rows valid
+- `sel_ptr`: pointer to selection index array (1-based Julia indices); null for sequential access
+- `len`: number of rows in this slice
+
+All fields are 8 bytes — total struct size is 40 bytes with no padding.
+"""
+struct SliceRef
+    data_ptr::Ptr{Cvoid}
+    lengths_ptr::Ptr{Int64}
+    validity_ptr::Ptr{UInt8}
+    sel_ptr::Ptr{Int64}
+    len::Csize_t
+end
+
+"""
+    GatheredColumnDescriptor
+
+FFI descriptor for a column to be gathered from multiple SliceRefs.
+Pass an array of these to `write_scattered_columns`.
+
+- `slices_ptr`: pointer to array of SliceRef structs
+- `num_slices`: number of SliceRef entries
+- `total_rows`: sum of all slice lengths
+- `column_type`: ColumnType enum value
+- `is_nullable`: whether the column may contain null values
+"""
+struct GatheredColumnDescriptor
+    slices_ptr::Ptr{SliceRef}
+    num_slices::Csize_t
+    total_rows::Csize_t
+    column_type::Int32
+    is_nullable::Bool
+end
+
 """
     ColumnType
 
@@ -632,6 +723,47 @@ function Base.push!(
     return batch
 end
 
+"""
+    push!(batch::ColumnBatch, data::Vector{String}, str_ptrs::Vector{Ptr{UInt8}}, str_lens::Vector{Int64}; validity=nothing, length=nothing, column_type=nothing)
+
+Add a string column to the batch using pre-allocated pointer/length buffers.
+The caller is responsible for filling `str_ptrs` and `str_lens` before calling this.
+Avoids allocating new pointer/length arrays on every write.
+"""
+function Base.push!(
+    batch::ColumnBatch,
+    data::Vector{String},
+    str_ptrs::Vector{Ptr{UInt8}},
+    str_lens::Vector{Int64};
+    validity::Union{Nothing, BitVector}=nothing,
+    length::Union{Nothing, Int}=nothing,
+    column_type::Union{Nothing, ColumnType}=nothing,
+)
+    num_rows = length === nothing ? Base.length(str_ptrs) : length
+    is_nullable = validity !== nothing
+    col_type = column_type === nothing ? COLUMN_TYPE_STRING : column_type
+
+    push!(batch.arrays_to_preserve, data, str_ptrs, str_lens)
+
+    validity_ptr = if is_nullable
+        push!(batch.arrays_to_preserve, validity)
+        Ptr{UInt8}(pointer(validity.chunks))
+    else
+        Ptr{UInt8}(C_NULL)
+    end
+
+    desc = ColumnDescriptor(
+        Ptr{Cvoid}(pointer(str_ptrs)),
+        pointer(str_lens),
+        validity_ptr,
+        Csize_t(num_rows),
+        Int32(col_type),
+        is_nullable
+    )
+    push!(batch.descriptors, desc)
+    return batch
+end
+
 """
     push!(batch::ColumnBatch, data::Vector{T}; validity=nothing, length=nothing, column_type=nothing) where T
 
@@ -767,3 +899,225 @@ write_columns(writer, batch)
 function write_columns(writer::DataFileWriter, batch::ColumnBatch)
     write_columns(writer, batch.descriptors, batch.arrays_to_preserve)
 end
+
+# ==========================================================================================
+# High-level gathered-column API
+# ==========================================================================================
+
+"""
+    GatheredColumn
+
+Accumulates one or more source slices for a single column. Rust gathers the data
+directly from source buffers when the batch is written, avoiding a Julia-side staging
+copy for numeric columns.
+
+Typical usage:
+
+```julia
+col = GatheredColumn(COLUMN_TYPE_INT64)
+add_slice!(col, src_array)                        # sequential: all rows
+add_slice!(col, src_array2; sel=sel_indices)      # scattered: rows at sel_indices
+add_slice!(col, src_array3; validity=valid_bv)    # nullable slice
+```
+
+String columns are not supported on the scattered path; use `ColumnBatch` instead.
+"""
+mutable struct GatheredColumn
+    slices::Vector{SliceRef}
+    total_rows::Int
+    column_type::ColumnType
+    is_nullable::Bool
+    preserve::Vector{Any}   # source arrays kept alive until write
+end
+
+GatheredColumn(column_type::ColumnType; nullable::Bool=false) =
+    GatheredColumn(SliceRef[], 0, column_type, nullable, Any[])
+
+"""
+    add_slice!(col::GatheredColumn, data::AbstractVector{T};
+               sel=nothing, validity=nothing)
+
+Append a slice of `data` to `col`.
+
+- `sel`: optional `Vector{Int64}` of 1-based row indices into `data` to select.
+  If omitted, all rows of `data` are used sequentially.
+- `validity`: optional `BitVector` (length = number of selected rows, `true` = valid).
+  Providing this marks the column as nullable.
+"""
+function add_slice!(
+    col::GatheredColumn,
+    data::AbstractVector{T};
+    sel::Union{Nothing, Vector{Int64}} = nothing,
+    validity::Union{Nothing, BitVector} = nothing,
+) where T
+    len = sel === nothing ? length(data) : length(sel)
+
+    sel_ptr = if sel !== nothing
+        push!(col.preserve, sel)
+        pointer(sel)
+    else
+        Ptr{Int64}(C_NULL)
+    end
+
+    validity_ptr = if validity !== nothing
+        col.is_nullable = true
+        push!(col.preserve, validity)
+        Ptr{UInt8}(pointer(validity.chunks))
+    else
+        Ptr{UInt8}(C_NULL)
+    end
+
+    push!(col.preserve, data)
+    push!(col.slices, SliceRef(
+        Ptr{Cvoid}(pointer(data)),
+        Ptr{Int64}(C_NULL),   # lengths_ptr unused for non-string types
+        validity_ptr,
+        sel_ptr,
+        Csize_t(len),
+    ))
+    col.total_rows += len
+    return col
+end
+
+"""
+    add_string_slice!(col::GatheredColumn, str_ptrs, str_lens; validity=nothing)
+
+Append a string slice to `col`. `str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to
+UTF-8 string data and `str_lens` is a `Vector{Int64}` of corresponding byte lengths.
+The caller is responsible for keeping the pointed-to string bytes alive.
+"""
+function add_string_slice!(
+    col::GatheredColumn,
+    str_ptrs::Vector{Ptr{UInt8}},
+    str_lens::Vector{Int64};
+    validity::Union{Nothing, BitVector} = nothing,
+)
+    len = length(str_ptrs)
+
+    validity_ptr = if validity !== nothing
+        col.is_nullable = true
+        push!(col.preserve, validity)
+        Ptr{UInt8}(pointer(validity.chunks))
+    else
+        Ptr{UInt8}(C_NULL)
+    end
+
+    push!(col.preserve, str_ptrs, str_lens)
+    push!(col.slices, SliceRef(
+        Ptr{Cvoid}(pointer(str_ptrs)),
+        pointer(str_lens),
+        validity_ptr,
+        Ptr{Int64}(C_NULL),
+        Csize_t(len),
+    ))
+    col.total_rows += len
+    return col
+end
+
+"""
+    GatheredBatch
+
+Collects a `GatheredColumn` per output column, then writes all of them in one call.
+
+```julia
+batch = GatheredBatch()
+push!(batch, col_int64)
+push!(batch, col_float64)
+write_scattered_columns_sync(writer, batch)
+```
+
+You can also push a single-slice column inline without building a `GatheredColumn`
+explicitly:
+
+```julia
+batch = GatheredBatch()
+push!(batch, src_ints,   COLUMN_TYPE_INT64)
+push!(batch, src_floats, COLUMN_TYPE_FLOAT64; sel=indices, validity=valid_bv)
+write_scattered_columns_sync(writer, batch)
+```
+"""
+mutable struct GatheredBatch
+    columns::Vector{GatheredColumn}
+end
+
+GatheredBatch() = GatheredBatch(GatheredColumn[])
+
+"""
+    push!(batch::GatheredBatch, col::GatheredColumn)
+
+Append an already-built `GatheredColumn` to the batch.
+"""
+Base.push!(batch::GatheredBatch, col::GatheredColumn) = (push!(batch.columns, col); batch)
+
+"""
+    push!(batch::GatheredBatch, data::AbstractVector, column_type::ColumnType;
+          sel=nothing, validity=nothing, nullable=false)
+
+Convenience: create a single-slice `GatheredColumn` from `data` and append it.
+"""
+function Base.push!(
+    batch::GatheredBatch,
+    data::AbstractVector,
+    column_type::ColumnType;
+    sel::Union{Nothing, Vector{Int64}} = nothing,
+    validity::Union{Nothing, BitVector} = nothing,
+    nullable::Bool = validity !== nothing,
+)
+    col = GatheredColumn(column_type; nullable)
+    add_slice!(col, data; sel, validity)
+    push!(batch.columns, col)
+    return batch
+end
+
+"""
+    write_scattered_columns_sync(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve])
+
+Gather column data from Julia memory synchronously, then encode asynchronously.
+
+Gathers all column data from Julia memory in the calling thread using a plain blocking
+`ccall`. Encode runs asynchronously in the global worker pool.
+
+`extra_preserve` (optional) is an additional collection of objects whose memory must
+stay alive during the gather (e.g. source string arrays for zero-copy string columns).
+
+The source data pointed to by the `GatheredBatch` slices and `extra_preserve` must be
+valid for the duration of this call. After the call returns, all Julia pointers have
+been consumed and the source data may be safely released.
+"""
+function write_scattered_columns_sync(
+    writer::DataFileWriter,
+    batch::GatheredBatch,
+    extra_preserve = nothing,
+)
+    isempty(batch.columns) && throw(IcebergException("GatheredBatch has no columns"))
+    writer.ptr == C_NULL && throw(IcebergException("Writer has been freed"))
+
+    all_slice_arrays = Vector{Vector{SliceRef}}(undef, length(batch.columns))
+    descriptors = Vector{GatheredColumnDescriptor}(undef, length(batch.columns))
+    preserve = Any[]
+
+    for (i, col) in enumerate(batch.columns)
+        slices = col.slices
+        all_slice_arrays[i] = slices
+        append!(preserve, col.preserve)
+        push!(preserve, slices)
+        descriptors[i] = GatheredColumnDescriptor(
+            pointer(slices),
+            Csize_t(length(slices)),
+            Csize_t(col.total_rows),
+            Int32(col.column_type),
+            col.is_nullable,
+        )
+    end
+    extra_preserve !== nothing && append!(preserve, extra_preserve)
+
+    ret = GC.@preserve preserve all_slice_arrays descriptors begin
+        @ccall rust_lib.iceberg_writer_write_scattered_columns_sync(
+            writer.ptr::Ptr{Cvoid},
+            pointer(descriptors)::Ptr{GatheredColumnDescriptor},
+            length(descriptors)::Csize_t,
+        )::Int32
+    end
+    ret == 0 || throw(IcebergException("write_scattered_columns_sync: gather failed (see writer close for details)"))
+    return nothing
+end

From 26176147ed034993d540d80d415491b2eb02d871 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 16:57:46 +0200
Subject: [PATCH 05/25] Add tests

---
 iceberg_rust_ffi/src/writer.rs         | 188 +++++++++++---
 iceberg_rust_ffi/src/writer_columns.rs | 326 ++++---------------------
 src/writer.jl                          |  23 +-
 test/writer_tests.jl                   | 155 ++++++++++++
 4 files changed, 367 insertions(+), 325 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index bfe95b0..38aae30 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -7,9 +7,9 @@
 /// in order. Drain tasks are lightweight async forwarders (no CPU work).
 use std::ffi::{c_char, c_void};
 use std::io::Cursor;
+use std::panic::{catch_unwind, AssertUnwindSafe};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, OnceLock};
-use std::panic::{catch_unwind, AssertUnwindSafe};
 use std::thread;
 
 use arrow_array::RecordBatch;
@@ -26,7 +26,7 @@ use iceberg::writer::file_writer::ParquetWriterBuilder;
 use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
 use parquet::basic::{Compression, Encoding};
 use parquet::file::properties::{EnabledStatistics, WriterProperties};
-use tokio::sync::{mpsc, oneshot};
+use tokio::sync::mpsc;
 use tokio::task::JoinHandle;
 
 /// Compression codec values (must match Julia's CompressionCodec enum)
@@ -96,7 +96,9 @@ use crate::response::IcebergBoxedResponse;
 use crate::table::IcebergTable;
 use crate::transaction::IcebergDataFiles;
 use crate::util::parse_c_string;
-use crate::writer_columns::{build_arrow_array_scattered, GatheredColumnDescriptor};
+use crate::writer_columns::{
+    build_arrow_array_scattered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef,
+};
 use object_store_ffi::{
     export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
 };
@@ -106,15 +108,11 @@ type ConcreteDataFileWriter =
     DataFileWriter<ParquetWriterBuilder, DefaultLocationGenerator, DefaultFileNameGenerator>;
 
 /// Batch item sent through the per-writer drain channel.
-/// `ack` is `Some` only for the zero-copy path (`write_columns`), where Arrow arrays
-/// point directly into Julia memory — the pool worker sends on it after encoding so
-/// Julia knows those buffers are safe to reuse.
-pub(crate) type BatchItem = (RecordBatch, Option<oneshot::Sender<()>>);
+pub(crate) type BatchItem = RecordBatch;
 
 /// Encode task submitted to the global worker pool.
 struct EncodeTask {
     batch: RecordBatch,
-    ack_opt: Option<oneshot::Sender<()>>,
     state: Arc<WriterState>,
 }
 
@@ -201,13 +199,9 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
 
                         let handle_enc = handle.clone();
                         let encode_result = catch_unwind(AssertUnwindSafe(move || {
-                            let t0 = std::time::Instant::now();
-                            // Use unwrap_or_else to recover from a poisoned mutex
-                            // (which happens when a previous encode panicked mid-write).
                             let result: anyhow::Result<()> = {
-                                let mut guard = task.state.writer
-                                    .lock()
-                                    .unwrap_or_else(|e| e.into_inner());
+                                let mut guard =
+                                    task.state.writer.lock().unwrap_or_else(|e| e.into_inner());
                                 match guard.as_mut() {
                                     Some(w) => handle_enc
                                         .block_on(w.write(task.batch))
@@ -215,12 +209,6 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
                                     None => Err(anyhow::anyhow!("writer already closed")),
                                 }
                             };
-                            // ACK inside the closure: if we panic before reaching this,
-                            // the Sender is dropped and ack_rx returns RecvError, which
-                            // write_columns propagates as an error to Julia — correct behavior.
-                            if let Some(ack) = task.ack_opt {
-                                let _ = ack.send(());
-                            }
                             result
                         }));
 
@@ -230,9 +218,7 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
                             Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")),
                         };
                         if let Some(e) = err {
-                            let mut slot = state.error
-                                .lock()
-                                .unwrap_or_else(|e| e.into_inner());
+                            let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner());
                             if slot.is_none() {
                                 *slot = Some(e);
                             }
@@ -316,7 +302,11 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync(
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
 
     if col_descs.len() != arrow_schema.fields().len() {
-        let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+        let mut slot = writer_ref
+            .writer_state
+            .error
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
         if slot.is_none() {
             *slot = Some(anyhow::anyhow!(
                 "Column count mismatch: got {} but schema has {}",
@@ -329,7 +319,6 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync(
 
     // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread).
     // After this block, all Julia pointers have been consumed — safe to release.
-    let t0 = std::time::Instant::now();
     let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
         let mut arrays = Vec::with_capacity(num_columns);
         for (i, desc) in col_descs.iter().enumerate() {
@@ -340,21 +329,153 @@ pub extern "C" fn iceberg_writer_write_scattered_columns_sync(
     })() {
         Ok(b) => b,
         Err(e) => {
-            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() { *slot = Some(e); }
+            let mut slot = writer_ref
+                .writer_state
+                .error
+                .lock()
+                .unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() {
+                *slot = Some(e);
+            }
             return -1;
         }
     };
 
     // Increment pending before submit so iceberg_writer_close always accounts for this batch.
-    writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel);
-    let task = EncodeTask { batch, ack_opt: None, state: writer_ref.writer_state.clone() };
+    writer_ref
+        .writer_state
+        .pending
+        .fetch_add(1, Ordering::AcqRel);
+    let task = EncodeTask {
+        batch,
+        state: writer_ref.writer_state.clone(),
+    };
 
     match pool.task_tx.blocking_send(task) {
         Ok(()) => 0,
         Err(_) => {
             // Pool channel closed — undo the pending increment.
-            let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel);
+            let prev = writer_ref
+                .writer_state
+                .pending
+                .fetch_sub(1, Ordering::AcqRel);
+            if prev == 1 {
+                writer_ref.writer_state.done_notify.notify_one();
+            }
+            eprintln!("[iceberg:sync] pool channel closed");
+            -1
+        }
+    }
+}
+
+/// Synchronous write of flat column data: copies each column from Julia memory into
+/// Rust-owned Arrow arrays in the calling thread, then submits to the global encode
+/// pool asynchronously.
+///
+/// Each `ColumnDescriptor` is treated as a single sequential slice (no scatter/gather).
+/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
+#[no_mangle]
+pub extern "C" fn iceberg_writer_write_columns_sync(
+    writer: *mut IcebergDataFileWriter,
+    columns: *const ColumnDescriptor,
+    num_columns: usize,
+) -> i32 {
+    if writer.is_null() || columns.is_null() || num_columns == 0 {
+        return -1;
+    }
+
+    let writer_ref = unsafe { &*writer };
+
+    let pool = match GLOBAL_ENCODE_POOL.get() {
+        Some(p) => p,
+        None => {
+            eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first");
+            return -1;
+        }
+    };
+
+    let arrow_schema = writer_ref.arrow_schema.clone();
+    let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
+
+    if col_descs.len() != arrow_schema.fields().len() {
+        let mut slot = writer_ref
+            .writer_state
+            .error
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
+        if slot.is_none() {
+            *slot = Some(anyhow::anyhow!(
+                "Column count mismatch: got {} but schema has {}",
+                col_descs.len(),
+                arrow_schema.fields().len()
+            ));
+        }
+        return -1;
+    }
+
+    // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor.
+    // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order.
+    let slices: Vec<SliceRef> = col_descs
+        .iter()
+        .map(|d| SliceRef {
+            data_ptr: d.data_ptr,
+            lengths_ptr: d.lengths_ptr,
+            validity_ptr: d.validity_ptr,
+            sel_ptr: std::ptr::null(),
+            len: d.num_rows,
+        })
+        .collect();
+
+    let gathered: Vec<GatheredColumnDescriptor> = col_descs
+        .iter()
+        .zip(slices.iter())
+        .map(|(d, s)| GatheredColumnDescriptor {
+            slices: s as *const SliceRef,
+            num_slices: 1,
+            total_rows: d.num_rows,
+            column_type: d.column_type,
+            is_nullable: d.is_nullable,
+        })
+        .collect();
+
+    let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
+        let mut arrays = Vec::with_capacity(num_columns);
+        for (i, desc) in gathered.iter().enumerate() {
+            arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? });
+        }
+        RecordBatch::try_new(arrow_schema, arrays)
+            .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
+    })() {
+        Ok(b) => b,
+        Err(e) => {
+            let mut slot = writer_ref
+                .writer_state
+                .error
+                .lock()
+                .unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() {
+                *slot = Some(e);
+            }
+            return -1;
+        }
+    };
+
+    writer_ref
+        .writer_state
+        .pending
+        .fetch_add(1, Ordering::AcqRel);
+    let task = EncodeTask {
+        batch,
+        state: writer_ref.writer_state.clone(),
+    };
+
+    match pool.task_tx.blocking_send(task) {
+        Ok(()) => 0,
+        Err(_) => {
+            let prev = writer_ref
+                .writer_state
+                .pending
+                .fetch_sub(1, Ordering::AcqRel);
             if prev == 1 {
                 writer_ref.writer_state.done_notify.notify_one();
             }
@@ -473,9 +594,9 @@ export_runtime_op!(
             let state = writer_state.clone();
             let pool_tx = pool.task_tx.clone();
             tokio::task::spawn(async move {
-                while let Some((batch, ack_opt)) = batch_rx.recv().await {
+                while let Some(batch) = batch_rx.recv().await {
                     state.pending.fetch_add(1, Ordering::AcqRel);
-                    let task = EncodeTask { batch, ack_opt, state: state.clone() };
+                    let task = EncodeTask { batch, state: state.clone() };
                     if pool_tx.send(task).await.is_err() {
                         eprintln!("[iceberg:drain] pool channel closed");
                         break;
@@ -540,8 +661,7 @@ export_runtime_op!(
         while let Some(batch_result) = reader.next() {
             let batch = batch_result
                 .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?;
-            // IPC data already copied into ipc_bytes above — no ACK needed.
-            tx.send((batch, None)).await
+            tx.send(batch).await
                 .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?;
         }
 
diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index 0f61849..6bf4c67 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -4,7 +4,6 @@
 /// avoiding the overhead of Arrow IPC serialization. Julia passes raw column pointers
 /// and metadata, and Rust builds Arrow arrays directly from them.
 use std::ffi::c_void;
-use std::ptr::NonNull;
 use std::sync::Arc;
 
 use arrow_array::{
@@ -12,31 +11,9 @@ use arrow_array::{
         Date32Type, Decimal128Type, Float32Type, Float64Type, Int32Type, Int64Type,
         TimestampMicrosecondType,
     },
-    ArrayRef, BooleanArray, PrimitiveArray, RecordBatch, StringArray,
-};
-use arrow_buffer::{ArrowNativeType, BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
-
-/// No-op allocator: Arrow buffers created from Julia memory must not be freed by Rust.
-/// Julia owns the memory until the per-batch oneshot ACK is received (write_columns path).
-struct NoopAllocation;
-
-/// Build a zero-copy `ScalarBuffer<T>` pointing directly into Julia-owned memory.
-/// Safety: `ptr` must be non-null, valid, and alive until the buffer is dropped.
-unsafe fn zero_copy_scalar_buffer<T: ArrowNativeType>(
-    ptr: *const T,
-    num_rows: usize,
-) -> ScalarBuffer<T> {
-    let byte_len = num_rows * std::mem::size_of::<T>();
-    let nn_ptr = NonNull::new_unchecked(ptr as *mut u8);
-    let buf = Buffer::from_custom_allocation(nn_ptr, byte_len, Arc::new(NoopAllocation));
-    ScalarBuffer::new(buf, 0, num_rows)
-}
-use arrow_schema::DataType;
-
-use crate::writer::IcebergDataFileWriter;
-use object_store_ffi::{
-    export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
+    ArrayRef, BooleanArray, PrimitiveArray, StringArray,
 };
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
 
 /// Column type codes (must match Julia's ColumnType enum)
 pub const COLUMN_TYPE_INT32: i32 = 0;
@@ -81,168 +58,6 @@ pub struct ColumnDescriptor {
 unsafe impl Send for ColumnDescriptor {}
 unsafe impl Sync for ColumnDescriptor {}
 
-/// Build an Arrow array from a ColumnDescriptor and its corresponding schema field.
-/// The schema field is used to extract precision/scale for decimal columns.
-unsafe fn build_arrow_array(
-    desc: &ColumnDescriptor,
-    schema_field: &arrow_schema::Field,
-) -> Result<ArrayRef, anyhow::Error> {
-    let null_buffer = if desc.is_nullable && !desc.validity_ptr.is_null() {
-        // Julia's BitVector uses the same little-endian bit-packed layout as Arrow.
-        // Point directly into Julia's buffer — no copy needed.
-        let num_bytes = (desc.num_rows + 7) / 8;
-        let nn_ptr = NonNull::new_unchecked(desc.validity_ptr as *mut u8);
-        let buf = Buffer::from_custom_allocation(nn_ptr, num_bytes, Arc::new(NoopAllocation));
-        Some(NullBuffer::new(BooleanBuffer::new(buf, 0, desc.num_rows)))
-    } else {
-        None
-    };
-
-    let array: ArrayRef = match desc.column_type {
-        COLUMN_TYPE_INT32 => {
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows);
-            Arc::new(PrimitiveArray::<Int32Type>::new(buffer, null_buffer))
-        }
-        COLUMN_TYPE_INT64 => {
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
-            Arc::new(PrimitiveArray::<Int64Type>::new(buffer, null_buffer))
-        }
-        COLUMN_TYPE_FLOAT32 => {
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f32, desc.num_rows);
-            Arc::new(PrimitiveArray::<Float32Type>::new(buffer, null_buffer))
-        }
-        COLUMN_TYPE_FLOAT64 => {
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const f64, desc.num_rows);
-            Arc::new(PrimitiveArray::<Float64Type>::new(buffer, null_buffer))
-        }
-        COLUMN_TYPE_DATE => {
-            // Date is stored as Int32 (days since epoch)
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i32, desc.num_rows);
-            Arc::new(PrimitiveArray::<Date32Type>::new(buffer, null_buffer))
-        }
-        COLUMN_TYPE_TIMESTAMP => {
-            // Timestamp without timezone (Iceberg `timestamp`)
-            // Stored as Int64 microseconds since epoch
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
-            Arc::new(PrimitiveArray::<TimestampMicrosecondType>::new(
-                buffer,
-                null_buffer,
-            ))
-        }
-        COLUMN_TYPE_TIMESTAMPTZ => {
-            // Timestamp with UTC timezone (Iceberg `timestamptz`)
-            // Stored as Int64 microseconds since epoch, with timezone metadata
-            let buffer = zero_copy_scalar_buffer(desc.data_ptr as *const i64, desc.num_rows);
-            Arc::new(
-                PrimitiveArray::<TimestampMicrosecondType>::new(buffer, null_buffer)
-                    .with_timezone("UTC"),
-            )
-        }
-        COLUMN_TYPE_BOOLEAN => {
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const u8, desc.num_rows);
-            // Convert bytes to boolean buffer
-            let mut bits = vec![0u8; (desc.num_rows + 7) / 8];
-            for (i, &val) in data.iter().enumerate() {
-                if val != 0 {
-                    bits[i / 8] |= 1 << (i % 8);
-                }
-            }
-            let values = BooleanBuffer::new(Buffer::from(bits), 0, desc.num_rows);
-            Arc::new(BooleanArray::new(values, null_buffer))
-        }
-        COLUMN_TYPE_STRING => {
-            // String data passed from Julia:
-            // - data_ptr: pointer to array of string pointers (each pointing to UTF-8 bytes)
-            // - lengths_ptr: pointer to array of string lengths (Int64)
-            // Note: While we avoid copying on the Julia side (just passing pointers),
-            // Arrow's StringArray copies the data into its own contiguous buffer below.
-            if desc.lengths_ptr.is_null() {
-                return Err(anyhow::anyhow!("String column requires lengths"));
-            }
-            let str_ptrs =
-                std::slice::from_raw_parts(desc.data_ptr as *const *const u8, desc.num_rows);
-            let str_lens = std::slice::from_raw_parts(desc.lengths_ptr, desc.num_rows);
-
-            // Build string references, then Arrow copies them into its internal buffer
-            let mut strings: Vec<Option<&str>> = Vec::with_capacity(desc.num_rows);
-            for i in 0..desc.num_rows {
-                let is_null: bool = if let Some(ref nb) = null_buffer {
-                    nb.is_null(i)
-                } else {
-                    false
-                };
-                if is_null {
-                    strings.push(None);
-                } else {
-                    let ptr = str_ptrs[i];
-                    let len = str_lens[i] as usize;
-                    let bytes = std::slice::from_raw_parts(ptr, len);
-                    let s = std::str::from_utf8(bytes)
-                        .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in string column: {}", e))?;
-                    strings.push(Some(s));
-                }
-            }
-            Arc::new(StringArray::from(strings))
-        }
-        COLUMN_TYPE_UUID => {
-            // UUID is stored as 16 bytes (UInt128 in Julia)
-            // Store as fixed-size binary (16 bytes per value)
-            let data = std::slice::from_raw_parts(desc.data_ptr as *const u8, desc.num_rows * 16);
-
-            // Build the array using the builder or from_iter_values
-            let values: Vec<&[u8]> = data.chunks(16).collect();
-            Arc::new(
-                arrow_array::FixedSizeBinaryArray::try_from_iter(values.into_iter())
-                    .map_err(|e| anyhow::anyhow!("Failed to create UUID array: {}", e))?,
-            )
-        }
-        COLUMN_TYPE_DECIMAL_INT32 | COLUMN_TYPE_DECIMAL_INT64 | COLUMN_TYPE_DECIMAL_INT128 => {
-            // All decimal variants map to Arrow Decimal128. Precision and scale come from
-            // the schema field (set when the Iceberg table was created).
-            let (precision, scale) = match schema_field.data_type() {
-                DataType::Decimal128(p, s) => (*p, *s),
-                dt => {
-                    return Err(anyhow::anyhow!(
-                        "Expected Decimal128 schema field for decimal column, got {:?}",
-                        dt
-                    ))
-                }
-            };
-
-            // All decimal variants produce an Arrow Decimal128 (i128 backing).
-            // INT32/INT64 variants need widening — materialize to avoid a cast loop
-            // in the zero-copy path; INT128 shares the same layout and is zero-copy.
-            let buffer: ScalarBuffer<i128> = match desc.column_type {
-                COLUMN_TYPE_DECIMAL_INT32 => {
-                    let data =
-                        std::slice::from_raw_parts(desc.data_ptr as *const i32, desc.num_rows);
-                    ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::<Vec<_>>())
-                }
-                COLUMN_TYPE_DECIMAL_INT64 => {
-                    let data =
-                        std::slice::from_raw_parts(desc.data_ptr as *const i64, desc.num_rows);
-                    ScalarBuffer::from(data.iter().map(|&v| v as i128).collect::<Vec<_>>())
-                }
-                _ => {
-                    // COLUMN_TYPE_DECIMAL_INT128: Julia Int128 and Rust i128 share the same
-                    // 16-byte little-endian layout on all supported platforms — zero-copy.
-                    zero_copy_scalar_buffer(desc.data_ptr as *const i128, desc.num_rows)
-                }
-            };
-            Arc::new(
-                PrimitiveArray::<Decimal128Type>::new(buffer, null_buffer)
-                    .with_precision_and_scale(precision, scale)
-                    .map_err(|e| anyhow::anyhow!("Failed to set decimal precision/scale: {}", e))?,
-            )
-        }
-        _ => {
-            return Err(anyhow::anyhow!("Unknown column type: {}", desc.column_type));
-        }
-    };
-
-    Ok(array)
-}
-
 // =============================================================================
 // Scattered-gather writer: pass raw source pointers + selection indices to Rust,
 // which gathers the data directly into Arrow arrays — eliminating the Julia-side
@@ -286,7 +101,10 @@ unsafe impl Sync for GatheredColumnDescriptor {}
 
 /// Build the Arrow null buffer by scanning validity bits across all slices.
 /// Returns `None` if every slice has a null `validity_ptr` (all rows valid).
-unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) -> Option<NullBuffer> {
+unsafe fn build_null_buffer_scattered(
+    slices: &[SliceRef],
+    total_rows: usize,
+) -> Option<NullBuffer> {
     if !slices.iter().any(|s| !s.validity_ptr.is_null()) {
         return None;
     }
@@ -315,7 +133,11 @@ unsafe fn build_null_buffer_scattered(slices: &[SliceRef], total_rows: usize) ->
         }
         out += slice.len;
     }
-    Some(NullBuffer::new(BooleanBuffer::new(Buffer::from(bits), 0, total_rows)))
+    Some(NullBuffer::new(BooleanBuffer::new(
+        Buffer::from(bits),
+        0,
+        total_rows,
+    )))
 }
 
 /// Gather all slices for a column into an Arrow array.
@@ -346,8 +168,10 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                     }
                 }
             }
-            Arc::new(PrimitiveArray::<$ArrowType>::new(ScalarBuffer::from(values), null_buf))
-                as ArrayRef
+            Arc::new(PrimitiveArray::<$ArrowType>::new(
+                ScalarBuffer::from(values),
+                null_buf,
+            )) as ArrayRef
         }};
     }
 
@@ -387,8 +211,11 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                 }
             }
             Arc::new(
-                PrimitiveArray::<TimestampMicrosecondType>::new(ScalarBuffer::from(values), null_buf)
-                    .with_timezone("UTC"),
+                PrimitiveArray::<TimestampMicrosecondType>::new(
+                    ScalarBuffer::from(values),
+                    null_buf,
+                )
+                .with_timezone("UTC"),
             )
         }
         COLUMN_TYPE_BOOLEAN => {
@@ -399,10 +226,15 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                 if slice.sel_ptr.is_null() {
                     let data = std::slice::from_raw_parts(src, slice.len);
                     for (i, &v) in data.iter().enumerate() {
-                        if v != 0 { bits[(out + i) / 8] |= 1 << ((out + i) % 8); }
+                        if v != 0 {
+                            bits[(out + i) / 8] |= 1 << ((out + i) % 8);
+                        }
                     }
                 } else {
-                    for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len).iter().enumerate() {
+                    for (i, &idx) in std::slice::from_raw_parts(slice.sel_ptr, slice.len)
+                        .iter()
+                        .enumerate()
+                    {
                         if *src.add((idx - 1) as usize) != 0 {
                             bits[(out + i) / 8] |= 1 << ((out + i) % 8);
                         }
@@ -410,7 +242,10 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                 }
                 out += slice.len;
             }
-            Arc::new(BooleanArray::new(BooleanBuffer::new(Buffer::from(bits), 0, total), null_buf))
+            Arc::new(BooleanArray::new(
+                BooleanBuffer::new(Buffer::from(bits), 0, total),
+                null_buf,
+            ))
         }
         COLUMN_TYPE_STRING => {
             // Strings are always pre-staged in Julia; data_ptr = *const *const u8,
@@ -420,7 +255,8 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                 if slice.lengths_ptr.is_null() {
                     return Err(anyhow::anyhow!("String column requires lengths_ptr"));
                 }
-                let ptrs = std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len);
+                let ptrs =
+                    std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len);
                 let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len);
                 for i in 0..slice.len {
                     let is_null = if !slice.validity_ptr.is_null() {
@@ -431,8 +267,11 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                     if is_null {
                         all_strings.push(None);
                     } else {
-                        let s = std::str::from_utf8(std::slice::from_raw_parts(ptrs[i], lens[i] as usize))
-                            .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?;
+                        let s = std::str::from_utf8(std::slice::from_raw_parts(
+                            ptrs[i],
+                            lens[i] as usize,
+                        ))
+                        .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?;
                         all_strings.push(Some(s));
                     }
                 }
@@ -447,7 +286,10 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                     data.extend_from_slice(std::slice::from_raw_parts(src, slice.len * 16));
                 } else {
                     for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
-                        data.extend_from_slice(std::slice::from_raw_parts(src.add((idx - 1) as usize * 16), 16));
+                        data.extend_from_slice(std::slice::from_raw_parts(
+                            src.add((idx - 1) as usize * 16),
+                            16,
+                        ));
                     }
                 }
             }
@@ -468,7 +310,11 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                     COLUMN_TYPE_DECIMAL_INT32 => {
                         let src = slice.data_ptr as *const i32;
                         if slice.sel_ptr.is_null() {
-                            values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128));
+                            values.extend(
+                                std::slice::from_raw_parts(src, slice.len)
+                                    .iter()
+                                    .map(|&v| v as i128),
+                            );
                         } else {
                             for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
                                 values.push(*src.add((idx - 1) as usize) as i128);
@@ -478,7 +324,11 @@ pub(crate) unsafe fn build_arrow_array_scattered(
                     COLUMN_TYPE_DECIMAL_INT64 => {
                         let src = slice.data_ptr as *const i64;
                         if slice.sel_ptr.is_null() {
-                            values.extend(std::slice::from_raw_parts(src, slice.len).iter().map(|&v| v as i128));
+                            values.extend(
+                                std::slice::from_raw_parts(src, slice.len)
+                                    .iter()
+                                    .map(|&v| v as i128),
+                            );
                         } else {
                             for &idx in std::slice::from_raw_parts(slice.sel_ptr, slice.len) {
                                 values.push(*src.add((idx - 1) as usize) as i128);
@@ -508,75 +358,3 @@ pub(crate) unsafe fn build_arrow_array_scattered(
     };
     Ok(array)
 }
-
-// Write columns directly to the Parquet writer.
-// Accepts an array of ColumnDescriptors and builds a RecordBatch from them,
-// then writes to the underlying Parquet writer.
-// The caller must ensure all pointers are valid and point to appropriately sized data.
-export_runtime_op!(
-    iceberg_writer_write_columns,
-    crate::IcebergResponse,
-    || {
-        if writer.is_null() {
-            return Err(anyhow::anyhow!("Null writer pointer provided"));
-        }
-        if columns.is_null() || num_columns == 0 {
-            return Err(anyhow::anyhow!("No columns provided"));
-        }
-
-        // Copy column descriptors for safe use across await
-        let cols: Vec<ColumnDescriptor> = unsafe {
-            std::slice::from_raw_parts(columns, num_columns).to_vec()
-        };
-
-        let writer_ref = unsafe { &mut *writer };
-        Ok((writer_ref, cols))
-    },
-    result_tuple,
-    async {
-        let (writer_ref, cols) = result_tuple;
-
-        let arrow_schema = writer_ref.arrow_schema.clone();
-
-        // Validate column count matches schema
-        if cols.len() != arrow_schema.fields().len() {
-            return Err(anyhow::anyhow!(
-                "Column count mismatch: got {} columns but schema has {} fields",
-                cols.len(),
-                arrow_schema.fields().len()
-            ));
-        }
-
-        // Build Arrow arrays from column descriptors.
-        // These arrays are zero-copy: they hold raw pointers into Julia-owned buffers.
-        let mut arrays: Vec<ArrayRef> = Vec::with_capacity(cols.len());
-        for (i, desc) in cols.iter().enumerate() {
-            let schema_field = arrow_schema.field(i);
-            let array = unsafe { build_arrow_array(desc, schema_field)? };
-            arrays.push(array);
-        }
-
-        let batch = RecordBatch::try_new(arrow_schema, arrays)
-            .map_err(|e| anyhow::anyhow!("Failed to create RecordBatch: {}", e))?;
-
-        // Arrow arrays are zero-copy (pointing into Julia buffers) — we must not return
-        // until Rust has released them. Use a per-batch oneshot: the drain task signals
-        // after encoding, at which point the Arrow arrays have been dropped.
-        let (ack_tx, ack_rx) = tokio::sync::oneshot::channel::<()>();
-        writer_ref
-            .batch_tx
-            .as_ref()
-            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?
-            .send((batch, Some(ack_tx)))
-            .await
-            .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?;
-        ack_rx
-            .await
-            .map_err(|_| anyhow::anyhow!("Ack channel closed (drain task may have failed)"))?;
-
-        Ok::<(), anyhow::Error>(())
-    },
-    writer: *mut IcebergDataFileWriter,
-    columns: *const ColumnDescriptor,
-    num_columns: usize
-);
diff --git a/src/writer.jl b/src/writer.jl
index 6abab6e..4e41776 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -844,29 +844,18 @@ write_columns(writer, [desc], (data, validity))  # Arrays preserved during call
 ```
 """
 function write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor}, arrays_to_preserve)
-    if writer.ptr == C_NULL
-        throw(IcebergException("Writer has been freed"))
-    end
-
-    if isempty(columns)
-        throw(IcebergException("No columns provided"))
-    end
-
-    response = Response{Cvoid}(-1, nothing, C_NULL, C_NULL)
+    writer.ptr == C_NULL && throw(IcebergException("Writer has been freed"))
+    isempty(columns) && throw(IcebergException("No columns provided"))
 
-    # Pass arrays_to_preserve to async_ccall so GC.@preserve keeps them alive
-    async_ccall(response, columns, arrays_to_preserve) do handle
-        @ccall rust_lib.iceberg_writer_write_columns(
+    ret = GC.@preserve columns arrays_to_preserve begin
+        @ccall rust_lib.iceberg_writer_write_columns_sync(
             writer.ptr::Ptr{Cvoid},
             pointer(columns)::Ptr{ColumnDescriptor},
             length(columns)::Csize_t,
-            response::Ref{Response{Cvoid}},
-            handle::Ptr{Cvoid}
-        )::Cint
+        )::Int32
     end
 
-    @throw_on_error(response, "write_columns", IcebergException)
-
+    ret == 0 || throw(IcebergException("write_columns failed (see writer close for details)"))
     return nothing
 end
 
diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index f8b1bef..b8ad8a3 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1126,3 +1126,158 @@ end
 
     println("\n✅ write_columns decimal nullable tests completed!")
 end
+
+@testset "Writer write_scattered_columns_sync API" begin
+    println("Testing write_scattered_columns_sync (gathered-column) API...")
+
+    catalog_uri = get_catalog_uri()
+    props = get_catalog_properties()
+
+    catalog = nothing
+    table = C_NULL
+    data_files = nothing
+    test_namespace = nothing
+    table_name = nothing
+
+    try
+        catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props)
+        @test catalog !== nothing
+
+        test_namespace = ["test_gathered_$(round(Int, time() * 1000))"]
+        RustyIceberg.create_namespace(catalog, test_namespace)
+
+        # Schema: id (non-nullable long), score (nullable double), tag (nullable string)
+        schema = Schema([
+            Field(Int32(1), "id",    IcebergLong();   required=true),
+            Field(Int32(2), "score", IcebergDouble(); required=false),
+            Field(Int32(3), "tag",   IcebergString(); required=false),
+        ])
+
+        table_name = "gathered_test_$(round(Int, time() * 1000))"
+        table = RustyIceberg.create_table(catalog, test_namespace, table_name, schema)
+        @test table != C_NULL
+        println("✅ Table created")
+
+        # --- Data layout (4 rows) ---
+        # id:    [1, 2, 3, 4]  — non-nullable, single sequential slice
+        # score: [1.1, null, 3.3, null]  — nullable; two sequential slices, each with validity
+        # tag:   ["alpha", null, "gamma", null]  — nullable string via add_string_slice!
+
+        data_files = RustyIceberg.with_data_file_writer(table) do writer
+            batch = RustyIceberg.GatheredBatch()
+
+            # id: single sequential slice, no nulls
+            id_data = Int64[1, 2, 3, 4]
+            id_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_INT64)
+            RustyIceberg.add_slice!(id_col, id_data)
+            push!(batch, id_col)
+            println("✅ id column built (sequential, non-nullable)")
+
+            # score: two sequential slices, each with validity masks
+            # Slice 1 — src = [1.1, 9.9], validity = [true, false] → rows 0 (1.1) and 1 (null)
+            # Slice 2 — src = [3.3, 8.8] via selection [1] + identity [8.8] (just sequential here)
+            # Use scattered access for slice 2: src=[99.9, 3.3, 88.8], sel=[2] → picks 3.3
+            score_src1 = Float64[1.1, 9.9]
+            score_valid1 = BitVector([true, false])
+
+            score_src2 = Float64[99.9, 3.3, 88.8]
+            score_sel2 = Int64[2]       # picks index 2 → 3.3 (1-based)
+            score_valid2 = BitVector([true])
+
+            score_src3 = Float64[7.7, 8.8]
+            score_valid3 = BitVector([false])  # null for row 3
+
+            score_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_FLOAT64; nullable=true)
+            RustyIceberg.add_slice!(score_col, score_src1; validity=score_valid1)
+            RustyIceberg.add_slice!(score_col, score_src2; sel=score_sel2, validity=score_valid2)
+            RustyIceberg.add_slice!(score_col, score_src3; sel=Int64[1], validity=score_valid3)
+            push!(batch, score_col)
+            println("✅ score column built (scattered + nullable)")
+
+            # tag: string column via add_string_slice!
+            # Row 0: "alpha", row 1: null, row 2: "gamma", row 3: null
+            tag_strings = ["alpha", "", "gamma", ""]  # empty at null positions
+            tag_ptrs = Vector{Ptr{UInt8}}(undef, 4)
+            tag_lens = Vector{Int64}(undef, 4)
+            tag_valid = BitVector([true, false, true, false])
+            for i in 1:4
+                if tag_valid[i]
+                    tag_ptrs[i] = pointer(tag_strings[i])
+                    tag_lens[i] = ncodeunits(tag_strings[i])
+                else
+                    tag_ptrs[i] = Ptr{UInt8}(C_NULL)
+                    tag_lens[i] = 0
+                end
+            end
+
+            tag_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_STRING; nullable=true)
+            RustyIceberg.add_string_slice!(tag_col, tag_ptrs, tag_lens; validity=tag_valid)
+            # Preserve source strings so pointers stay valid until write completes
+            push!(tag_col.preserve, tag_strings)
+            push!(batch, tag_col)
+            println("✅ tag column built (string via add_string_slice!)")
+
+            RustyIceberg.write_scattered_columns_sync(writer, batch)
+            println("✅ Batch written via write_scattered_columns_sync")
+        end
+        @test data_files !== nothing && data_files.ptr != C_NULL
+        println("✅ Writer closed")
+
+        updated_table = RustyIceberg.with_transaction(table, catalog) do tx
+            RustyIceberg.with_fast_append(tx) do action
+                RustyIceberg.add_data_files(action, data_files)
+            end
+        end
+        @test updated_table != C_NULL
+        println("✅ Transaction committed")
+
+        tbl = read_table_data(updated_table)
+        @test tbl !== nothing
+        @test length(tbl.id) == 4
+        println("✅ Read $(length(tbl.id)) rows")
+
+        perm = sortperm(tbl.id)
+        sorted_ids   = tbl.id[perm]
+        sorted_scores = tbl.score[perm]
+        sorted_tags  = tbl.tag[perm]
+
+        # Verify id column (non-nullable, sequential)
+        @test sorted_ids == Int64[1, 2, 3, 4]
+        println("✅ id values correct")
+
+        # Verify score column (nullable, scattered slices)
+        @test !ismissing(sorted_scores[1]) && sorted_scores[1] ≈ 1.1
+        @test ismissing(sorted_scores[2])
+        @test !ismissing(sorted_scores[3]) && sorted_scores[3] ≈ 3.3
+        @test ismissing(sorted_scores[4])
+        println("✅ score values correct (including nulls)")
+
+        # Verify tag column (nullable string)
+        @test !ismissing(sorted_tags[1]) && sorted_tags[1] == "alpha"
+        @test ismissing(sorted_tags[2])
+        @test !ismissing(sorted_tags[3]) && sorted_tags[3] == "gamma"
+        @test ismissing(sorted_tags[4])
+        println("✅ tag values correct (including nulls)")
+
+        RustyIceberg.free_table(updated_table)
+
+    finally
+        if data_files !== nothing && data_files.ptr != C_NULL
+            RustyIceberg.free_data_files!(data_files)
+        end
+        if table != C_NULL
+            RustyIceberg.free_table(table)
+        end
+        if table_name !== nothing && test_namespace !== nothing && catalog !== nothing
+            RustyIceberg.drop_table(catalog, test_namespace, table_name)
+        end
+        if test_namespace !== nothing && catalog !== nothing
+            RustyIceberg.drop_namespace(catalog, test_namespace)
+        end
+        if catalog !== nothing
+            RustyIceberg.free_catalog!(catalog)
+        end
+    end
+
+    println("\n✅ write_scattered_columns_sync API tests completed!")
+end

From a2c016964ab11f36ad4fd92f8327b6a6415c2ff9 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 16:59:09 +0200
Subject: [PATCH 06/25] Bump version

---
 iceberg_rust_ffi/Cargo.lock | 2 +-
 iceberg_rust_ffi/Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/iceberg_rust_ffi/Cargo.lock b/iceberg_rust_ffi/Cargo.lock
index 8070ddd..c2cd772 100644
--- a/iceberg_rust_ffi/Cargo.lock
+++ b/iceberg_rust_ffi/Cargo.lock
@@ -1638,7 +1638,7 @@ dependencies = [
 
 [[package]]
 name = "iceberg_rust_ffi"
-version = "0.7.17"
+version = "0.7.19"
 dependencies = [
  "anyhow",
  "arrow-array",
diff --git a/iceberg_rust_ffi/Cargo.toml b/iceberg_rust_ffi/Cargo.toml
index 7b2cd92..f76f726 100644
--- a/iceberg_rust_ffi/Cargo.toml
+++ b/iceberg_rust_ffi/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "iceberg_rust_ffi"
-version = "0.7.17"
+version = "0.7.19"
 edition = "2021"
 
 [lib]

From 218f426acd225b75803faef63950f19b6b39a5c6 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:18:09 +0200
Subject: [PATCH 07/25] Fix test

---
 iceberg_rust_ffi/src/writer.rs         | 168 ++++++++++---------------
 iceberg_rust_ffi/src/writer_columns.rs |  10 +-
 src/writer.jl                          |  21 +---
 3 files changed, 74 insertions(+), 125 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index 38aae30..fb04b84 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -4,7 +4,7 @@
 /// across all writers. Per-writer ordering is guaranteed by the per-writer
 /// `Arc<Mutex<ConcreteDataFileWriter>>` inside WriterState: only one pool thread encodes
 /// a given writer at a time, and the FIFO global queue ensures batches are submitted
-/// in order. Drain tasks are lightweight async forwarders (no CPU work).
+/// in order.
 use std::ffi::{c_char, c_void};
 use std::io::Cursor;
 use std::panic::{catch_unwind, AssertUnwindSafe};
@@ -26,8 +26,6 @@ use iceberg::writer::file_writer::ParquetWriterBuilder;
 use iceberg::writer::{IcebergWriter, IcebergWriterBuilder};
 use parquet::basic::{Compression, Encoding};
 use parquet::file::properties::{EnabledStatistics, WriterProperties};
-use tokio::sync::mpsc;
-use tokio::task::JoinHandle;
 
 /// Compression codec values (must match Julia's CompressionCodec enum)
 const COMPRESSION_UNCOMPRESSED: i32 = 0;
@@ -107,9 +105,6 @@ use object_store_ffi::{
 type ConcreteDataFileWriter =
     DataFileWriter<ParquetWriterBuilder, DefaultLocationGenerator, DefaultFileNameGenerator>;
 
-/// Batch item sent through the per-writer drain channel.
-pub(crate) type BatchItem = RecordBatch;
-
 /// Encode task submitted to the global worker pool.
 struct EncodeTask {
     batch: RecordBatch,
@@ -240,18 +235,12 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
 
 /// Opaque writer handle for FFI.
 ///
-/// Writing is pipelined: Julia gathers a RecordBatch, sends it to the bounded per-writer
-/// channel (capacity 1), and returns immediately. A lightweight async drain task forwards
-/// batches to the global encode pool. Pool workers (N = available_parallelism) encode
-/// Parquet concurrently across all active writers.
+/// Writing is pipelined: Julia gathers a RecordBatch and submits it directly to the
+/// global encode pool, then returns immediately. Pool workers (N = available_parallelism)
+/// encode Parquet concurrently across all active writers.
 pub struct IcebergDataFileWriter {
     /// Arrow schema for this table, used by write_columns to create RecordBatches.
     pub(crate) arrow_schema: ArrowSchemaRef,
-    /// Sender side of the per-writer batch queue (bounded, capacity 1).
-    /// Dropped in iceberg_writer_close to signal EOF to the drain task.
-    pub(crate) batch_tx: Option<mpsc::Sender<BatchItem>>,
-    /// Lightweight async drain task: forwards batches from batch_tx to the global pool.
-    pub(crate) drain_handle: Option<JoinHandle<()>>,
     /// Shared state: owns the ConcreteDataFileWriter, tracks pending count and errors.
     pub(crate) writer_state: Arc<WriterState>,
 }
@@ -485,16 +474,12 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
     }
 }
 
-/// Free a writer. Aborts the drain task and poisons the writer state so any in-flight
-/// pool tasks fail gracefully (error/cancel path).
+/// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully.
 #[no_mangle]
 pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) {
     if !writer.is_null() {
         unsafe {
             let boxed = Box::from_raw(writer);
-            if let Some(handle) = boxed.drain_handle {
-                handle.abort();
-            }
             // Poison the ConcreteDataFileWriter so any in-flight pool tasks return an error
             // rather than writing to a partially-freed writer.
             let _ = boxed.writer_state.writer.lock().unwrap().take();
@@ -504,8 +489,7 @@ pub extern "C" fn iceberg_writer_free(writer: *mut IcebergDataFileWriter) {
 
 // Create a new DataFileWriter from a table with configuration options.
 //
-// Spawns a lightweight async drain task per writer. The global encode pool
-// (N = available_parallelism threads) is initialized on the first call.
+// The global encode pool (N = available_parallelism threads) is initialized on the first call.
 export_runtime_op!(
     iceberg_writer_new,
     IcebergDataFileWriterResponse,
@@ -575,7 +559,7 @@ export_runtime_op!(
         );
 
         // Initialize global pool (no-op if already running).
-        let pool = get_or_init_encode_pool();
+        get_or_init_encode_pool();
 
         let writer_state = Arc::new(WriterState {
             writer: Mutex::new(Some(concrete_writer)),
@@ -584,31 +568,8 @@ export_runtime_op!(
             error: Mutex::new(None),
         });
 
-        // Bounded per-writer channel (capacity 1): Julia can be at most one batch ahead
-        // of the pool. This provides backpressure: send blocks if the drain task hasn't
-        // forwarded the previous batch yet.
-        let (batch_tx, mut batch_rx) = mpsc::channel::<BatchItem>(1);
-
-        // Lightweight drain task: forwards batches to the global pool asynchronously.
-        let drain_handle: JoinHandle<()> = {
-            let state = writer_state.clone();
-            let pool_tx = pool.task_tx.clone();
-            tokio::task::spawn(async move {
-                while let Some(batch) = batch_rx.recv().await {
-                    state.pending.fetch_add(1, Ordering::AcqRel);
-                    let task = EncodeTask { batch, state: state.clone() };
-                    if pool_tx.send(task).await.is_err() {
-                        eprintln!("[iceberg:drain] pool channel closed");
-                        break;
-                    }
-                }
-            })
-        };
-
         Ok::<IcebergDataFileWriter, anyhow::Error>(IcebergDataFileWriter {
             arrow_schema,
-            batch_tx: Some(batch_tx),
-            drain_handle: Some(drain_handle),
             writer_state,
         })
     },
@@ -618,65 +579,71 @@ export_runtime_op!(
     parquet_props: *const ParquetWriterPropertiesFFI
 );
 
-// Write Arrow IPC data to the writer.
-//
-// Deserializes the IPC stream, then sends each RecordBatch to the drain task's channel.
-// Returns immediately after queuing — encoding happens asynchronously in the global pool.
-export_runtime_op!(
-    iceberg_writer_write,
-    crate::IcebergResponse,
-    || {
-        if writer.is_null() {
-            return Err(anyhow::anyhow!("Null writer pointer provided"));
-        }
-        if arrow_ipc_data.is_null() {
-            return Err(anyhow::anyhow!("Null arrow_ipc_data pointer provided"));
+/// Write Arrow IPC data synchronously: copy IPC bytes from Julia, deserialize the stream,
+/// and submit each RecordBatch to the global encode pool.
+///
+/// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
+#[no_mangle]
+pub extern "C" fn iceberg_writer_write_sync(
+    writer: *mut IcebergDataFileWriter,
+    arrow_ipc_data: *const u8,
+    arrow_ipc_len: usize,
+) -> i32 {
+    if writer.is_null() || arrow_ipc_data.is_null() || arrow_ipc_len == 0 {
+        return -1;
+    }
+
+    let writer_ref = unsafe { &*writer };
+
+    let pool = match GLOBAL_ENCODE_POOL.get() {
+        Some(p) => p,
+        None => {
+            eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first");
+            return -1;
         }
-        if arrow_ipc_len == 0 {
-            return Err(anyhow::anyhow!("Arrow IPC data length is zero"));
+    };
+
+    let ipc_bytes = unsafe {
+        std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec()
+    };
+
+    let cursor = Cursor::new(ipc_bytes);
+    let reader = match StreamReader::try_new(cursor, None) {
+        Ok(r) => r,
+        Err(e) => {
+            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); }
+            return -1;
         }
+    };
 
-        // Copy the IPC data into a Vec for safe use across await points
-        let ipc_bytes = unsafe {
-            std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec()
+    for batch_result in reader {
+        let batch = match batch_result {
+            Ok(b) => b,
+            Err(e) => {
+                let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+                if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); }
+                return -1;
+            }
         };
-
-        let writer_ref = unsafe { &mut *writer };
-        Ok((writer_ref, ipc_bytes))
-    },
-    result_tuple,
-    async {
-        let (writer_ref, ipc_bytes) = result_tuple;
-
-        let tx = writer_ref
-            .batch_tx
-            .as_ref()
-            .ok_or_else(|| anyhow::anyhow!("Writer has been closed"))?;
-
-        // Deserialize Arrow IPC to RecordBatches and enqueue each one
-        let cursor = Cursor::new(ipc_bytes);
-        let mut reader = StreamReader::try_new(cursor, None)
-            .map_err(|e| anyhow::anyhow!("Failed to create Arrow IPC reader: {}", e))?;
-
-        while let Some(batch_result) = reader.next() {
-            let batch = batch_result
-                .map_err(|e| anyhow::anyhow!("Failed to read Arrow IPC batch: {}", e))?;
-            tx.send(batch).await
-                .map_err(|_| anyhow::anyhow!("Writer channel closed (drain task may have failed)"))?;
+        writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel);
+        let task = EncodeTask { batch, state: writer_ref.writer_state.clone() };
+        if pool.task_tx.blocking_send(task).is_err() {
+            let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel);
+            if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); }
+            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() { *slot = Some(anyhow::anyhow!("Encode pool closed")); }
+            return -1;
         }
+    }
 
-        Ok::<(), anyhow::Error>(())
-    },
-    writer: *mut IcebergDataFileWriter,
-    arrow_ipc_data: *const u8,
-    arrow_ipc_len: usize
-);
+    0
+}
 
 // Close the writer and return the produced DataFiles.
 //
-// Drops the batch sender (signals EOF to the drain task), waits for the drain task to
-// finish forwarding all batches, waits for all pool encodes to complete, then finalizes
-// the Parquet file and returns the DataFiles metadata.
+// Waits for all pending pool encodes to complete, then finalizes the Parquet file
+// and returns the DataFiles metadata.
 export_runtime_op!(
     iceberg_writer_close,
     IcebergWriterCloseResponse,
@@ -689,15 +656,6 @@ export_runtime_op!(
     },
     writer_ref,
     async {
-        // Drop the sender — signals the drain task that no more batches are coming
-        drop(writer_ref.batch_tx.take());
-
-        // Wait for the drain task to finish forwarding all batches to the pool
-        if let Some(handle) = writer_ref.drain_handle.take() {
-            handle.await
-                .map_err(|e| anyhow::anyhow!("Drain task panicked: {}", e))?;
-        }
-
         // Wait for all pending pool encodes to complete.
         // Uses a timeout to guard against a dead worker thread (e.g. panic outside
         // catch_unwind) that would otherwise leave pending > 0 forever.
diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index 6bf4c67..b099e3b 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -123,11 +123,11 @@ unsafe fn build_null_buffer_scattered(
                 bits[(out + i) / 8] |= b << ((out + i) % 8);
             }
         } else {
-            // Scattered: gather validity bits via selection indices (1-based)
-            let sel = std::slice::from_raw_parts(slice.sel_ptr, slice.len);
-            for (i, &idx) in sel.iter().enumerate() {
-                let src = (idx - 1) as usize;
-                let b = (*slice.validity_ptr.add(src / 8) >> (src % 8)) & 1;
+            // Scattered: validity is per output row (0..len-1), same as sequential.
+            // The selection governs which *data* elements are picked, not which
+            // validity bits — the validity bitmap has one bit per output row.
+            for i in 0..slice.len {
+                let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1;
                 bits[(out + i) / 8] |= b << ((out + i) % 8);
             }
         }
diff --git a/src/writer.jl b/src/writer.jl
index 4e41776..c9d50a5 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -415,23 +415,14 @@ end
 
 # Internal helper to write raw IPC bytes to the Rust writer
 function _write_ipc_bytes(writer::DataFileWriter, ipc_bytes::Vector{UInt8})
-    ipc_data = pointer(ipc_bytes)
-    ipc_len = length(ipc_bytes)
-
-    response = Response{Cvoid}(-1, nothing, C_NULL, C_NULL)
-
-    async_ccall(response, ipc_bytes) do handle
-        @ccall rust_lib.iceberg_writer_write(
+    ret = GC.@preserve ipc_bytes begin
+        @ccall rust_lib.iceberg_writer_write_sync(
             writer.ptr::Ptr{Cvoid},
-            ipc_data::Ptr{UInt8},
-            ipc_len::Csize_t,
-            response::Ref{Response{Cvoid}},
-            handle::Ptr{Cvoid}
-        )::Cint
+            pointer(ipc_bytes)::Ptr{UInt8},
+            length(ipc_bytes)::Csize_t,
+        )::Int32
     end
-
-    @throw_on_error(response, "write", IcebergException)
-
+    ret == 0 || throw(IcebergException("write failed (see writer close for details)"))
     return nothing
 end
 

From a1a78babb2ed3788380778baa58409af3ad8c54f Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:20:31 +0200
Subject: [PATCH 08/25] format

---
 iceberg_rust_ffi/src/writer.rs | 53 +++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index fb04b84..8dff45f 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -603,16 +603,20 @@ pub extern "C" fn iceberg_writer_write_sync(
         }
     };
 
-    let ipc_bytes = unsafe {
-        std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec()
-    };
+    let ipc_bytes = unsafe { std::slice::from_raw_parts(arrow_ipc_data, arrow_ipc_len).to_vec() };
 
     let cursor = Cursor::new(ipc_bytes);
     let reader = match StreamReader::try_new(cursor, None) {
         Ok(r) => r,
         Err(e) => {
-            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC reader: {}", e)); }
+            let mut slot = writer_ref
+                .writer_state
+                .error
+                .lock()
+                .unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() {
+                *slot = Some(anyhow::anyhow!("IPC reader: {}", e));
+            }
             return -1;
         }
     };
@@ -621,18 +625,41 @@ pub extern "C" fn iceberg_writer_write_sync(
         let batch = match batch_result {
             Ok(b) => b,
             Err(e) => {
-                let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
-                if slot.is_none() { *slot = Some(anyhow::anyhow!("IPC batch: {}", e)); }
+                let mut slot = writer_ref
+                    .writer_state
+                    .error
+                    .lock()
+                    .unwrap_or_else(|e| e.into_inner());
+                if slot.is_none() {
+                    *slot = Some(anyhow::anyhow!("IPC batch: {}", e));
+                }
                 return -1;
             }
         };
-        writer_ref.writer_state.pending.fetch_add(1, Ordering::AcqRel);
-        let task = EncodeTask { batch, state: writer_ref.writer_state.clone() };
+        writer_ref
+            .writer_state
+            .pending
+            .fetch_add(1, Ordering::AcqRel);
+        let task = EncodeTask {
+            batch,
+            state: writer_ref.writer_state.clone(),
+        };
         if pool.task_tx.blocking_send(task).is_err() {
-            let prev = writer_ref.writer_state.pending.fetch_sub(1, Ordering::AcqRel);
-            if prev == 1 { writer_ref.writer_state.done_notify.notify_one(); }
-            let mut slot = writer_ref.writer_state.error.lock().unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() { *slot = Some(anyhow::anyhow!("Encode pool closed")); }
+            let prev = writer_ref
+                .writer_state
+                .pending
+                .fetch_sub(1, Ordering::AcqRel);
+            if prev == 1 {
+                writer_ref.writer_state.done_notify.notify_one();
+            }
+            let mut slot = writer_ref
+                .writer_state
+                .error
+                .lock()
+                .unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() {
+                *slot = Some(anyhow::anyhow!("Encode pool closed"));
+            }
             return -1;
         }
     }

From 3e46312ebce9a3be74fe12dea2f009e555282d62 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:23:30 +0200
Subject: [PATCH 09/25] Rename fn

---
 src/RustyIceberg.jl  | 2 +-
 src/writer.jl        | 6 +++---
 test/writer_tests.jl | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/RustyIceberg.jl b/src/RustyIceberg.jl
index a9a3385..41551f6 100644
--- a/src/RustyIceberg.jl
+++ b/src/RustyIceberg.jl
@@ -40,7 +40,7 @@ export COLUMN_TYPE_INT32, COLUMN_TYPE_INT64, COLUMN_TYPE_FLOAT32, COLUMN_TYPE_FL
 export COLUMN_TYPE_STRING, COLUMN_TYPE_DATE, COLUMN_TYPE_TIMESTAMP, COLUMN_TYPE_TIMESTAMPTZ, COLUMN_TYPE_BOOLEAN, COLUMN_TYPE_UUID
 export COLUMN_TYPE_DECIMAL_INT32, COLUMN_TYPE_DECIMAL_INT64, COLUMN_TYPE_DECIMAL_INT128
 export julia_type_to_column_type
-export GatheredColumn, GatheredBatch, add_slice!, add_string_slice!, write_scattered_columns_sync
+export GatheredColumn, GatheredBatch, add_slice!, add_string_slice!
 
 # Always use the JLL library - override via Preferences if needed for local development
 # To use a local build, set the preference:
diff --git a/src/writer.jl b/src/writer.jl
index c9d50a5..522f902 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -1050,7 +1050,7 @@ function Base.push!(
 end
 
 """
-    write_scattered_columns_sync(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve])
+    write_columns(writer::DataFileWriter, batch::GatheredBatch[, extra_preserve])
 
 Gather column data from Julia memory synchronously, then encode asynchronously.
 
@@ -1064,7 +1064,7 @@ The source data pointed to by the `GatheredBatch` slices and `extra_preserve` mu
 valid for the duration of this call. After the call returns, all Julia pointers have
 been consumed and the source data may be safely released.
 """
-function write_scattered_columns_sync(
+function write_columns(
     writer::DataFileWriter,
     batch::GatheredBatch,
     extra_preserve = nothing,
@@ -1098,6 +1098,6 @@ function write_scattered_columns_sync(
             length(descriptors)::Csize_t,
         )::Int32
     end
-    ret == 0 || throw(IcebergException("write_scattered_columns_sync: gather failed (see writer close for details)"))
+    ret == 0 || throw(IcebergException("write_columns (gathered): gather failed (see writer close for details)"))
     return nothing
 end
diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index 8d263d1..80c9380 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1149,8 +1149,8 @@ end
     println("\n✅ write_columns decimal nullable tests completed!")
 end
 
-@testset "Writer write_scattered_columns_sync API" begin
-    println("Testing write_scattered_columns_sync (gathered-column) API...")
+@testset "Writer write_columns (GatheredBatch) API" begin
+    println("Testing write_columns with GatheredBatch (gathered-column) API...")
 
     catalog_uri = get_catalog_uri()
     props = get_catalog_properties()
@@ -1239,8 +1239,8 @@ end
             push!(batch, tag_col)
             println("✅ tag column built (string via add_string_slice!)")
 
-            RustyIceberg.write_scattered_columns_sync(writer, batch)
-            println("✅ Batch written via write_scattered_columns_sync")
+            RustyIceberg.write_columns(writer, batch)
+            println("✅ Batch written via write_columns (GatheredBatch)")
         end
         @test data_files !== nothing && data_files.ptr != C_NULL
         println("✅ Writer closed")

From b2309f2f689d20faf2a300a28e2a163fce64018c Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:23:34 +0200
Subject: [PATCH 10/25] .

---
 test/writer_tests.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index 80c9380..77f8501 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1301,5 +1301,5 @@ end
         end
     end
 
-    println("\n✅ write_scattered_columns_sync API tests completed!")
+    println("\n✅ write_columns (GatheredBatch) API tests completed!")
 end

From 8dbd1e6c25021fbfe4bef6cf1bfd7d1abb365fd0 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:24:46 +0200
Subject: [PATCH 11/25] .

---
 src/writer.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/writer.jl b/src/writer.jl
index 522f902..2f85603 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -1003,7 +1003,7 @@ Collects a `GatheredColumn` per output column, then writes all of them in one ca
 batch = GatheredBatch()
 push!(batch, col_int64)
 push!(batch, col_float64)
-write_scattered_columns_sync(writer, batch)
+write_columns(writer, batch)
 ```
 
 You can also push a single-slice column inline without building a `GatheredColumn`
@@ -1013,7 +1013,7 @@ explicitly:
 batch = GatheredBatch()
 push!(batch, src_ints,   COLUMN_TYPE_INT64)
 push!(batch, src_floats, COLUMN_TYPE_FLOAT64; sel=indices, validity=valid_bv)
-write_scattered_columns_sync(writer, batch)
+write_columns(writer, batch)
 ```
 """
 mutable struct GatheredBatch

From 3d43275a5e32531a2bb60fe3c1b500e016187247 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 17:27:20 +0200
Subject: [PATCH 12/25] Fix naming

---
 iceberg_rust_ffi/src/writer.rs | 18 +++++++-----------
 src/writer.jl                  |  4 ++--
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index 8dff45f..9ccf5c0 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -254,21 +254,17 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse<IcebergDataFileWri
 /// Type alias for data files response (returns IcebergDataFiles handle)
 pub type IcebergWriterCloseResponse = IcebergBoxedResponse<IcebergDataFiles>;
 
-/// Synchronous scatter-gather write: gathers all column data from Julia memory into
-/// Arrow arrays in the calling thread, submits the RecordBatch to the global encode
-/// pool, then returns immediately (encode is still async).
+/// Gather column data from Julia memory into Arrow arrays in the calling thread, then
+/// submit the RecordBatch to the global encode pool asynchronously.
 ///
-/// Unlike `iceberg_writer_write_scattered_columns` (which requires a Tokio async context
-/// and uses a callback), this function is designed for plain C `ccall` from Julia:
-/// - Julia keeps source arrays alive via `GC.@preserve` for the duration of this call.
-/// - After this function returns, all Julia pointers have been consumed; Julia may safely
-///   release the source data.
-/// - Encode is still asynchronous in the global pool; call `iceberg_writer_close` to
-///   wait for all pending encodes.
+/// Julia keeps source arrays alive via `GC.@preserve` for the duration of this call.
+/// After this function returns, all Julia pointers have been consumed and Julia may safely
+/// release the source data. Encode is still asynchronous in the global pool; call
+/// `iceberg_writer_close` to wait for all pending encodes.
 ///
 /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
 #[no_mangle]
-pub extern "C" fn iceberg_writer_write_scattered_columns_sync(
+pub extern "C" fn iceberg_writer_write_gathered_columns(
     writer: *mut IcebergDataFileWriter,
     columns: *const GatheredColumnDescriptor,
     num_columns: usize,
diff --git a/src/writer.jl b/src/writer.jl
index 2f85603..b41d5bc 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -512,7 +512,7 @@ end
     GatheredColumnDescriptor
 
 FFI descriptor for a column to be gathered from multiple SliceRefs.
-Pass an array of these to `write_scattered_columns`.
+Pass an array of these to `write_columns`.
 
 - `slices_ptr`: pointer to array of SliceRef structs
 - `num_slices`: number of SliceRef entries
@@ -1092,7 +1092,7 @@ function write_columns(
     extra_preserve !== nothing && append!(preserve, extra_preserve)
 
     ret = GC.@preserve preserve all_slice_arrays descriptors begin
-        @ccall rust_lib.iceberg_writer_write_scattered_columns_sync(
+        @ccall rust_lib.iceberg_writer_write_gathered_columns(
             writer.ptr::Ptr{Cvoid},
             pointer(descriptors)::Ptr{GatheredColumnDescriptor},
             length(descriptors)::Csize_t,

From b2ae1972bd7ea26e247107353f13fca2a4dd72ab Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 18:01:02 +0200
Subject: [PATCH 13/25] Add test

---
 test/writer_tests.jl | 120 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index 77f8501..1d0aa30 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1303,3 +1303,123 @@ end
 
     println("\n✅ write_columns (GatheredBatch) API tests completed!")
 end
+
+@testset "Writer WriterConfig parquet properties" begin
+    println("Testing WriterConfig parquet writer properties...")
+
+    catalog_uri = get_catalog_uri()
+    props = get_catalog_properties()
+
+    catalog = nothing
+    try
+        catalog = RustyIceberg.catalog_create_rest(catalog_uri; properties=props)
+        @test catalog !== nothing
+        println("✅ Catalog created successfully")
+
+        call_count = Ref(0)
+
+        # Write n_rows with the given config, commit, read back, return the NamedTuple.
+        # Creates and cleans up its own namespace/table.
+        function write_read_config(config::RustyIceberg.WriterConfig, n_rows::Int=5)
+            call_count[] += 1
+            uid = "$(round(Int, time() * 1000))_$(call_count[])"
+            ns = ["test_wrcfg_$uid"]
+            tn = "wrcfg_$uid"
+            schema = Schema([
+                Field(Int32(1), "id",    IcebergLong();   required=true),
+                Field(Int32(2), "value", IcebergDouble(); required=false),
+            ])
+            table = C_NULL
+            updated_table = C_NULL
+            data_files = nothing
+            try
+                table = RustyIceberg.create_table(catalog, ns, tn, schema)
+                data_files = RustyIceberg.with_data_file_writer(table, config) do writer
+                    write(writer, (
+                        id    = Int64.(1:n_rows),
+                        value = Float64.(1:n_rows) .* 1.1,
+                    ))
+                end
+                updated_table = RustyIceberg.with_transaction(table, catalog) do tx
+                    RustyIceberg.with_fast_append(tx) do action
+                        RustyIceberg.add_data_files(action, data_files)
+                    end
+                end
+                return read_table_data(updated_table)
+            finally
+                if updated_table != C_NULL
+                    RustyIceberg.free_table(updated_table)
+                end
+                if data_files !== nothing && data_files.ptr != C_NULL
+                    RustyIceberg.free_data_files!(data_files)
+                end
+                if table != C_NULL
+                    RustyIceberg.free_table(table)
+                    RustyIceberg.drop_table(catalog, ns, tn)
+                    RustyIceberg.drop_namespace(catalog, ns)
+                end
+            end
+        end
+
+        # --- Compression codecs ---
+        for codec in [RustyIceberg.SNAPPY, RustyIceberg.GZIP, RustyIceberg.LZ4, RustyIceberg.ZSTD]
+            tbl = write_read_config(RustyIceberg.WriterConfig(compression=codec))
+            @test !isnothing(tbl)
+            @test length(tbl.id) == 5
+            perm = sortperm(tbl.id)
+            @test tbl.id[perm] == Int64[1, 2, 3, 4, 5]
+            @test tbl.value[perm] ≈ Float64[1, 2, 3, 4, 5] .* 1.1
+            println("✅ Compression $codec: data correct")
+        end
+
+        # --- plain_encoding=true (bypasses DELTA_BINARY_PACKED for INT64/INT32) ---
+        tbl = write_read_config(RustyIceberg.WriterConfig(plain_encoding=true))
+        @test !isnothing(tbl) && length(tbl.id) == 5
+        @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5]
+        println("✅ plain_encoding=true: data correct")
+
+        # --- dictionary_enabled=false ---
+        tbl = write_read_config(RustyIceberg.WriterConfig(dictionary_enabled=false))
+        @test !isnothing(tbl) && length(tbl.id) == 5
+        @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5]
+        println("✅ dictionary_enabled=false: data correct")
+
+        # --- statistics_enabled=false ---
+        tbl = write_read_config(RustyIceberg.WriterConfig(statistics_enabled=false))
+        @test !isnothing(tbl) && length(tbl.id) == 5
+        @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5]
+        println("✅ statistics_enabled=false: data correct")
+
+        # --- max_row_group_size=3 with 10 rows → forces ≥4 row groups ---
+        tbl = write_read_config(RustyIceberg.WriterConfig(max_row_group_size=3), 10)
+        @test !isnothing(tbl) && length(tbl.id) == 10
+        @test tbl.id[sortperm(tbl.id)] == Int64.(1:10)
+        println("✅ max_row_group_size=3 (10 rows, multiple row groups): all rows intact")
+
+        # --- write_batch_size=2 (rows encoded per column chunk within a row group) ---
+        tbl = write_read_config(RustyIceberg.WriterConfig(write_batch_size=2))
+        @test !isnothing(tbl) && length(tbl.id) == 5
+        @test tbl.id[sortperm(tbl.id)] == Int64[1, 2, 3, 4, 5]
+        println("✅ write_batch_size=2: data correct")
+
+        # --- Combined: ZSTD + plain_encoding + no dict + large write_batch_size ---
+        config = RustyIceberg.WriterConfig(
+            compression        = RustyIceberg.ZSTD,
+            plain_encoding     = true,
+            dictionary_enabled = false,
+            write_batch_size   = 65536,
+        )
+        tbl = write_read_config(config, 8)
+        @test !isnothing(tbl) && length(tbl.id) == 8
+        @test tbl.id[sortperm(tbl.id)] == Int64.(1:8)
+        println("✅ Combined (ZSTD + plain + no dict + large batch_size): data correct")
+
+    finally
+        if catalog !== nothing
+            RustyIceberg.free_catalog!(catalog)
+            println("✅ Catalog cleaned up")
+        end
+    end
+
+    println("\n✅ WriterConfig parquet properties tests completed!")
+end

From 66880e815edc2feecbc82f99e6d2c4d8fb395bca Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 18:05:50 +0200
Subject: [PATCH 14/25] Rename fn

---
 iceberg_rust_ffi/src/writer.rs         | 6 +++---
 iceberg_rust_ffi/src/writer_columns.rs | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index 9ccf5c0..d00700c 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -95,7 +95,7 @@ use crate::table::IcebergTable;
 use crate::transaction::IcebergDataFiles;
 use crate::util::parse_c_string;
 use crate::writer_columns::{
-    build_arrow_array_scattered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef,
+    build_arrow_array_gathered, ColumnDescriptor, GatheredColumnDescriptor, SliceRef,
 };
 use object_store_ffi::{
     export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
@@ -307,7 +307,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
     let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
         let mut arrays = Vec::with_capacity(num_columns);
         for (i, desc) in col_descs.iter().enumerate() {
-            arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? });
+            arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? });
         }
         RecordBatch::try_new(arrow_schema, arrays)
             .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
@@ -426,7 +426,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
     let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
         let mut arrays = Vec::with_capacity(num_columns);
         for (i, desc) in gathered.iter().enumerate() {
-            arrays.push(unsafe { build_arrow_array_scattered(desc, arrow_schema.field(i))? });
+            arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? });
         }
         RecordBatch::try_new(arrow_schema, arrays)
             .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index b099e3b..6f3f3d5 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -141,7 +141,7 @@ unsafe fn build_null_buffer_scattered(
 }
 
 /// Gather all slices for a column into an Arrow array.
-pub(crate) unsafe fn build_arrow_array_scattered(
+pub(crate) unsafe fn build_arrow_array_gathered(
     desc: &GatheredColumnDescriptor,
     schema_field: &arrow_schema::Field,
 ) -> Result<ArrayRef, anyhow::Error> {

From d3fe00ef80f38ec39cb1c8ee58e1ca19c82faa28 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 18:28:28 +0200
Subject: [PATCH 15/25] .

---
 test/writer_tests.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index 1d0aa30..78bb05e 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1333,6 +1333,7 @@ end
             updated_table = C_NULL
             data_files = nothing
             try
+                RustyIceberg.create_namespace(catalog, ns)
                 table = RustyIceberg.create_table(catalog, ns, tn, schema)
                 data_files = RustyIceberg.with_data_file_writer(table, config) do writer
                     write(writer, (

From 8ce862c91b478dac5e26ae29c49f726b417dcf4e Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 19:12:12 +0200
Subject: [PATCH 16/25] Disallow changing pool size

---
 iceberg_rust_ffi/src/writer.rs |  9 +++++++--
 src/writer.jl                  | 11 +++++++----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index d00700c..a45cb02 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -145,12 +145,17 @@ static GLOBAL_ENCODE_POOL: OnceLock<GlobalWorkerPool> = OnceLock::new();
 static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0);
 
 /// Set the number of encode worker threads in the global pool.
-/// Must be called before creating any writers; has no effect afterwards.
+/// Must be called before any writer is created. Returns 0 on success, 1 if the pool is
+/// already initialized (call ignored).
 #[no_mangle]
-pub extern "C" fn iceberg_set_encode_workers(n: i32) {
+pub extern "C" fn iceberg_set_encode_workers(n: i32) -> i32 {
+    if GLOBAL_ENCODE_POOL.get().is_some() {
+        return 1;
+    }
     if n > 0 {
         ENCODE_WORKERS.store(n as usize, Ordering::Relaxed);
     }
+    0
 }
 
 /// Initialize the global encode pool on first call.
diff --git a/src/writer.jl b/src/writer.jl
index b41d5bc..0675314 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -151,13 +151,16 @@ end
 
 Set the number of threads in the global Parquet encode worker pool.
 
-Must be called before creating any `DataFileWriter`; has no effect once the pool
-has been initialized (i.e. after the first writer is created). Defaults to
-`Sys.CPU_THREADS` if not set.
+Must be called before the first `DataFileWriter` is created (i.e. before the pool is
+initialized). Throws if the pool is already running. Defaults to `Sys.CPU_THREADS`
+if not set.
 """
 function set_encode_workers!(n::Int)
     n > 0 || throw(ArgumentError("n must be positive, got $n"))
-    @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Cvoid
+    ret = @ccall rust_lib.iceberg_set_encode_workers(n::Cint)::Int32
+    ret == 0 || throw(IcebergException(
+        "set_encode_workers! must be called before creating any DataFileWriter"
+    ))
     return nothing
 end
 

From bec8fa564c2937f7f66599df7cdce146c690d11b Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 19:18:52 +0200
Subject: [PATCH 17/25] Update comment

---
 iceberg_rust_ffi/src/writer_columns.rs | 7 +++++--
 src/writer.jl                          | 5 ++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index 6f3f3d5..f223bbb 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -248,8 +248,11 @@ pub(crate) unsafe fn build_arrow_array_gathered(
             ))
         }
         COLUMN_TYPE_STRING => {
-            // Strings are always pre-staged in Julia; data_ptr = *const *const u8,
-            // lengths_ptr = *const i64, sel_ptr is always null (identity).
+            // String columns do not support selection vectors. Julia strings are
+            // heap-allocated with non-contiguous addresses, so the caller must build
+            // str_ptrs/str_lens arrays up-front — any row selection is already applied
+            // before add_string_slice! is called. sel_ptr is therefore always null here.
+            // data_ptr = *const *const u8, lengths_ptr = *const i64.
             let mut all_strings: Vec<Option<&str>> = Vec::with_capacity(total);
             for slice in slices {
                 if slice.lengths_ptr.is_null() {
diff --git a/src/writer.jl b/src/writer.jl
index 0675314..4ecf46e 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -903,7 +903,10 @@ add_slice!(col, src_array2; sel=sel_indices)      # scattered: rows at sel_indic
 add_slice!(col, src_array3; validity=valid_bv)    # nullable slice
 ```
 
-String columns are not supported on the scattered path; use `ColumnBatch` instead.
+For string columns use `add_string_slice!` instead of `add_slice!`. Selection vectors
+are not supported for strings: Julia strings are non-contiguous, so the caller must
+build `str_ptrs`/`str_lens` arrays up-front — any row selection is applied on the Julia
+side before calling `add_string_slice!`.
 """
 mutable struct GatheredColumn
     slices::Vector{SliceRef}

From f535c117ecbf6c01814d28390d287247de3036c2 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Tue, 28 Apr 2026 19:24:43 +0200
Subject: [PATCH 18/25] Add high-level string overload for add_string_slice!

Accepts Vector{String} with optional validity BitVector, handling pointer
extraction and GC preservation internally. The low-level ptr/len overload
is retained for performance-sensitive callers with pre-allocated buffers.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/writer.jl        | 48 +++++++++++++++++++++++++++++++++++++++++---
 test/writer_tests.jl | 24 ++++++----------------
 2 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/src/writer.jl b/src/writer.jl
index 4ecf46e..6df49db 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -901,6 +901,9 @@ col = GatheredColumn(COLUMN_TYPE_INT64)
 add_slice!(col, src_array)                        # sequential: all rows
 add_slice!(col, src_array2; sel=sel_indices)      # scattered: rows at sel_indices
 add_slice!(col, src_array3; validity=valid_bv)    # nullable slice
+
+str_col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true)
+add_string_slice!(str_col, ["a", "", "c"]; validity=BitVector([true, false, true]))
 ```
 
 For string columns use `add_string_slice!` instead of `add_slice!`. Selection vectors
@@ -965,12 +968,51 @@ function add_slice!(
     return col
 end
 
+"""
+    add_string_slice!(col::GatheredColumn, strings::Vector{String}; validity=nothing)
+
+Append a string slice to `col` from a plain `Vector{String}`.
+
+- `validity`: optional `BitVector` (`true` = valid, `false` = null). Marking a row null
+  does not require a placeholder in `strings`, but the vector must still be the same length.
+
+```julia
+col = GatheredColumn(COLUMN_TYPE_STRING; nullable=true)
+add_string_slice!(col, ["hello", "", "world"]; validity=BitVector([true, false, true]))
+```
+
+For performance-critical paths where pointer/length arrays are pre-allocated, use the
+lower-level `add_string_slice!(col, str_ptrs, str_lens; validity)` overload directly.
+"""
+function add_string_slice!(
+    col::GatheredColumn,
+    strings::Vector{String};
+    validity::Union{Nothing, BitVector} = nothing,
+)
+    n = length(strings)
+    is_nullable = validity !== nothing
+    str_ptrs = Vector{Ptr{UInt8}}(undef, n)
+    str_lens = Vector{Int64}(undef, n)
+    for i in 1:n
+        if is_nullable && !validity[i]
+            str_ptrs[i] = Ptr{UInt8}(C_NULL)
+            str_lens[i] = 0
+        else
+            str_ptrs[i] = pointer(strings[i])
+            str_lens[i] = ncodeunits(strings[i])
+        end
+    end
+    push!(col.preserve, strings)  # keep String objects alive so pointers remain valid
+    return add_string_slice!(col, str_ptrs, str_lens; validity)
+end
+
 """
     add_string_slice!(col::GatheredColumn, str_ptrs, str_lens; validity=nothing)
 
-Append a string slice to `col`. `str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to
-UTF-8 string data and `str_lens` is a `Vector{Int64}` of corresponding byte lengths.
-The caller is responsible for keeping the pointed-to string bytes alive.
+Low-level overload: append a string slice from pre-built pointer/length arrays.
+`str_ptrs` is a `Vector{Ptr{UInt8}}` of pointers to UTF-8 string data and `str_lens`
+is a `Vector{Int64}` of corresponding byte lengths. The caller is responsible for keeping
+the pointed-to string bytes alive until `write_columns` returns.
 """
 function add_string_slice!(
     col::GatheredColumn,
diff --git a/test/writer_tests.jl b/test/writer_tests.jl
index 78bb05e..1c15dd7 100644
--- a/test/writer_tests.jl
+++ b/test/writer_tests.jl
@@ -1216,26 +1216,14 @@ end
             push!(batch, score_col)
             println("✅ score column built (scattered + nullable)")
 
-            # tag: string column via add_string_slice!
+            # tag: string column via the high-level add_string_slice! overload
             # Row 0: "alpha", row 1: null, row 2: "gamma", row 3: null
-            tag_strings = ["alpha", "", "gamma", ""]  # empty at null positions
-            tag_ptrs = Vector{Ptr{UInt8}}(undef, 4)
-            tag_lens = Vector{Int64}(undef, 4)
-            tag_valid = BitVector([true, false, true, false])
-            for i in 1:4
-                if tag_valid[i]
-                    tag_ptrs[i] = pointer(tag_strings[i])
-                    tag_lens[i] = ncodeunits(tag_strings[i])
-                else
-                    tag_ptrs[i] = Ptr{UInt8}(C_NULL)
-                    tag_lens[i] = 0
-                end
-            end
-
             tag_col = RustyIceberg.GatheredColumn(RustyIceberg.COLUMN_TYPE_STRING; nullable=true)
-            RustyIceberg.add_string_slice!(tag_col, tag_ptrs, tag_lens; validity=tag_valid)
-            # Preserve source strings so pointers stay valid until write completes
-            push!(tag_col.preserve, tag_strings)
+            RustyIceberg.add_string_slice!(
+                tag_col,
+                ["alpha", "", "gamma", ""];
+                validity=BitVector([true, false, true, false])
+            )
             push!(batch, tag_col)
             println("✅ tag column built (string via add_string_slice!)")
 

From e242f2d5138000e8e6d967fb48622424377194d3 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Wed, 29 Apr 2026 17:27:59 +0200
Subject: [PATCH 19/25] Optimise string column building: skip UTF-8 validation,
 avoid intermediate Vec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Build StringArray directly with OffsetBuffer + values Buffer in a single pass,
replacing Vec<Option<&str>> + StringArray::from. Use new_unchecked to skip
Arrow's UTF-8 re-validation — Julia strings are guaranteed valid UTF-8.

For 20M x 32-byte strings this eliminates ~320 MB of intermediate Vec<Option<&str>>
storage and ~640 MB of redundant UTF-8 validation reads per column.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 iceberg_rust_ffi/src/writer_columns.rs | 44 +++++++++++++++++---------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index f223bbb..fbdae91 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -13,7 +13,7 @@ use arrow_array::{
     },
     ArrayRef, BooleanArray, PrimitiveArray, StringArray,
 };
-use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
 
 /// Column type codes (must match Julia's ColumnType enum)
 pub const COLUMN_TYPE_INT32: i32 = 0;
@@ -253,7 +253,19 @@ pub(crate) unsafe fn build_arrow_array_gathered(
             // str_ptrs/str_lens arrays up-front — any row selection is already applied
             // before add_string_slice! is called. sel_ptr is therefore always null here.
             // data_ptr = *const *const u8, lengths_ptr = *const i64.
-            let mut all_strings: Vec<Option<&str>> = Vec::with_capacity(total);
+            //
+            // Build the Arrow StringArray directly: one pass copies string bytes into a
+            // contiguous values buffer and tracks cumulative offsets. This avoids the
+            // intermediate Vec<Option<&str>> and skips UTF-8 validation — Julia strings
+            // are guaranteed valid UTF-8.
+            let null_buf = if desc.is_nullable {
+                build_null_buffer_scattered(slices, total)
+            } else {
+                None
+            };
+            let mut offsets = Vec::<i32>::with_capacity(total + 1);
+            offsets.push(0i32);
+            let mut values = Vec::<u8>::new();
             for slice in slices {
                 if slice.lengths_ptr.is_null() {
                     return Err(anyhow::anyhow!("String column requires lengths_ptr"));
@@ -262,24 +274,26 @@ pub(crate) unsafe fn build_arrow_array_gathered(
                     std::slice::from_raw_parts(slice.data_ptr as *const *const u8, slice.len);
                 let lens = std::slice::from_raw_parts(slice.lengths_ptr, slice.len);
                 for i in 0..slice.len {
-                    let is_null = if !slice.validity_ptr.is_null() {
-                        (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1 == 0
-                    } else {
-                        false
-                    };
-                    if is_null {
-                        all_strings.push(None);
-                    } else {
-                        let s = std::str::from_utf8(std::slice::from_raw_parts(
+                    let is_null = !slice.validity_ptr.is_null()
+                        && ((*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1) == 0;
+                    if !is_null {
+                        values.extend_from_slice(std::slice::from_raw_parts(
                             ptrs[i],
                             lens[i] as usize,
-                        ))
-                        .map_err(|e| anyhow::anyhow!("Invalid UTF-8: {}", e))?;
-                        all_strings.push(Some(s));
+                        ));
                     }
+                    offsets.push(values.len() as i32);
                 }
             }
-            Arc::new(StringArray::from(all_strings))
+            // SAFETY: offsets are monotonically non-decreasing by construction; values
+            // bytes come from Julia String objects (valid UTF-8) kept alive in col.preserve.
+            Arc::new(unsafe {
+                StringArray::new_unchecked(
+                    OffsetBuffer::new(ScalarBuffer::from(offsets)),
+                    Buffer::from_vec(values),
+                    null_buf,
+                )
+            })
         }
         COLUMN_TYPE_UUID => {
             let mut data: Vec<u8> = Vec::with_capacity(total * 16);

From 3e8bcc3e6165b9d58581e5b1785b77739bb4f227 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 13:51:43 +0200
Subject: [PATCH 20/25] Address PR review comments

- Extract encode worker loop body into encode_worker_loop()
- Retain panic message: downcast Box<dyn Any> to &str / String before formatting
- Add iceberg_take_gather_error() FFI + thread-local to surface gather errors
  immediately in Julia exceptions rather than deferring to writer close
- Clarify lengths_ptr doc: array of byte lengths per string
- Merge identical sequential/scattered validity branches in build_null_buffer_scattered
- Add explanatory comments: bitvector merging, re-alignment, all-valid bit-set

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 iceberg_rust_ffi/src/writer.rs         | 149 ++++++++++++++++---------
 iceberg_rust_ffi/src/writer_columns.rs |  27 ++---
 src/writer.jl                          |  12 +-
 3 files changed, 121 insertions(+), 67 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index a45cb02..ec59b1b 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -5,7 +5,9 @@
 /// `Arc<Mutex<ConcreteDataFileWriter>>` inside WriterState: only one pool thread encodes
 /// a given writer at a time, and the FIFO global queue ensures batches are submitted
 /// in order.
-use std::ffi::{c_char, c_void};
+use std::any::Any;
+use std::cell::RefCell;
+use std::ffi::{c_char, c_void, CString};
 use std::io::Cursor;
 use std::panic::{catch_unwind, AssertUnwindSafe};
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -140,6 +142,95 @@ struct GlobalWorkerPool {
 
 static GLOBAL_ENCODE_POOL: OnceLock<GlobalWorkerPool> = OnceLock::new();
 
+// Thread-local storage for the most recent synchronous gather error.
+// Set by iceberg_writer_write_gathered_columns on failure; consumed by iceberg_take_gather_error.
+thread_local! {
+    static LAST_GATHER_ERROR: RefCell<Option<CString>> = const { RefCell::new(None) };
+}
+
+fn store_gather_error(e: &anyhow::Error) {
+    let msg = format!("{:#}", e);
+    LAST_GATHER_ERROR.with(|cell| {
+        *cell.borrow_mut() = CString::new(msg).ok();
+    });
+}
+
+/// Returns a heap-allocated C string with the most recent gather error on this thread,
+/// or NULL if none. Must be called on the same thread as the failed write call, immediately
+/// after it returns. The caller must free the returned string with `iceberg_destroy_cstring`.
+#[no_mangle]
+pub extern "C" fn iceberg_take_gather_error() -> *mut c_char {
+    LAST_GATHER_ERROR.with(|cell| {
+        cell.borrow_mut()
+            .take()
+            .map(|s| s.into_raw())
+            .unwrap_or(std::ptr::null_mut())
+    })
+}
+
+/// Formats a Rust panic payload into an anyhow error, preserving the message where possible.
+fn format_panic_error(panic: Box<dyn Any + Send>) -> anyhow::Error {
+    let msg = if let Some(s) = panic.downcast_ref::<&str>() {
+        format!("encode worker panicked: {}", s)
+    } else if let Some(s) = panic.downcast_ref::<String>() {
+        format!("encode worker panicked: {}", s)
+    } else {
+        "encode worker panicked (no string payload)".to_string()
+    };
+    anyhow::anyhow!(msg)
+}
+
+/// Body of each encode worker thread: receives tasks from the shared channel and encodes them.
+fn encode_worker_loop(
+    task_rx: Arc<tokio::sync::Mutex<tokio::sync::mpsc::Receiver<EncodeTask>>>,
+    handle: tokio::runtime::Handle,
+) {
+    loop {
+        // Acquire the shared receiver lock, then wait for a task.
+        // The lock is released as soon as recv() returns, so workers are not serialized
+        // during encoding — only during task pickup.
+        let task = {
+            let mut rx = handle.block_on(task_rx.lock());
+            match handle.block_on(rx.recv()) {
+                Some(t) => t,
+                None => break, // sender dropped → pool shutting down
+            }
+        };
+
+        // Clone state before moving task into the closure so we can always decrement
+        // pending even if the closure panics.
+        let state = task.state.clone();
+        let handle_enc = handle.clone();
+        let encode_result = catch_unwind(AssertUnwindSafe(move || {
+            let mut guard = task.state.writer.lock().unwrap_or_else(|e| e.into_inner());
+            match guard.as_mut() {
+                Some(w) => handle_enc
+                    .block_on(w.write(task.batch))
+                    .map_err(|e| anyhow::anyhow!("write batch: {}", e)),
+                None => Err(anyhow::anyhow!("writer already closed")),
+            }
+        }));
+
+        let err = match encode_result {
+            Ok(Ok(())) => None,
+            Ok(Err(e)) => Some(e),
+            Err(panic) => Some(format_panic_error(panic)),
+        };
+        if let Some(e) = err {
+            let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner());
+            if slot.is_none() {
+                *slot = Some(e);
+            }
+        }
+
+        // Always decrement pending; notify close() if this was the last task.
+        let prev = state.pending.fetch_sub(1, Ordering::AcqRel);
+        if prev == 1 {
+            state.done_notify.notify_one();
+        }
+    }
+}
+
 /// Desired encode worker count. 0 means "use available_parallelism".
 /// Must be set before the first iceberg_writer_new call.
 static ENCODE_WORKERS: AtomicUsize = AtomicUsize::new(0);
@@ -180,57 +271,7 @@ fn get_or_init_encode_pool() -> &'static GlobalWorkerPool {
             let handle = handle.clone();
             thread::Builder::new()
                 .name(format!("iceberg-encode-{}", i))
-                .spawn(move || {
-                    loop {
-                        // Acquire the shared receiver lock, then wait for a task.
-                        // The lock is released as soon as recv() returns, so workers
-                        // are not serialized during encoding — only during task pickup.
-                        let task = {
-                            let mut rx = handle.block_on(task_rx.lock());
-                            match handle.block_on(rx.recv()) {
-                                Some(t) => t,
-                                None => break, // sender dropped → pool shutting down
-                            }
-                        };
-
-                        // Clone state before moving task into the closure so we can
-                        // always decrement pending even if the closure panics.
-                        let state = task.state.clone();
-
-                        let handle_enc = handle.clone();
-                        let encode_result = catch_unwind(AssertUnwindSafe(move || {
-                            let result: anyhow::Result<()> = {
-                                let mut guard =
-                                    task.state.writer.lock().unwrap_or_else(|e| e.into_inner());
-                                match guard.as_mut() {
-                                    Some(w) => handle_enc
-                                        .block_on(w.write(task.batch))
-                                        .map_err(|e| anyhow::anyhow!("write batch: {}", e)),
-                                    None => Err(anyhow::anyhow!("writer already closed")),
-                                }
-                            };
-                            result
-                        }));
-
-                        let err = match encode_result {
-                            Ok(Ok(())) => None,
-                            Ok(Err(e)) => Some(e),
-                            Err(_panic) => Some(anyhow::anyhow!("encode worker panicked")),
-                        };
-                        if let Some(e) = err {
-                            let mut slot = state.error.lock().unwrap_or_else(|e| e.into_inner());
-                            if slot.is_none() {
-                                *slot = Some(e);
-                            }
-                        }
-
-                        // Always decrement pending; notify close() if this was the last task.
-                        let prev = state.pending.fetch_sub(1, Ordering::AcqRel);
-                        if prev == 1 {
-                            state.done_notify.notify_one();
-                        }
-                    }
-                })
+                .spawn(move || encode_worker_loop(task_rx, handle))
                 .expect("failed to spawn iceberg encode worker");
         }
 
@@ -319,6 +360,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
     })() {
         Ok(b) => b,
         Err(e) => {
+            store_gather_error(&e);
             let mut slot = writer_ref
                 .writer_state
                 .error
@@ -352,7 +394,8 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
             if prev == 1 {
                 writer_ref.writer_state.done_notify.notify_one();
             }
-            eprintln!("[iceberg:sync] pool channel closed");
+            let e = anyhow::anyhow!("encode pool channel closed unexpectedly");
+            store_gather_error(&e);
             -1
         }
     }
diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index fbdae91..ca9369f 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -68,7 +68,7 @@ unsafe impl Sync for ColumnDescriptor {}
 /// `sel_ptr = null`  → sequential (identity) access: read data[0..len].
 /// `sel_ptr != null` → scattered access: read data[sel[i]-1] for i in 0..len (1-based Julia indices).
 /// `validity_ptr = null` → all rows valid (non-nullable or known all-valid slice).
-/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[].
+/// `lengths_ptr != null` → string column: data_ptr is Ptr{UInt8}[], lengths_ptr is Int64[] of byte lengths per string.
 /// Fields are all 8 bytes — no padding, total 40 bytes.
 #[repr(C)]
 #[derive(Clone, Copy)]
@@ -99,8 +99,16 @@ pub struct GatheredColumnDescriptor {
 unsafe impl Send for GatheredColumnDescriptor {}
 unsafe impl Sync for GatheredColumnDescriptor {}
 
-/// Build the Arrow null buffer by scanning validity bits across all slices.
-/// Returns `None` if every slice has a null `validity_ptr` (all rows valid).
+/// Merges the per-slice validity bitmaps from all slices into a single output bitmap.
+///
+/// Each slice contributes `slice.len` output rows. Slices with a null `validity_ptr` are
+/// all-valid. Slices with a bitmap may be misaligned relative to the output (each slice
+/// starts at a different `out` offset), so bits are copied one at a time with a shift.
+/// The selection vector (`sel_ptr`) governs which *source data* elements to read; the
+/// validity bitmap is always indexed by output row position, so sequential and scattered
+/// slices are treated identically here.
+///
+/// Returns `None` if every slice is all-valid (no null buffer needed).
 unsafe fn build_null_buffer_scattered(
     slices: &[SliceRef],
     total_rows: usize,
@@ -112,20 +120,13 @@ unsafe fn build_null_buffer_scattered(
     let mut out = 0usize;
     for slice in slices {
         if slice.validity_ptr.is_null() {
-            // All valid — set bits 1
+            // All rows in this slice are valid — set one bit per output row.
             for i in 0..slice.len {
                 bits[(out + i) / 8] |= 1u8 << ((out + i) % 8);
             }
-        } else if slice.sel_ptr.is_null() {
-            // Sequential: copy bits with possible alignment shift
-            for i in 0..slice.len {
-                let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1;
-                bits[(out + i) / 8] |= b << ((out + i) % 8);
-            }
         } else {
-            // Scattered: validity is per output row (0..len-1), same as sequential.
-            // The selection governs which *data* elements are picked, not which
-            // validity bits — the validity bitmap has one bit per output row.
+            // Copy validity bits from the slice's bitmap into the output bitmap,
+            // re-aligning from source bit position i to output bit position (out + i).
             for i in 0..slice.len {
                 let b = (*slice.validity_ptr.add(i / 8) >> (i % 8)) & 1;
                 bits[(out + i) / 8] |= b << ((out + i) % 8);
diff --git a/src/writer.jl b/src/writer.jl
index 6df49db..b0f0b1a 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -1146,6 +1146,16 @@ function write_columns(
             length(descriptors)::Csize_t,
         )::Int32
     end
-    ret == 0 || throw(IcebergException("write_columns (gathered): gather failed (see writer close for details)"))
+    if ret != 0
+        err_ptr = @ccall rust_lib.iceberg_take_gather_error()::Ptr{Cchar}
+        msg = if err_ptr != C_NULL
+            s = unsafe_string(err_ptr)
+            @ccall rust_lib.iceberg_destroy_cstring(err_ptr::Ptr{Cchar})::Cint
+            s
+        else
+            "gather failed (see writer close for details)"
+        end
+        throw(IcebergException("write_columns (gathered): $(msg)"))
+    end
     return nothing
 end

From 7a8947a9a864019c32ba5a7660294aa52e0c939c Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 13:54:31 +0200
Subject: [PATCH 21/25] =?UTF-8?q?Rename=20build=5Fnull=5Fbuffer=5Fscattere?=
 =?UTF-8?q?d=20=E2=86=92=20build=5Fnull=5Fbuffer=5Fgathered?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 iceberg_rust_ffi/src/writer_columns.rs | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer_columns.rs b/iceberg_rust_ffi/src/writer_columns.rs
index ca9369f..7e1665a 100644
--- a/iceberg_rust_ffi/src/writer_columns.rs
+++ b/iceberg_rust_ffi/src/writer_columns.rs
@@ -109,10 +109,7 @@ unsafe impl Sync for GatheredColumnDescriptor {}
 /// slices are treated identically here.
 ///
 /// Returns `None` if every slice is all-valid (no null buffer needed).
-unsafe fn build_null_buffer_scattered(
-    slices: &[SliceRef],
-    total_rows: usize,
-) -> Option<NullBuffer> {
+unsafe fn build_null_buffer_gathered(slices: &[SliceRef], total_rows: usize) -> Option<NullBuffer> {
     if !slices.iter().any(|s| !s.validity_ptr.is_null()) {
         return None;
     }
@@ -149,7 +146,7 @@ pub(crate) unsafe fn build_arrow_array_gathered(
     let slices = std::slice::from_raw_parts(desc.slices, desc.num_slices);
     let total = desc.total_rows;
     let null_buf = if desc.is_nullable {
-        build_null_buffer_scattered(slices, total)
+        build_null_buffer_gathered(slices, total)
     } else {
         None
     };
@@ -260,7 +257,7 @@ pub(crate) unsafe fn build_arrow_array_gathered(
             // intermediate Vec<Option<&str>> and skips UTF-8 validation — Julia strings
             // are guaranteed valid UTF-8.
             let null_buf = if desc.is_nullable {
-                build_null_buffer_scattered(slices, total)
+                build_null_buffer_gathered(slices, total)
             } else {
                 None
             };

From 7aa1836624d00daa575c309c96fda421e14c5f94 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 14:48:42 +0200
Subject: [PATCH 22/25] Remove code dup

---
 iceberg_rust_ffi/src/writer.rs | 239 +++++++++++++--------------------
 1 file changed, 96 insertions(+), 143 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index ec59b1b..d6e67a7 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -300,6 +300,70 @@ pub type IcebergDataFileWriterResponse = IcebergBoxedResponse<IcebergDataFileWri
 /// Type alias for data files response (returns IcebergDataFiles handle)
 pub type IcebergWriterCloseResponse = IcebergBoxedResponse<IcebergDataFiles>;
 
+/// Store an error in the writer state (first error wins).
+fn store_writer_error(writer_ref: &IcebergDataFileWriter, e: anyhow::Error) {
+    let mut slot = writer_ref
+        .writer_state
+        .error
+        .lock()
+        .unwrap_or_else(|e| e.into_inner());
+    if slot.is_none() {
+        *slot = Some(e);
+    }
+}
+
+/// Build a `RecordBatch` from a slice of `GatheredColumnDescriptor`s.
+///
+/// # Safety
+/// All pointers inside each `GatheredColumnDescriptor` must be valid for the duration of
+/// this call (callers hold `GC.@preserve` or equivalent).
+unsafe fn build_record_batch<I>(
+    arrow_schema: ArrowSchemaRef,
+    col_descs: I,
+) -> Result<RecordBatch, anyhow::Error>
+where
+    I: IntoIterator<Item = GatheredColumnDescriptor>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let iter = col_descs.into_iter();
+    let mut arrays = Vec::with_capacity(iter.len());
+    for (i, desc) in iter.enumerate() {
+        arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? });
+    }
+    RecordBatch::try_new(arrow_schema, arrays).map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
+}
+
+/// Submit a `RecordBatch` to the global encode pool.
+///
+/// Increments the writer's pending count before sending and rolls it back on channel failure.
+fn submit_batch(
+    writer_ref: &IcebergDataFileWriter,
+    pool: &GlobalWorkerPool,
+    batch: RecordBatch,
+) -> Result<(), anyhow::Error> {
+    writer_ref
+        .writer_state
+        .pending
+        .fetch_add(1, Ordering::AcqRel);
+    let task = EncodeTask {
+        batch,
+        state: writer_ref.writer_state.clone(),
+    };
+    match pool.task_tx.blocking_send(task) {
+        Ok(()) => Ok(()),
+        Err(_) => {
+            let prev = writer_ref
+                .writer_state
+                .pending
+                .fetch_sub(1, Ordering::AcqRel);
+            if prev == 1 {
+                writer_ref.writer_state.done_notify.notify_one();
+            }
+            Err(anyhow::anyhow!("encode pool channel closed unexpectedly"))
+        }
+    }
+}
+
 /// Gather column data from Julia memory into Arrow arrays in the calling thread, then
 /// submit the RecordBatch to the global encode pool asynchronously.
 ///
@@ -324,7 +388,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
     let pool = match GLOBAL_ENCODE_POOL.get() {
         Some(p) => p,
         None => {
-            eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first");
+            eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first");
             return -1;
         }
     };
@@ -333,72 +397,34 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
 
     if col_descs.len() != arrow_schema.fields().len() {
-        let mut slot = writer_ref
-            .writer_state
-            .error
-            .lock()
-            .unwrap_or_else(|e| e.into_inner());
-        if slot.is_none() {
-            *slot = Some(anyhow::anyhow!(
+        store_writer_error(
+            writer_ref,
+            anyhow::anyhow!(
                 "Column count mismatch: got {} but schema has {}",
                 col_descs.len(),
                 arrow_schema.fields().len()
-            ));
-        }
+            ),
+        );
         return -1;
     }
 
     // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread).
-    // After this block, all Julia pointers have been consumed — safe to release.
-    let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
-        let mut arrays = Vec::with_capacity(num_columns);
-        for (i, desc) in col_descs.iter().enumerate() {
-            arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? });
-        }
-        RecordBatch::try_new(arrow_schema, arrays)
-            .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
-    })() {
+    // After this point all Julia pointers have been consumed — safe to release.
+    let batch = match unsafe { build_record_batch(arrow_schema, col_descs.iter().copied()) } {
         Ok(b) => b,
         Err(e) => {
             store_gather_error(&e);
-            let mut slot = writer_ref
-                .writer_state
-                .error
-                .lock()
-                .unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() {
-                *slot = Some(e);
-            }
+            store_writer_error(writer_ref, e);
             return -1;
         }
     };
 
-    // Increment pending before submit so iceberg_writer_close always accounts for this batch.
-    writer_ref
-        .writer_state
-        .pending
-        .fetch_add(1, Ordering::AcqRel);
-    let task = EncodeTask {
-        batch,
-        state: writer_ref.writer_state.clone(),
-    };
-
-    match pool.task_tx.blocking_send(task) {
-        Ok(()) => 0,
-        Err(_) => {
-            // Pool channel closed — undo the pending increment.
-            let prev = writer_ref
-                .writer_state
-                .pending
-                .fetch_sub(1, Ordering::AcqRel);
-            if prev == 1 {
-                writer_ref.writer_state.done_notify.notify_one();
-            }
-            let e = anyhow::anyhow!("encode pool channel closed unexpectedly");
-            store_gather_error(&e);
-            -1
-        }
+    if let Err(e) = submit_batch(writer_ref, pool, batch) {
+        store_gather_error(&e);
+        store_writer_error(writer_ref, e);
+        return -1;
     }
+    0
 }
 
 /// Synchronous write of flat column data: copies each column from Julia memory into
@@ -422,7 +448,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
     let pool = match GLOBAL_ENCODE_POOL.get() {
         Some(p) => p,
         None => {
-            eprintln!("[iceberg:sync] encode pool not initialized; call iceberg_writer_new first");
+            eprintln!("[iceberg] encode pool not initialized; call iceberg_writer_new first");
             return -1;
         }
     };
@@ -431,18 +457,14 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
 
     if col_descs.len() != arrow_schema.fields().len() {
-        let mut slot = writer_ref
-            .writer_state
-            .error
-            .lock()
-            .unwrap_or_else(|e| e.into_inner());
-        if slot.is_none() {
-            *slot = Some(anyhow::anyhow!(
+        store_writer_error(
+            writer_ref,
+            anyhow::anyhow!(
                 "Column count mismatch: got {} but schema has {}",
                 col_descs.len(),
                 arrow_schema.fields().len()
-            ));
-        }
+            ),
+        );
         return -1;
     }
 
@@ -459,7 +481,7 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
         })
         .collect();
 
-    let gathered: Vec<GatheredColumnDescriptor> = col_descs
+    let gathered = col_descs
         .iter()
         .zip(slices.iter())
         .map(|(d, s)| GatheredColumnDescriptor {
@@ -468,54 +490,21 @@ pub extern "C" fn iceberg_writer_write_columns_sync(
             total_rows: d.num_rows,
             column_type: d.column_type,
             is_nullable: d.is_nullable,
-        })
-        .collect();
+        });
 
-    let batch = match (|| -> Result<RecordBatch, anyhow::Error> {
-        let mut arrays = Vec::with_capacity(num_columns);
-        for (i, desc) in gathered.iter().enumerate() {
-            arrays.push(unsafe { build_arrow_array_gathered(desc, arrow_schema.field(i))? });
-        }
-        RecordBatch::try_new(arrow_schema, arrays)
-            .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))
-    })() {
+    let batch = match unsafe { build_record_batch(arrow_schema, gathered) } {
         Ok(b) => b,
         Err(e) => {
-            let mut slot = writer_ref
-                .writer_state
-                .error
-                .lock()
-                .unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() {
-                *slot = Some(e);
-            }
+            store_writer_error(writer_ref, e);
             return -1;
         }
     };
 
-    writer_ref
-        .writer_state
-        .pending
-        .fetch_add(1, Ordering::AcqRel);
-    let task = EncodeTask {
-        batch,
-        state: writer_ref.writer_state.clone(),
-    };
-
-    match pool.task_tx.blocking_send(task) {
-        Ok(()) => 0,
-        Err(_) => {
-            let prev = writer_ref
-                .writer_state
-                .pending
-                .fetch_sub(1, Ordering::AcqRel);
-            if prev == 1 {
-                writer_ref.writer_state.done_notify.notify_one();
-            }
-            eprintln!("[iceberg:sync] pool channel closed");
-            -1
-        }
+    if let Err(e) = submit_batch(writer_ref, pool, batch) {
+        store_writer_error(writer_ref, e);
+        return -1;
     }
+    0
 }
 
 /// Free a writer. Poisons the writer state so any in-flight pool tasks fail gracefully.
@@ -653,14 +642,7 @@ pub extern "C" fn iceberg_writer_write_sync(
     let reader = match StreamReader::try_new(cursor, None) {
         Ok(r) => r,
         Err(e) => {
-            let mut slot = writer_ref
-                .writer_state
-                .error
-                .lock()
-                .unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() {
-                *slot = Some(anyhow::anyhow!("IPC reader: {}", e));
-            }
+            store_writer_error(writer_ref, anyhow::anyhow!("IPC reader: {}", e));
             return -1;
         }
     };
@@ -669,41 +651,12 @@ pub extern "C" fn iceberg_writer_write_sync(
         let batch = match batch_result {
             Ok(b) => b,
             Err(e) => {
-                let mut slot = writer_ref
-                    .writer_state
-                    .error
-                    .lock()
-                    .unwrap_or_else(|e| e.into_inner());
-                if slot.is_none() {
-                    *slot = Some(anyhow::anyhow!("IPC batch: {}", e));
-                }
+                store_writer_error(writer_ref, anyhow::anyhow!("IPC batch: {}", e));
                 return -1;
             }
         };
-        writer_ref
-            .writer_state
-            .pending
-            .fetch_add(1, Ordering::AcqRel);
-        let task = EncodeTask {
-            batch,
-            state: writer_ref.writer_state.clone(),
-        };
-        if pool.task_tx.blocking_send(task).is_err() {
-            let prev = writer_ref
-                .writer_state
-                .pending
-                .fetch_sub(1, Ordering::AcqRel);
-            if prev == 1 {
-                writer_ref.writer_state.done_notify.notify_one();
-            }
-            let mut slot = writer_ref
-                .writer_state
-                .error
-                .lock()
-                .unwrap_or_else(|e| e.into_inner());
-            if slot.is_none() {
-                *slot = Some(anyhow::anyhow!("Encode pool closed"));
-            }
+        if let Err(e) = submit_batch(writer_ref, pool, batch) {
+            store_writer_error(writer_ref, e);
             return -1;
         }
     }

From b820563946a6b5a60d221c936e183d9742796709 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 14:50:15 +0200
Subject: [PATCH 23/25] Rename fn

---
 iceberg_rust_ffi/src/writer.rs | 4 ++--
 src/writer.jl                  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index d6e67a7..0763a1f 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -434,7 +434,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
 /// Each `ColumnDescriptor` is treated as a single sequential slice (no scatter/gather).
 /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
 #[no_mangle]
-pub extern "C" fn iceberg_writer_write_columns_sync(
+pub extern "C" fn iceberg_writer_write_columns(
     writer: *mut IcebergDataFileWriter,
     columns: *const ColumnDescriptor,
     num_columns: usize,
@@ -617,7 +617,7 @@ export_runtime_op!(
 ///
 /// Returns 0 on success, -1 on error (error stored in writer state, propagated on close).
 #[no_mangle]
-pub extern "C" fn iceberg_writer_write_sync(
+pub extern "C" fn iceberg_writer_write(
     writer: *mut IcebergDataFileWriter,
     arrow_ipc_data: *const u8,
     arrow_ipc_len: usize,
diff --git a/src/writer.jl b/src/writer.jl
index b0f0b1a..901144a 100644
--- a/src/writer.jl
+++ b/src/writer.jl
@@ -419,7 +419,7 @@ end
 # Internal helper to write raw IPC bytes to the Rust writer
 function _write_ipc_bytes(writer::DataFileWriter, ipc_bytes::Vector{UInt8})
     ret = GC.@preserve ipc_bytes begin
-        @ccall rust_lib.iceberg_writer_write_sync(
+        @ccall rust_lib.iceberg_writer_write(
             writer.ptr::Ptr{Cvoid},
             pointer(ipc_bytes)::Ptr{UInt8},
             length(ipc_bytes)::Csize_t,
@@ -842,7 +842,7 @@ function write_columns(writer::DataFileWriter, columns::Vector{ColumnDescriptor}
     isempty(columns) && throw(IcebergException("No columns provided"))
 
     ret = GC.@preserve columns arrays_to_preserve begin
-        @ccall rust_lib.iceberg_writer_write_columns_sync(
+        @ccall rust_lib.iceberg_writer_write_columns(
             writer.ptr::Ptr{Cvoid},
             pointer(columns)::Ptr{ColumnDescriptor},
             length(columns)::Csize_t,

From c1dc6921d1aa005f76dddd4db321104be952c688 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 14:57:34 +0200
Subject: [PATCH 24/25] .

---
 iceberg_rust_ffi/src/writer.rs | 91 ++++++++++++++--------------------
 1 file changed, 37 insertions(+), 54 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index 0763a1f..ce3351f 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -364,6 +364,30 @@ fn submit_batch(
     }
 }
 
+/// Shared core for write functions: validates column count, builds a `RecordBatch`, and
+/// submits it to the encode pool.
+unsafe fn write_gathered_inner<I>(
+    writer_ref: &IcebergDataFileWriter,
+    pool: &GlobalWorkerPool,
+    arrow_schema: ArrowSchemaRef,
+    num_columns: usize,
+    col_descs: I,
+) -> Result<(), anyhow::Error>
+where
+    I: IntoIterator<Item = GatheredColumnDescriptor>,
+    I::IntoIter: ExactSizeIterator,
+{
+    if num_columns != arrow_schema.fields().len() {
+        return Err(anyhow::anyhow!(
+            "Column count mismatch: got {} but schema has {}",
+            num_columns,
+            arrow_schema.fields().len()
+        ));
+    }
+    let batch = unsafe { build_record_batch(arrow_schema, col_descs) }?;
+    submit_batch(writer_ref, pool, batch)
+}
+
 /// Gather column data from Julia memory into Arrow arrays in the calling thread, then
 /// submit the RecordBatch to the global encode pool asynchronously.
 ///
@@ -382,9 +406,7 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
     if writer.is_null() || columns.is_null() || num_columns == 0 {
         return -1;
     }
-
     let writer_ref = unsafe { &*writer };
-
     let pool = match GLOBAL_ENCODE_POOL.get() {
         Some(p) => p,
         None => {
@@ -392,34 +414,17 @@ pub extern "C" fn iceberg_writer_write_gathered_columns(
             return -1;
         }
     };
-
     let arrow_schema = writer_ref.arrow_schema.clone();
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
-
-    if col_descs.len() != arrow_schema.fields().len() {
-        store_writer_error(
+    if let Err(e) = unsafe {
+        write_gathered_inner(
             writer_ref,
-            anyhow::anyhow!(
-                "Column count mismatch: got {} but schema has {}",
-                col_descs.len(),
-                arrow_schema.fields().len()
-            ),
-        );
-        return -1;
-    }
-
-    // Gather: read from Julia memory into Rust-owned Arrow arrays (blocking, in calling thread).
-    // After this point all Julia pointers have been consumed — safe to release.
-    let batch = match unsafe { build_record_batch(arrow_schema, col_descs.iter().copied()) } {
-        Ok(b) => b,
-        Err(e) => {
-            store_gather_error(&e);
-            store_writer_error(writer_ref, e);
-            return -1;
-        }
-    };
-
-    if let Err(e) = submit_batch(writer_ref, pool, batch) {
+            pool,
+            arrow_schema,
+            num_columns,
+            col_descs.iter().copied(),
+        )
+    } {
         store_gather_error(&e);
         store_writer_error(writer_ref, e);
         return -1;
@@ -442,9 +447,7 @@ pub extern "C" fn iceberg_writer_write_columns(
     if writer.is_null() || columns.is_null() || num_columns == 0 {
         return -1;
     }
-
     let writer_ref = unsafe { &*writer };
-
     let pool = match GLOBAL_ENCODE_POOL.get() {
         Some(p) => p,
         None => {
@@ -452,24 +455,12 @@ pub extern "C" fn iceberg_writer_write_columns(
             return -1;
         }
     };
-
     let arrow_schema = writer_ref.arrow_schema.clone();
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
-
-    if col_descs.len() != arrow_schema.fields().len() {
-        store_writer_error(
-            writer_ref,
-            anyhow::anyhow!(
-                "Column count mismatch: got {} but schema has {}",
-                col_descs.len(),
-                arrow_schema.fields().len()
-            ),
-        );
-        return -1;
-    }
-
     // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor.
     // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order.
+    // `slices` must outlive `gathered` since GatheredColumnDescriptor.slices is a raw
+    // pointer into this Vec's allocation.
     let slices: Vec<SliceRef> = col_descs
         .iter()
         .map(|d| SliceRef {
@@ -480,7 +471,6 @@ pub extern "C" fn iceberg_writer_write_columns(
             len: d.num_rows,
         })
         .collect();
-
     let gathered = col_descs
         .iter()
         .zip(slices.iter())
@@ -491,16 +481,9 @@ pub extern "C" fn iceberg_writer_write_columns(
             column_type: d.column_type,
             is_nullable: d.is_nullable,
         });
-
-    let batch = match unsafe { build_record_batch(arrow_schema, gathered) } {
-        Ok(b) => b,
-        Err(e) => {
-            store_writer_error(writer_ref, e);
-            return -1;
-        }
-    };
-
-    if let Err(e) = submit_batch(writer_ref, pool, batch) {
+    if let Err(e) =
+        unsafe { write_gathered_inner(writer_ref, pool, arrow_schema, num_columns, gathered) }
+    {
         store_writer_error(writer_ref, e);
         return -1;
     }

From 5d525a20fd9a5fd1b7955ebf5f73b309e378de99 Mon Sep 17 00:00:00 2001
From: Gerald Berger <gberger.work@gmail.com>
Date: Thu, 30 Apr 2026 15:00:22 +0200
Subject: [PATCH 25/25] .

---
 iceberg_rust_ffi/src/writer.rs | 75 +++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/iceberg_rust_ffi/src/writer.rs b/iceberg_rust_ffi/src/writer.rs
index ce3351f..19b7e51 100644
--- a/iceberg_rust_ffi/src/writer.rs
+++ b/iceberg_rust_ffi/src/writer.rs
@@ -364,8 +364,8 @@ fn submit_batch(
     }
 }
 
-/// Shared core for write functions: validates column count, builds a `RecordBatch`, and
-/// submits it to the encode pool.
+/// Validates column count, builds a `RecordBatch` from pre-built gathered descriptors,
+/// and submits it to the encode pool.
 unsafe fn write_gathered_inner<I>(
     writer_ref: &IcebergDataFileWriter,
     pool: &GlobalWorkerPool,
@@ -388,6 +388,49 @@ where
     submit_batch(writer_ref, pool, batch)
 }
 
+/// Validates column count, builds a `RecordBatch` from flat `ColumnDescriptor`s (each
+/// treated as a single sequential slice), and submits it to the encode pool.
+///
+/// Each `SliceRef` is constructed on the stack and used within the same loop iteration,
+/// so no heap allocation is needed for the descriptor conversion.
+unsafe fn write_columns_inner(
+    writer_ref: &IcebergDataFileWriter,
+    pool: &GlobalWorkerPool,
+    arrow_schema: ArrowSchemaRef,
+    col_descs: &[ColumnDescriptor],
+) -> Result<(), anyhow::Error> {
+    if col_descs.len() != arrow_schema.fields().len() {
+        return Err(anyhow::anyhow!(
+            "Column count mismatch: got {} but schema has {}",
+            col_descs.len(),
+            arrow_schema.fields().len()
+        ));
+    }
+    let mut arrays = Vec::with_capacity(col_descs.len());
+    for (i, d) in col_descs.iter().enumerate() {
+        // SliceRef lives on the stack for exactly this iteration; the raw pointer
+        // is consumed by build_arrow_array_gathered before the next iteration begins.
+        let slice = SliceRef {
+            data_ptr: d.data_ptr,
+            lengths_ptr: d.lengths_ptr,
+            validity_ptr: d.validity_ptr,
+            sel_ptr: std::ptr::null(),
+            len: d.num_rows,
+        };
+        let desc = GatheredColumnDescriptor {
+            slices: &slice as *const SliceRef,
+            num_slices: 1,
+            total_rows: d.num_rows,
+            column_type: d.column_type,
+            is_nullable: d.is_nullable,
+        };
+        arrays.push(unsafe { build_arrow_array_gathered(&desc, arrow_schema.field(i))? });
+    }
+    let batch = RecordBatch::try_new(arrow_schema, arrays)
+        .map_err(|e| anyhow::anyhow!("RecordBatch: {}", e))?;
+    submit_batch(writer_ref, pool, batch)
+}
+
 /// Gather column data from Julia memory into Arrow arrays in the calling thread, then
 /// submit the RecordBatch to the global encode pool asynchronously.
 ///
@@ -457,33 +500,7 @@ pub extern "C" fn iceberg_writer_write_columns(
     };
     let arrow_schema = writer_ref.arrow_schema.clone();
     let col_descs = unsafe { std::slice::from_raw_parts(columns, num_columns) };
-    // Wrap each ColumnDescriptor as a single-slice GatheredColumnDescriptor.
-    // Sequential access (sel_ptr = null) so data[0..num_rows] is read in order.
-    // `slices` must outlive `gathered` since GatheredColumnDescriptor.slices is a raw
-    // pointer into this Vec's allocation.
-    let slices: Vec<SliceRef> = col_descs
-        .iter()
-        .map(|d| SliceRef {
-            data_ptr: d.data_ptr,
-            lengths_ptr: d.lengths_ptr,
-            validity_ptr: d.validity_ptr,
-            sel_ptr: std::ptr::null(),
-            len: d.num_rows,
-        })
-        .collect();
-    let gathered = col_descs
-        .iter()
-        .zip(slices.iter())
-        .map(|(d, s)| GatheredColumnDescriptor {
-            slices: s as *const SliceRef,
-            num_slices: 1,
-            total_rows: d.num_rows,
-            column_type: d.column_type,
-            is_nullable: d.is_nullable,
-        });
-    if let Err(e) =
-        unsafe { write_gathered_inner(writer_ref, pool, arrow_schema, num_columns, gathered) }
-    {
+    if let Err(e) = unsafe { write_columns_inner(writer_ref, pool, arrow_schema, col_descs) } {
         store_writer_error(writer_ref, e);
         return -1;
     }