diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 10ec656c6d6..db5655d15af 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -174,13 +174,13 @@ jobs: - uses: Swatinem/rust-cache@v2 - name: Build tests run: | - cargo test --profile ci --locked --features fp16kernels,cli,tensorflow,dynamodb,substrait --no-run + cargo test --profile ci --locked --features fp16kernels,cli,dynamodb,substrait --no-run - name: Run tests run: | - cargo test --profile ci --features fp16kernels,cli,tensorflow,dynamodb,substrait + cargo test --profile ci --features fp16kernels,cli,dynamodb,substrait - name: Check benchmarks run: | - cargo check --profile ci --benches --features fp16kernels,cli,tensorflow,dynamodb,substrait + cargo check --profile ci --benches --features fp16kernels,cli,dynamodb,substrait windows-build: runs-on: windows-latest defaults: diff --git a/Cargo.lock b/Cargo.lock index 08d1ca77007..f22f9d62115 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -418,17 +418,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "async-channel" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" -dependencies = [ - "concurrent-queue", - "event-listener 2.5.3", - "futures-core", -] - [[package]] name = "async-channel" version = "2.5.0" @@ -454,60 +443,13 @@ dependencies = [ "tokio", ] -[[package]] -name = "async-executor" -version = "1.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497c00e0fd83a72a79a39fcbd8e3e2f055d6f6c7e025f3b3d91f4f8e76527fb8" -dependencies = [ - "async-task", - "concurrent-queue", - "fastrand", - "futures-lite", - "pin-project-lite", - "slab", -] - -[[package]] -name = "async-global-executor" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c" -dependencies = [ - "async-channel 2.5.0", - "async-executor", - "async-io", - "async-lock", - "blocking", - "futures-lite", - "once_cell", -] - -[[package]] -name = "async-io" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" -dependencies = [ - "autocfg", - "cfg-if", - "concurrent-queue", - "futures-io", - "futures-lite", - "parking", - "polling", - "rustix 1.1.2", - "slab", - "windows-sys 0.61.2", -] - [[package]] name = "async-lock" version = "3.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" dependencies = [ - "event-listener 5.4.1", + "event-listener", "event-listener-strategy", "pin-project-lite", ] @@ -523,38 +465,6 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "async-std" -version = "1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8e079a4ab67ae52b7403632e4618815d6db36d2a010cfe41b02c1b1578f93b" -dependencies = [ - "async-channel 1.9.0", - "async-global-executor", - "async-io", - "async-lock", - "crossbeam-utils", - "futures-channel", - "futures-core", - "futures-io", - "futures-lite", - "gloo-timers", - "kv-log-macro", - "log", - "memchr", - "once_cell", - "pin-project-lite", - "pin-utils", - "slab", - "wasm-bindgen-futures", -] - -[[package]] -name = "async-task" -version = "4.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" - [[package]] name = "async-trait" version = "0.1.89" @@ -1311,19 +1221,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "blocking" -version = "1.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" -dependencies = [ - "async-channel 2.5.0", - "async-task", - "futures-io", - "futures-lite", - "piper", -] - [[package]] name = "bon" version = "3.8.1" @@ -2690,7 +2587,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost 0.13.5", + "prost", "substrait 0.58.0", "tokio", "url", @@ -3141,12 +3038,6 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" -[[package]] -name = "event-listener" -version = "2.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" - [[package]] name = "event-listener" version = "5.4.1" @@ -3164,7 +3055,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener 5.4.1", + "event-listener", "pin-project-lite", ] @@ -3226,12 +3117,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fixedbitset" version = "0.5.7" @@ -3398,19 +3283,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" -[[package]] -name = "futures-lite" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" -dependencies = [ - "fastrand", - "futures-core", - "futures-io", - "parking", - "pin-project-lite", -] - [[package]] name = "futures-macro" version = "0.3.31" @@ -3686,17 +3558,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "hostname" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" -dependencies = [ - "libc", - "match_cfg", - "winapi", -] - [[package]] name = "htmlescape" version = "0.3.1" @@ -4135,12 +3996,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "integer-encoding" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c00403deb17c3221a1fe4fb571b9ed0370b3dcd116553c77fa294a3d918699" - [[package]] name = "ipnet" version = "2.11.0" @@ -4353,15 +4208,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "kv-log-macro" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" -dependencies = [ - "log", -] - [[package]] name = "lance" version = "1.0.0-beta.3" @@ -4426,9 +4272,8 @@ dependencies = [ "pin-project", "pprof", "pretty_assertions", - "prost 0.12.6", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "rand 0.9.2", "roaring", "rstest", @@ -4439,7 +4284,6 @@ dependencies = [ "tantivy", "tempfile", "test-log", - "tfrecord", "tokio", "tokio-stream", "tracing", @@ -4502,7 +4346,7 @@ dependencies = [ "object_store", "pin-project", "proptest", - "prost 0.13.5", + "prost", "rand 0.9.2", "roaring", "serde_json", @@ -4539,7 +4383,7 @@ dependencies = [ "lance-datagen", "log", "pin-project", - "prost 0.13.5", + "prost", "snafu", "substrait-expr", "tokio", @@ -4596,9 +4440,9 @@ dependencies = [ "num-traits", "pprof", "proptest", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "rand 0.9.2", "rand_xoshiro", @@ -4668,9 +4512,9 @@ dependencies = [ "pprof", "pretty_assertions", "proptest", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "rand 0.9.2", "rstest", @@ -4691,7 +4535,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel 2.5.0", + "async-channel", "async-recursion", "async-trait", "bitpacking", @@ -4732,9 +4576,9 @@ dependencies = [ "num-traits", "object_store", "pprof", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "rand 0.9.2", "rand_distr 0.5.1", @@ -4787,7 +4631,7 @@ dependencies = [ "path_abs", "pin-project", "pprof", - "prost 0.13.5", + "prost", "rand 0.9.2", "rstest", "serde", @@ -4907,9 +4751,9 @@ dependencies = [ "pprof", "pretty_assertions", "proptest", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "protobuf-src", "rand 0.9.2", "rangemap", @@ -5284,9 +5128,6 @@ name = "log" version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" -dependencies = [ - "value-bag", -] [[package]] name = "loom" @@ -5371,12 +5212,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" -[[package]] -name = "match_cfg" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" - [[package]] name = "matchers" version = "0.2.0" @@ -5531,7 +5366,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener 5.4.1", + "event-listener", "futures-util", "parking_lot", "portable-atomic", @@ -5618,15 +5453,6 @@ dependencies = [ "libc", ] -[[package]] -name = "noisy_float" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af" -dependencies = [ - "num-traits", -] - [[package]] name = "nom" version = "7.1.3" @@ -6120,8 +5946,8 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck", "itertools 0.13.0", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", ] [[package]] @@ -6134,8 +5960,8 @@ dependencies = [ "chrono", "pbjson", "pbjson-build", - "prost 0.13.5", - "prost-build 0.13.5", + "prost", + "prost-build", "serde", ] @@ -6180,23 +6006,13 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset 0.4.2", - "indexmap", -] - [[package]] name = "petgraph" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "indexmap", ] @@ -6206,7 +6022,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "hashbrown 0.15.5", "indexmap", "serde", @@ -6301,17 +6117,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piper" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" -dependencies = [ - "atomic-waker", - "fastrand", - "futures-io", -] - [[package]] name = "pkcs1" version = "0.7.5" @@ -6394,20 +6199,6 @@ dependencies = [ "plotters-backend", ] -[[package]] -name = "polling" -version = "3.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" -dependencies = [ - "cfg-if", - "concurrent-queue", - "hermit-abi", - "pin-project-lite", - "rustix 1.1.2", - "windows-sys 0.61.2", -] - [[package]] name = "portable-atomic" version = "1.11.1" @@ -6577,16 +6368,6 @@ dependencies = [ "unarray", ] -[[package]] -name = "prost" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" -dependencies = [ - "bytes", - "prost-derive 0.12.6", -] - [[package]] name = "prost" version = "0.13.5" @@ -6594,28 +6375,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive 0.13.5", -] - -[[package]] -name = "prost-build" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" -dependencies = [ - "bytes", - "heck", - "itertools 0.12.1", - "log", - "multimap", - "once_cell", - "petgraph 0.6.5", - "prettyplease", - "prost 0.12.6", - "prost-types 0.12.6", - "regex", - "syn 2.0.110", - "tempfile", + "prost-derive", ] [[package]] @@ -6631,26 +6391,13 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "regex", "syn 2.0.110", "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" -dependencies = [ - "anyhow", - "itertools 0.12.1", - "proc-macro2", - "quote", - "syn 2.0.110", -] - [[package]] name = "prost-derive" version = "0.13.5" @@ -6664,22 +6411,13 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "prost-types" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" -dependencies = [ - "prost 0.12.6", -] - [[package]] name = "prost-types" version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ - "prost 0.13.5", + "prost", ] [[package]] @@ -8015,9 +7753,9 @@ checksum = "b1772d041c37cc7e6477733c76b2acf4ee36bd52b2ae4d9ea0ec9c87d003db32" dependencies = [ "heck", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "regress", "schemars", "semver", @@ -8040,9 +7778,9 @@ dependencies = [ "pbjson-build", "pbjson-types", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "regress", "schemars", "semver", @@ -8061,7 +7799,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d091cf06bc7808bd81eb01f5f5b77b2b14288bb022501a2dcad78633c65262f" dependencies = [ "once_cell", - "prost 0.13.5", + "prost", "substrait 0.50.4", "substrait-expr-funcgen", "substrait-expr-macros", @@ -8397,35 +8135,6 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "tfrecord" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7036e822a1d906b8a49620e524a6fe21ab956583ac77f1427e908c61499a1f78" -dependencies = [ - "anyhow", - "async-std", - "bytemuck", - "crc", - "flate2", - "futures", - "glob", - "hex", - "hostname", - "integer-encoding 4.1.0", - "itertools 0.11.0", - "noisy_float", - "num", - "num-traits", - "once_cell", - "pin-project", - "prost 0.12.6", - "prost-build 0.12.6", - "tar", - "thiserror 1.0.69", - "ureq", -] - [[package]] name = "thiserror" version = "1.0.69" @@ -8491,7 +8200,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", - "integer-encoding 3.0.4", + "integer-encoding", "ordered-float 2.10.1", ] @@ -9128,12 +8837,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" -[[package]] -name = "value-bag" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943ce29a8a743eb10d6082545d861b24f9d1b160b7d741e0f2cdf726bec909c5" - [[package]] name = "vcpkg" version = "0.2.15" diff --git a/deny.toml b/deny.toml index 4c67b0767c9..e192267b97b 100644 --- a/deny.toml +++ b/deny.toml @@ -83,7 +83,6 @@ ignore = [ { id = "RUSTSEC-2021-0153", reason = "`encoding` is used by lindera" }, { id = "RUSTSEC-2024-0370", reason = "`proc-macro-error` is used by jieba-rs via include-flate" }, { id = "RUSTSEC-2024-0436", reason = "`paste` is used by datafusion" }, - { id = "RUSTSEC-2025-0052", reason = "`async-std` is used by tfrecord" }, { id = "RUSTSEC-2023-0071", reason = "`rsa` is used by opendal via reqsign" }, { id = "RUSTSEC-2025-0119", reason = "`number_prefix` used by hf-hub in examples" } ] diff --git a/python/Cargo.lock b/python/Cargo.lock index 27f3208cded..0e4f6e896d8 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -451,17 +451,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "async-channel" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" -dependencies = [ - "concurrent-queue", - "event-listener 2.5.3", - "futures-core", -] - [[package]] name = "async-channel" version = "2.5.0" @@ -491,20 +480,6 @@ dependencies = [ "zstd-safe", ] -[[package]] -name = "async-executor" -version = "1.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497c00e0fd83a72a79a39fcbd8e3e2f055d6f6c7e025f3b3d91f4f8e76527fb8" -dependencies = [ - "async-task", - "concurrent-queue", - "fastrand", - "futures-lite", - "pin-project-lite", - "slab", -] - [[package]] name = "async-ffi" version = "0.5.0" @@ -514,46 +489,13 @@ dependencies = [ "abi_stable", ] -[[package]] -name = "async-global-executor" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c" -dependencies = [ - "async-channel 2.5.0", - "async-executor", - "async-io", - "async-lock", - "blocking", - "futures-lite", - "once_cell", -] - -[[package]] -name = "async-io" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" -dependencies = [ - "autocfg", - "cfg-if", - "concurrent-queue", - "futures-io", - "futures-lite", - "parking", - "polling", - "rustix 1.1.2", - "slab", - "windows-sys 0.61.2", -] - [[package]] name = "async-lock" version = "3.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" dependencies = [ - "event-listener 5.4.1", + "event-listener", "event-listener-strategy", "pin-project-lite", ] @@ -569,38 +511,6 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "async-std" -version = "1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8e079a4ab67ae52b7403632e4618815d6db36d2a010cfe41b02c1b1578f93b" -dependencies = [ - "async-channel 1.9.0", - "async-global-executor", - "async-io", - "async-lock", - "crossbeam-utils", - "futures-channel", - "futures-core", - "futures-io", - "futures-lite", - "gloo-timers", - "kv-log-macro", - "log", - "memchr", - "once_cell", - "pin-project-lite", - "pin-utils", - "slab", - "wasm-bindgen-futures", -] - -[[package]] -name = "async-task" -version = "4.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" - [[package]] name = "async-trait" version = "0.1.89" @@ -1242,19 +1152,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "blocking" -version = "1.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" -dependencies = [ - "async-channel 2.5.0", - "async-task", - "futures-io", - "futures-lite", - "piper", -] - [[package]] name = "bon" version = "3.8.1" @@ -1600,21 +1497,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = "3.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" -dependencies = [ - "crc-catalog", -] - -[[package]] -name = "crc-catalog" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" - [[package]] name = "crc32c" version = "0.6.8" @@ -2141,7 +2023,7 @@ dependencies = [ "datafusion-proto-common", "futures", "log", - "prost 0.13.5", + "prost", "semver", "tokio", ] @@ -2422,7 +2304,7 @@ dependencies = [ "datafusion-expr", "datafusion-proto-common", "object_store", - "prost 0.13.5", + "prost", ] [[package]] @@ -2433,7 +2315,7 @@ checksum = "b4b14f288ca4ef77743d9672cafecf3adfffff0b9b04af9af79ecbeaaf736901" dependencies = [ "arrow", "datafusion-common", - "prost 0.13.5", + "prost", ] [[package]] @@ -2508,7 +2390,7 @@ dependencies = [ "itertools 0.14.0", "object_store", "pbjson-types", - "prost 0.13.5", + "prost", "substrait", "tokio", "url", @@ -2791,12 +2673,6 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" -[[package]] -name = "event-listener" -version = "2.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" - [[package]] name = "event-listener" version = "5.4.1" @@ -2814,7 +2690,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener 5.4.1", + "event-listener", "pin-project-lite", ] @@ -2854,12 +2730,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a3076410a55c90011c298b04d0cfa770b00fa04e1e3c97d3f6c9de105a03844" -[[package]] -name = "fixedbitset" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" - [[package]] name = "fixedbitset" version = "0.5.7" @@ -3001,19 +2871,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" -[[package]] -name = "futures-lite" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" -dependencies = [ - "fastrand", - "futures-core", - "futures-io", - "parking", - "pin-project-lite", -] - [[package]] name = "futures-macro" version = "0.3.31" @@ -3251,17 +3108,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "hostname" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867" -dependencies = [ - "libc", - "match_cfg", - "winapi", -] - [[package]] name = "htmlescape" version = "0.3.1" @@ -3420,7 +3266,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.4", "tower-service", - "webpki-roots 1.0.4", + "webpki-roots", ] [[package]] @@ -3660,12 +3506,6 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "integer-encoding" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c00403deb17c3221a1fe4fb571b9ed0370b3dcd116553c77fa294a3d918699" - [[package]] name = "ipnet" version = "2.11.0" @@ -3688,24 +3528,6 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.13.0" @@ -3858,15 +3680,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "kv-log-macro" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" -dependencies = [ - "log", -] - [[package]] name = "lance" version = "1.0.0-beta.3" @@ -3915,9 +3728,8 @@ dependencies = [ "object_store", "permutation", "pin-project", - "prost 0.12.6", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "rand 0.9.2", "roaring", "semver", @@ -3925,7 +3737,6 @@ dependencies = [ "serde_json", "snafu", "tantivy", - "tfrecord", "tokio", "tokio-stream", "tracing", @@ -3983,7 +3794,7 @@ dependencies = [ "num_cpus", "object_store", "pin-project", - "prost 0.13.5", + "prost", "rand 0.9.2", "roaring", "serde_json", @@ -4020,7 +3831,7 @@ dependencies = [ "lance-datagen", "log", "pin-project", - "prost 0.13.5", + "prost", "snafu", "tokio", "tracing", @@ -4068,9 +3879,9 @@ dependencies = [ "log", "lz4", "num-traits", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "rand 0.9.2", "snafu", "strum 0.26.3", @@ -4104,9 +3915,9 @@ dependencies = [ "log", "num-traits", "object_store", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "snafu", "tokio", "tracing", @@ -4122,7 +3933,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel 2.5.0", + "async-channel", "async-recursion", "async-trait", "bitpacking", @@ -4158,9 +3969,9 @@ dependencies = [ "ndarray", "num-traits", "object_store", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "rand 0.9.2", "rand_distr 0.5.1", "rayon", @@ -4206,7 +4017,7 @@ dependencies = [ "opendal", "path_abs", "pin-project", - "prost 0.13.5", + "prost", "rand 0.9.2", "serde", "shellexpand", @@ -4309,9 +4120,9 @@ dependencies = [ "lance-io", "log", "object_store", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "rand 0.9.2", "rangemap", "roaring", @@ -4546,7 +4357,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.17", + "thiserror", "tokio", "yada", ] @@ -4646,9 +4457,6 @@ name = "log" version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" -dependencies = [ - "value-bag", -] [[package]] name = "loom" @@ -4717,12 +4525,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "match_cfg" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4" - [[package]] name = "matchers" version = "0.2.0" @@ -4860,7 +4662,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener 5.4.1", + "event-listener", "futures-util", "parking_lot", "portable-atomic", @@ -4897,15 +4699,6 @@ dependencies = [ "rawpointer", ] -[[package]] -name = "noisy_float" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978fe6e6ebc0bf53de533cd456ca2d9de13de13856eda1518a285d7705a213af" -dependencies = [ - "num-traits", -] - [[package]] name = "nom" version = "7.1.3" @@ -5078,7 +4871,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror", "tokio", "tracing", "url", @@ -5306,8 +5099,8 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck", "itertools 0.13.0", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", ] [[package]] @@ -5320,8 +5113,8 @@ dependencies = [ "chrono", "pbjson", "pbjson-build", - "prost 0.13.5", - "prost-build 0.13.5", + "prost", + "prost-build", "serde", ] @@ -5366,23 +5159,13 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" -[[package]] -name = "petgraph" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" -dependencies = [ - "fixedbitset 0.4.2", - "indexmap", -] - [[package]] name = "petgraph" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "indexmap", ] @@ -5392,7 +5175,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ - "fixedbitset 0.5.7", + "fixedbitset", "hashbrown 0.15.5", "indexmap", "serde", @@ -5487,17 +5270,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piper" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" -dependencies = [ - "atomic-waker", - "fastrand", - "futures-io", -] - [[package]] name = "pkcs1" version = "0.7.5" @@ -5542,20 +5314,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "polling" -version = "3.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" -dependencies = [ - "cfg-if", - "concurrent-queue", - "hermit-abi", - "pin-project-lite", - "rustix 1.1.2", - "windows-sys 0.61.2", -] - [[package]] name = "portable-atomic" version = "1.11.1" @@ -5638,16 +5396,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "prost" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" -dependencies = [ - "bytes", - "prost-derive 0.12.6", -] - [[package]] name = "prost" version = "0.13.5" @@ -5655,28 +5403,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" dependencies = [ "bytes", - "prost-derive 0.13.5", -] - -[[package]] -name = "prost-build" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" -dependencies = [ - "bytes", - "heck", - "itertools 0.12.1", - "log", - "multimap", - "once_cell", - "petgraph 0.6.5", - "prettyplease", - "prost 0.12.6", - "prost-types 0.12.6", - "regex", - "syn 2.0.110", - "tempfile", + "prost-derive", ] [[package]] @@ -5692,26 +5419,13 @@ dependencies = [ "once_cell", "petgraph 0.7.1", "prettyplease", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "regex", "syn 2.0.110", "tempfile", ] -[[package]] -name = "prost-derive" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" -dependencies = [ - "anyhow", - "itertools 0.12.1", - "proc-macro2", - "quote", - "syn 2.0.110", -] - [[package]] name = "prost-derive" version = "0.13.5" @@ -5725,22 +5439,13 @@ dependencies = [ "syn 2.0.110", ] -[[package]] -name = "prost-types" -version = "0.12.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" -dependencies = [ - "prost 0.12.6", -] - [[package]] name = "prost-types" version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ - "prost 0.13.5", + "prost", ] [[package]] @@ -5786,8 +5491,8 @@ dependencies = [ "libc", "log", "object_store", - "prost 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-types", "pyo3", "pythonize", "regex", @@ -5911,7 +5616,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.35", "socket2 0.6.1", - "thiserror 2.0.17", + "thiserror", "tokio", "tracing", "web-time", @@ -5932,7 +5637,7 @@ dependencies = [ "rustls 0.23.35", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror", "tinyvec", "tracing", "web-time", @@ -6143,7 +5848,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.16", "libredox", - "thiserror 2.0.17", + "thiserror", ] [[package]] @@ -6276,7 +5981,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.4", + "webpki-roots", ] [[package]] @@ -6410,7 +6115,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" dependencies = [ "aws-lc-rs", - "log", "once_cell", "ring", "rustls-pki-types", @@ -6839,7 +6543,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror", "time", ] @@ -7046,9 +6750,9 @@ dependencies = [ "pbjson-build", "pbjson-types", "prettyplease", - "prost 0.13.5", - "prost-build 0.13.5", - "prost-types 0.13.5", + "prost", + "prost-build", + "prost-types", "regress", "schemars", "semver", @@ -7160,7 +6864,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror", "time", "uuid", "winapi", @@ -7296,62 +7000,13 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "tfrecord" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7036e822a1d906b8a49620e524a6fe21ab956583ac77f1427e908c61499a1f78" -dependencies = [ - "anyhow", - "async-std", - "bytemuck", - "crc", - "flate2", - "futures", - "glob", - "hex", - "hostname", - "integer-encoding 4.1.0", - "itertools 0.11.0", - "noisy_float", - "num", - "num-traits", - "once_cell", - "pin-project", - "prost 0.12.6", - "prost-build 0.12.6", - "tar", - "thiserror 1.0.69", - "ureq", -] - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - [[package]] name = "thiserror" version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.17", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.110", + "thiserror-impl", ] [[package]] @@ -7390,7 +7045,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", - "integer-encoding 3.0.4", + "integer-encoding", "ordered-float 2.10.1", ] @@ -7741,7 +7396,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.110", - "thiserror 2.0.17", + "thiserror", "unicode-ident", ] @@ -7825,22 +7480,6 @@ version = "0.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "flate2", - "log", - "once_cell", - "rustls 0.23.35", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.7" @@ -7895,12 +7534,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" -[[package]] -name = "value-bag" -version = "1.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943ce29a8a743eb10d6082545d861b24f9d1b160b7d741e0f2cdf726bec909c5" - [[package]] name = "version_check" version = "0.9.5" @@ -8044,15 +7677,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.26.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" -dependencies = [ - "webpki-roots 1.0.4", -] - [[package]] name = "webpki-roots" version = "1.0.4" diff --git a/python/Cargo.toml b/python/Cargo.toml index 80ae1508d82..e21d5d8a2e4 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -29,7 +29,6 @@ half = { version = "2.5", default-features = false, features = [ "std", ] } lance = { path = "../rust/lance", features = [ - "tensorflow", "dynamodb", "substrait", ] } diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 2389a4e2e43..3e3aef279c2 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -82,13 +82,6 @@ from .trace import capture_trace_events as capture_trace_events from .trace import shutdown_tracing as shutdown_tracing from .trace import trace_to_chrome as trace_to_chrome -def infer_tfrecord_schema( - uri: str, - tensor_features: Optional[List[str]] = None, - string_features: Optional[List[str]] = None, -) -> pa.Schema: ... -def read_tfrecord(uri: str, schema: pa.Schema) -> pa.RecordBatchReader: ... - class CleanupStats: bytes_removed: int old_versions: int diff --git a/python/python/lance/tf/tfrecord.py b/python/python/lance/tf/tfrecord.py deleted file mode 100644 index ef9d1235e4b..00000000000 --- a/python/python/lance/tf/tfrecord.py +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -from lance.lance import infer_tfrecord_schema as infer_tfrecord_schema -from lance.lance import read_tfrecord as read_tfrecord diff --git a/python/python/tests/test_tf.py b/python/python/tests/test_tf.py index 878a8b29425..87a01aa5e25 100644 --- a/python/python/tests/test_tf.py +++ b/python/python/tests/test_tf.py @@ -5,12 +5,11 @@ import warnings import lance -import ml_dtypes import numpy as np import pandas as pd import pyarrow as pa import pytest -from lance.arrow import BFloat16Type, ImageArray, bfloat16_array +from lance.arrow import ImageArray from lance.fragment import LanceFragment pytest.skip("Skip tensorflow tests", allow_module_level=True) @@ -32,7 +31,6 @@ lance_fragments, lance_take_batches, ) -from lance.tf.tfrecord import infer_tfrecord_schema, read_tfrecord # noqa: E402 @pytest.fixture @@ -313,224 +311,3 @@ def test_image_types(tmp_path): assert batch["tensor_images"].shape == (3, 1, 1, 4) assert batch["tensor_images"].dtype == tf.uint8 assert batch["tensor_images"].numpy().tolist() == tensors.to_numpy().tolist() - - -@pytest.fixture -def sample_tf_example(): - # Create a TFRecord with a string, float, int, and a tensor - tensor = tf.constant(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)) - tensor_bf16 = tf.constant( - np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=ml_dtypes.bfloat16) - ) - - feature = { - "1_int": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])), - "2_int_list": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3])), - "3_float": tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])), - "4_float_list": tf.train.Feature( - float_list=tf.train.FloatList(value=[1.0, 2.0, 3.0]) - ), - "5_bytes": tf.train.Feature( - bytes_list=tf.train.BytesList(value=[b"Hello, TensorFlow!"]) - ), - "6_bytes_list": tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[b"Hello, TensorFlow!", b"Hello, Lance!"] - ) - ), - "7_string": tf.train.Feature( - bytes_list=tf.train.BytesList(value=[b"Hello, TensorFlow!"]) - ), - "8_tensor": tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tf.io.serialize_tensor(tensor).numpy()] - ) - ), - "9_tensor_bf16": tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tf.io.serialize_tensor(tensor_bf16).numpy()] - ) - ), - } - - return tf.train.Example(features=tf.train.Features(feature=feature)) - - -def test_tfrecord_parsing(tmp_path, sample_tf_example): - serialized = sample_tf_example.SerializeToString() - - path = tmp_path / "test.tfrecord" - with tf.io.TFRecordWriter(str(path)) as writer: - writer.write(serialized) - - inferred_schema = infer_tfrecord_schema(str(path)) - - assert inferred_schema == pa.schema( - { - "1_int": pa.int64(), - "2_int_list": pa.list_(pa.int64()), - "3_float": pa.float32(), - "4_float_list": pa.list_(pa.float32()), - "5_bytes": pa.binary(), - "6_bytes_list": pa.list_(pa.binary()), - # tensors and strings assumed binary - "7_string": pa.binary(), - "8_tensor": pa.binary(), - "9_tensor_bf16": pa.binary(), - } - ) - - inferred_schema = infer_tfrecord_schema( - str(path), - tensor_features=["8_tensor", "9_tensor_bf16"], - string_features=["7_string"], - ) - assert inferred_schema == pa.schema( - { - "1_int": pa.int64(), - "2_int_list": pa.list_(pa.int64()), - "3_float": pa.float32(), - "4_float_list": pa.list_(pa.float32()), - "5_bytes": pa.binary(), - "6_bytes_list": pa.list_(pa.binary()), - "7_string": pa.string(), - "8_tensor": pa.fixed_shape_tensor(pa.float32(), [2, 3]), - "9_tensor_bf16": pa.fixed_shape_tensor(BFloat16Type(), [2, 3]), - } - ) - - reader = read_tfrecord(str(path), inferred_schema) - assert reader.schema == inferred_schema - table = reader.read_all() - - assert table.schema == inferred_schema - - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - inner = pa.array([float(x) for x in range(1, 7)], pa.float32()) - storage = pa.FixedSizeListArray.from_arrays(inner, 6) - f32_array = pa.ExtensionArray.from_storage(tensor_type, storage) - - tensor_type = pa.fixed_shape_tensor(BFloat16Type(), [2, 3]) - bf16_array = bfloat16_array([float(x) for x in range(1, 7)]) - storage = pa.FixedSizeListArray.from_arrays(bf16_array, 6) - bf16_array = pa.ExtensionArray.from_storage(tensor_type, storage) - - expected_data = pa.table( - { - "1_int": pa.array([1]), - "2_int_list": pa.array([[1, 2, 3]]), - "3_float": pa.array([1.0], pa.float32()), - "4_float_list": pa.array([[1.0, 2.0, 3.0]], pa.list_(pa.float32())), - "5_bytes": pa.array([b"Hello, TensorFlow!"]), - "6_bytes_list": pa.array([[b"Hello, TensorFlow!", b"Hello, Lance!"]]), - "7_string": pa.array(["Hello, TensorFlow!"]), - "8_tensor": f32_array, - "9_tensor_bf16": bf16_array, - } - ) - - assert table == expected_data - - -def test_tfrecord_roundtrip(tmp_path, sample_tf_example): - serialized = sample_tf_example.SerializeToString() - - path = tmp_path / "test.tfrecord" - with tf.io.TFRecordWriter(str(path)) as writer: - writer.write(serialized) - - schema = infer_tfrecord_schema( - str(path), - tensor_features=["8_tensor", "9_tensor_bf16"], - string_features=["7_string"], - ) - - table = read_tfrecord(str(path), schema).read_all() - - # Can roundtrip to lance - dataset_uri = tmp_path / "dataset" - dataset = lance.write_dataset(table, dataset_uri) - assert dataset.schema == table.schema - assert dataset.to_table() == table - - # TODO: validate we can roundtrip with from_lance() - # tf_ds = from_lance(dataset, batch_size=1) - - -def test_tfrecord_parsing_nulls(tmp_path): - # Make sure we don't trip up on missing values - tensor = tf.constant(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)) - - features = [ - { - "a": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])), - "b": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])), - "c": tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])), - "d": tf.train.Feature( - bytes_list=tf.train.BytesList( - value=[tf.io.serialize_tensor(tensor).numpy()] - ) - ), - }, - { - "a": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])), - }, - { - "a": tf.train.Feature(int64_list=tf.train.Int64List(value=[1])), - "b": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3])), - "c": tf.train.Feature(float_list=tf.train.FloatList(value=[1.0])), - }, - ] - - path = tmp_path / "test.tfrecord" - with tf.io.TFRecordWriter(str(path)) as writer: - for feature in features: - example_proto = tf.train.Example( - features=tf.train.Features(feature=feature) - ) - serialized = example_proto.SerializeToString() - writer.write(serialized) - - inferred_schema = infer_tfrecord_schema(str(path), tensor_features=["d"]) - assert inferred_schema == pa.schema( - { - "a": pa.int64(), - "b": pa.list_(pa.int64()), - "c": pa.float32(), - "d": pa.fixed_shape_tensor(pa.float32(), [2, 3]), - } - ) - - tensor_type = pa.fixed_shape_tensor(pa.float32(), [2, 3]) - inner = pa.array([float(x) for x in range(1, 7)] + [None] * 12, pa.float32()) - storage = pa.FixedSizeListArray.from_arrays(inner, 6) - f32_array = pa.ExtensionArray.from_storage(tensor_type, storage) - - data = read_tfrecord(str(path), inferred_schema).read_all() - expected = pa.table( - { - "a": pa.array([1, 1, 1]), - "b": pa.array([[1], [], [1, 2, 3]]), - "c": pa.array([1.0, None, 1.0], pa.float32()), - "d": f32_array, - } - ) - - assert data == expected - - # can do projection - read_schema = pa.schema( - { - "a": pa.int64(), - "c": pa.float32(), - } - ) - expected = pa.table( - { - "a": pa.array([1, 1, 1]), - "c": pa.array([1.0, None, 1.0], pa.float32()), - } - ) - - data = read_tfrecord(str(path), read_schema).read_all() - assert data == expected diff --git a/python/src/lib.rs b/python/src/lib.rs index 0bb96f4d0ae..8961e9c5a24 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -30,15 +30,10 @@ use std::sync::Arc; use std::ffi::CString; -use ::arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; use ::arrow::pyarrow::PyArrowType; use ::arrow_schema::Schema as ArrowSchema; use ::lance::arrow::json::ArrowJsonExt; use ::lance::datafusion::LanceTableProvider; - -use arrow_array::{RecordBatch, RecordBatchIterator}; -use arrow_schema::ArrowError; -use datafusion::error::Result; use datafusion_ffi::table_provider::FFI_TableProvider; #[cfg(feature = "datagen")] use datagen::register_datagen; @@ -53,10 +48,9 @@ use file::{ stable_version, LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, LanceFileStatistics, LanceFileWriter, LancePageMetadata, }; -use futures::StreamExt; use lance_index::DatasetIndexExt; use log::Level; -use pyo3::exceptions::{PyIOError, PyValueError}; +use pyo3::exceptions::PyIOError; use pyo3::prelude::*; use pyo3::types::{PyAny, PyAnyMethods, PyCapsule}; use scanner::ScanStatistics; @@ -283,8 +277,6 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(write_fragments_transaction))?; m.add_wrapped(wrap_pyfunction!(schema_to_json))?; m.add_wrapped(wrap_pyfunction!(json_to_schema))?; - m.add_wrapped(wrap_pyfunction!(infer_tfrecord_schema))?; - m.add_wrapped(wrap_pyfunction!(read_tfrecord))?; m.add_wrapped(wrap_pyfunction!(trace_to_chrome))?; m.add_wrapped(wrap_pyfunction!(capture_trace_events))?; m.add_wrapped(wrap_pyfunction!(shutdown_tracing))?; @@ -348,127 +340,6 @@ pub fn language_model_home() -> PyResult { Ok(String::from(pstr)) } -/// Infer schema from tfrecord file -/// -/// Parameters -/// ---------- -/// uri: str -/// URI of the tfrecord file -/// tensor_features: Optional[List[str]] -/// Names of features that should be treated as tensors. Currently only -/// fixed-shape tensors are supported. -/// string_features: Optional[List[str]] -/// Names of features that should be treated as strings. Otherwise they -/// will be treated as binary. -/// batch_size: Optional[int], default None -/// Number of records to read to infer the schema. If None, will read the -/// entire file. -/// -/// Returns -/// ------- -/// pyarrow.Schema -/// An Arrow schema inferred from the tfrecord file. The schema is -/// alphabetically sorted by field names, since TFRecord doesn't have -/// a concept of field order. -#[pyfunction] -#[pyo3(signature = (uri, *, tensor_features = None, string_features = None, num_rows = None))] -fn infer_tfrecord_schema( - uri: &str, - tensor_features: Option>, - string_features: Option>, - num_rows: Option, -) -> PyResult> { - let tensor_features = tensor_features.unwrap_or_default(); - let tensor_features = tensor_features - .iter() - .map(|s| s.as_str()) - .collect::>(); - let string_features = string_features.unwrap_or_default(); - let string_features = string_features - .iter() - .map(|s| s.as_str()) - .collect::>(); - let schema = rt() - .runtime - .block_on(::lance::utils::tfrecord::infer_tfrecord_schema( - uri, - &tensor_features, - &string_features, - num_rows, - )) - .map_err(|err| PyIOError::new_err(err.to_string()))?; - Ok(PyArrowType(schema)) -} - -/// Read tfrecord file as an Arrow stream -/// -/// Parameters -/// ---------- -/// uri: str -/// URI of the tfrecord file -/// schema: pyarrow.Schema -/// Arrow schema of the tfrecord file. Use :py:func:`infer_tfrecord_schema` -/// to infer the schema. The schema is allowed to be a subset of fields; the -/// reader will only parse the fields that are present in the schema. -/// batch_size: int, default 10k -/// Number of records to read per batch. -/// -/// Returns -/// ------- -/// pyarrow.RecordBatchReader -/// An Arrow reader, which can be passed directly to -/// :py:func:`lance.write_dataset`. The output schema will match the schema -/// provided, including field order. -#[pyfunction] -#[pyo3(signature = (uri, schema, *, batch_size = 10_000))] -fn read_tfrecord( - uri: String, - schema: PyArrowType, - batch_size: usize, -) -> PyResult> { - let schema = Arc::new(schema.0); - - let (init_sender, init_receiver) = std::sync::mpsc::channel::>(); - let (batch_sender, batch_receiver) = - std::sync::mpsc::channel::>(); - - let schema_ref = schema.clone(); - rt().spawn_background(None, async move { - let mut stream = - match ::lance::utils::tfrecord::read_tfrecord(&uri, schema_ref, Some(batch_size)).await - { - Ok(stream) => { - init_sender.send(Ok(())).unwrap(); - stream - } - Err(err) => { - init_sender.send(Err(err)).unwrap(); - return; - } - }; - - while let Some(batch) = stream.next().await { - let batch = batch.map_err(|err| ArrowError::ExternalError(Box::new(err))); - batch_sender.send(batch).unwrap(); - } - }); - - // Verify initialization happened successfully - init_receiver.recv().unwrap().map_err(|err| { - PyIOError::new_err(format!("Failed to initialize tfrecord reader: {}", err)) - })?; - - let batch_reader = RecordBatchIterator::new(batch_receiver, schema); - - // TODO: this should be handled by upstream - let stream = FFI_ArrowArrayStream::new(Box::new(batch_reader)); - let stream_reader = ArrowArrayStreamReader::try_new(stream).map_err(|err| { - PyValueError::new_err(format!("Failed to export record batch reader: {}", err)) - })?; - - Ok(PyArrowType(stream_reader)) -} - #[pyfunction] #[pyo3(signature = (dataset,))] fn manifest_needs_migration(dataset: &Bound<'_, PyAny>) -> PyResult { diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml index 045d54f289c..2a254a47648 100644 --- a/rust/lance/Cargo.toml +++ b/rust/lance/Cargo.toml @@ -74,8 +74,6 @@ serde = { workspace = true } moka.workspace = true permutation = { version = "0.4.0" } tantivy.workspace = true -tfrecord = { version = "0.15.0", optional = true, features = ["async"] } -prost_old = { version = "0.12.6", package = "prost", optional = true } aws-sdk-dynamodb = { workspace = true, optional = true } tracing.workspace = true humantime = { workspace = true } @@ -116,7 +114,6 @@ default = ["aws", "azure", "gcp", "oss"] fp16kernels = ["lance-linalg/fp16kernels"] # Prevent dynamic linking of lzma, which comes from datafusion cli = ["dep:clap", "lzma-sys/static"] -tensorflow = ["dep:tfrecord", "dep:prost_old"] dynamodb = ["lance-table/dynamodb", "dep:aws-sdk-dynamodb"] dynamodb_tests = ["dynamodb"] substrait = ["lance-datafusion/substrait"] diff --git a/rust/lance/src/utils.rs b/rust/lance/src/utils.rs index 4ae847cfb9c..b2997d58e63 100644 --- a/rust/lance/src/utils.rs +++ b/rust/lance/src/utils.rs @@ -7,5 +7,3 @@ pub(crate) mod future; pub(crate) mod temporal; #[cfg(test)] pub(crate) mod test; -#[cfg(feature = "tensorflow")] -pub mod tfrecord; diff --git a/rust/lance/src/utils/tfrecord.rs b/rust/lance/src/utils/tfrecord.rs deleted file mode 100644 index a4b09e3866c..00000000000 --- a/rust/lance/src/utils/tfrecord.rs +++ /dev/null @@ -1,792 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Reading TFRecord files into Arrow data -//! -//! Use [infer_tfrecord_schema] to infer the schema of a TFRecord file, then use -//! [read_tfrecord] to read the file into an Arrow record batch stream. - -use arrow::buffer::OffsetBuffer; -use arrow_array::builder::PrimitiveBuilder; -use arrow_array::{ArrayRef, FixedSizeListArray, ListArray}; -use arrow_buffer::ArrowNativeType; -use arrow_buffer::ScalarBuffer; -use datafusion::error::DataFusionError; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use datafusion::physical_plan::SendableRecordBatchStream; -use futures::{StreamExt, TryStreamExt}; -use half::{bf16, f16}; -use lance_arrow::bfloat16::BFLOAT16_EXT_NAME; -use lance_arrow::{ARROW_EXT_META_KEY, ARROW_EXT_NAME_KEY}; -use prost_old::Message; -use std::collections::HashMap; -use std::sync::Arc; - -use crate::io::ObjectStore; -use crate::{Error, Result}; -use arrow::record_batch::RecordBatch; -use arrow_schema::{ - DataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, -}; -use snafu::{location, Location}; -use tfrecord::protobuf::feature::Kind; -use tfrecord::protobuf::{DataType as TensorDataType, TensorProto}; -use tfrecord::record_reader::RecordStream; -use tfrecord::{Example, Feature}; - -trait OldProstResultExt { - fn map_prost_err(self, location: Location) -> Result; -} - -impl OldProstResultExt for std::result::Result { - fn map_prost_err(self, location: Location) -> Result { - self.map_err(|err| Error::IO { - source: Box::new(err), - location, - }) - } -} - -/// Infer the Arrow schema from a TFRecord file. -/// -/// The featured named by `tensor_features` will be assumed to be binary fields -/// containing serialized tensors (TensorProto messages). Currently only -/// fixed-shape tensors are supported. -/// -/// The features named by `string_features` will be assumed to be UTF-8 encoded -/// strings. -/// -/// `num_rows` determines the number of rows to read from the file to infer the -/// schema. If `None`, the entire file will be read. -pub async fn infer_tfrecord_schema( - uri: &str, - tensor_features: &[&str], - string_features: &[&str], - num_rows: Option, -) -> Result { - let mut columns: HashMap = HashMap::new(); - - let (store, path) = ObjectStore::from_uri(uri).await?; - // TODO: should we avoid reading the entire file into memory? - let data = store - .inner - .get(&path) - .await? - .into_stream() - .map_err(std::io::Error::other) - .into_async_read(); - let mut records = RecordStream::::from_reader(data, Default::default()); - let mut i = 0; - while let Some(record) = records.next().await { - let record = record.map_err(|err| Error::io(err.to_string(), location!()))?; - - if let Some(features) = record.features { - for (name, feature) in features.feature { - if let Some(entry) = columns.get_mut(&name) { - entry.try_update(&feature)?; - } else { - columns.insert( - name.clone(), - FeatureMeta::try_new( - &feature, - tensor_features.contains(&name.as_str()), - string_features.contains(&name.as_str()), - )?, - ); - } - } - } - - i += 1; - if let Some(num_rows) = num_rows { - if i >= num_rows { - break; - } - } - } - - let mut fields = columns - .iter() - .map(|(name, meta)| make_field(name, meta)) - .collect::>>()?; - - // To guarantee some sort of deterministic order, we sort the fields by name - fields.sort_by(|a, b| a.name().cmp(b.name())); - Ok(ArrowSchema::new(fields)) -} - -/// Read a TFRecord file into an Arrow record batch stream. -/// -/// Reads `batch_size` rows at a time. If `batch_size` is `None`, a default -/// batch size of 10,000 is used. -/// -/// The schema may be a partial schema, in which case only the fields present in -/// the schema will be read. -pub async fn read_tfrecord( - uri: &str, - schema: ArrowSchemaRef, - batch_size: Option, -) -> Result { - let batch_size = batch_size.unwrap_or(10_000); - - let (store, path) = ObjectStore::from_uri(uri).await?; - let data = store - .inner - .get(&path) - .await? - .into_stream() - .map_err(std::io::Error::other) - .into_async_read(); - let schema_ref = schema.clone(); - let batch_stream = RecordStream::::from_reader(data, Default::default()) - .try_chunks(batch_size) - .map(move |chunk| { - let chunk = chunk.map_err(|err| DataFusionError::External(Box::new(err.1)))?; - let batch = convert_batch(chunk, &schema_ref)?; - Ok(batch) - }); - - Ok(Box::pin(RecordBatchStreamAdapter::new( - schema, - batch_stream, - ))) -} - -/// Check if a feature has more than 1 value. -fn feature_is_repeated(feature: &tfrecord::Feature) -> bool { - match feature.kind.as_ref().unwrap() { - Kind::BytesList(bytes_list) => bytes_list.value.len() > 1, - Kind::FloatList(float_list) => float_list.value.len() > 1, - Kind::Int64List(int64_list) => int64_list.value.len() > 1, - } -} - -/// Simplified representation of a features data type. -#[derive(Clone, PartialEq, Debug)] -enum FeatureType { - Integer, - Float, - Binary, - String, - Tensor { - shape: Vec, - dtype: TensorDataType, - }, -} - -/// General type information about a single feature. -struct FeatureMeta { - /// Whether the feature contains multiple values per example. Ones that do - /// will be converted to Arrow lists. Otherwise they will be primitive arrays. - repeated: bool, - feature_type: FeatureType, -} - -impl FeatureMeta { - /// Create a new FeatureMeta from a single example. - pub fn try_new(feature: &Feature, is_tensor: bool, is_string: bool) -> Result { - let feature_type = match feature.kind.as_ref().unwrap() { - Kind::BytesList(data) => { - if is_tensor { - Self::extract_tensor(data.value[0].as_slice())? - } else if is_string { - FeatureType::String - } else { - FeatureType::Binary - } - } - Kind::FloatList(_) => FeatureType::Float, - Kind::Int64List(_) => FeatureType::Integer, - }; - Ok(Self { - repeated: feature_is_repeated(feature), - feature_type, - }) - } - - /// Update the FeatureMeta with a new example, or return an error if the - /// example is inconsistent with the existing FeatureMeta. - pub fn try_update(&mut self, feature: &Feature) -> Result<()> { - let feature_type = match feature.kind.as_ref().unwrap() { - Kind::BytesList(data) => match self.feature_type { - FeatureType::String => FeatureType::String, - FeatureType::Binary => FeatureType::Binary, - FeatureType::Tensor { .. } => Self::extract_tensor(data.value[0].as_slice())?, - _ => { - return Err(Error::io( - format!( - "Data type mismatch: expected {:?}, got {:?}", - self.feature_type, - feature.kind.as_ref().unwrap() - ), - location!(), - )) - } - }, - Kind::FloatList(_) => FeatureType::Float, - Kind::Int64List(_) => FeatureType::Integer, - }; - if self.feature_type != feature_type { - return Err(Error::io( - format!("inconsistent feature type for field {:?}", feature_type), - location!(), - )); - } - if feature_is_repeated(feature) { - self.repeated = true; - } - Ok(()) - } - - fn extract_tensor(data: &[u8]) -> Result { - let tensor_proto = TensorProto::decode(data).map_prost_err(location!())?; - Ok(FeatureType::Tensor { - shape: tensor_proto - .tensor_shape - .as_ref() - .unwrap() - .dim - .iter() - .map(|d| d.size) - .collect(), - dtype: tensor_proto.dtype(), - }) - } -} - -/// Metadata for a fixed-shape tensor. -#[derive(serde::Serialize)] -struct ArrowTensorMetadata { - shape: Vec, -} - -fn tensor_dtype_to_arrow(tensor_dtype: &TensorDataType) -> Result { - Ok(match tensor_dtype { - TensorDataType::DtBfloat16 => DataType::FixedSizeBinary(2), - TensorDataType::DtHalf => DataType::Float16, - TensorDataType::DtFloat => DataType::Float32, - TensorDataType::DtDouble => DataType::Float64, - _ => { - return Err(Error::io( - format!("unsupported tensor data type {:?}", tensor_dtype), - location!(), - )); - } - }) -} - -fn make_field(name: &str, feature_meta: &FeatureMeta) -> Result { - let data_type = match &feature_meta.feature_type { - FeatureType::Integer => DataType::Int64, - FeatureType::Float => DataType::Float32, - FeatureType::Binary => DataType::Binary, - FeatureType::String => DataType::Utf8, - FeatureType::Tensor { shape, dtype } => { - let list_size = shape.iter().map(|x| *x as i32).product(); - let inner_type = tensor_dtype_to_arrow(dtype)?; - - let inner_meta = match dtype { - TensorDataType::DtBfloat16 => Some( - [(ARROW_EXT_NAME_KEY, BFLOAT16_EXT_NAME)] - .into_iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect::>(), - ), - _ => None, - }; - let mut inner_field = ArrowField::new("item", inner_type, true); - if let Some(metadata) = inner_meta { - inner_field.set_metadata(metadata); - } - - DataType::FixedSizeList(Arc::new(inner_field), list_size) - } - }; - - // This metadata marks the field as a tensor column, which PyArrow can - // recognize. - let metadata = match &feature_meta.feature_type { - FeatureType::Tensor { shape, dtype: _ } => { - let mut metadata = HashMap::new(); - let tensor_metadata = ArrowTensorMetadata { - shape: shape.clone(), - }; - metadata.insert( - ARROW_EXT_NAME_KEY.to_string(), - "arrow.fixed_shape_tensor".to_string(), - ); - metadata.insert( - ARROW_EXT_META_KEY.to_string(), - serde_json::to_string(&tensor_metadata)?, - ); - Some(metadata) - } - _ => None, - }; - - let mut field = if feature_meta.repeated { - ArrowField::new("item", data_type, true) - } else { - ArrowField::new(name, data_type, true) - }; - if let Some(metadata) = metadata { - field.set_metadata(metadata); - } - - let field = if feature_meta.repeated { - ArrowField::new(name, DataType::List(Arc::new(field)), true) - } else { - field - }; - - Ok(field) -} - -/// Convert a vector of TFRecord examples into an Arrow record batch. -fn convert_batch(records: Vec, schema: &ArrowSchema) -> Result { - // TODO: do this in parallel? - let columns = schema - .fields - .iter() - .map(|field| convert_column(&records, field)) - .collect::>>()?; - - let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?; - Ok(batch) -} - -/// Convert a single column of TFRecord examples into an Arrow array. -fn convert_column(records: &[Example], field: &ArrowField) -> Result { - let type_info = parse_type(field.data_type()); - // Make leaf type - let (mut column, offsets) = convert_leaf(records, field.name(), &type_info)?; - - if let Some(fsl_size) = &type_info.fsl_size { - let mut field = ArrowField::new("item", type_info.leaf_type.clone(), true); - if matches!(&type_info.leaf_type, DataType::FixedSizeBinary(2)) { - field.set_metadata( - [ - ( - ARROW_EXT_NAME_KEY.to_string(), - BFLOAT16_EXT_NAME.to_string(), - ), - (ARROW_EXT_META_KEY.to_string(), "".to_string()), - ] - .into_iter() - .collect(), - ); - } - // Wrap in a FSL - column = Arc::new(FixedSizeListArray::try_new( - Arc::new(field), - *fsl_size, - column, - None, - )?); - } - - if type_info.in_list { - column = Arc::new(ListArray::try_new( - Arc::new(ArrowField::new("item", column.data_type().clone(), true)), - offsets.unwrap(), - column, - None, - )?); - } - - Ok(column) -} - -/// Representation of a field in the TFRecord file. It can be a leaf type, a -/// tensor, or a list of either. -struct TypeInfo { - leaf_type: DataType, - fsl_size: Option, - in_list: bool, -} - -fn parse_type(data_type: &DataType) -> TypeInfo { - match data_type { - DataType::FixedSizeList(inner_field, list_size) => { - let inner_type = parse_type(inner_field.data_type()); - TypeInfo { - leaf_type: inner_type.leaf_type, - fsl_size: Some(*list_size), - in_list: false, - } - } - DataType::List(inner_field) => { - let inner_type = parse_type(inner_field.data_type()); - TypeInfo { - leaf_type: inner_type.leaf_type, - fsl_size: inner_type.fsl_size, - in_list: true, - } - } - _ => TypeInfo { - leaf_type: data_type.clone(), - fsl_size: None, - in_list: false, - }, - } -} - -fn convert_leaf( - records: &[Example], - name: &str, - type_info: &TypeInfo, -) -> Result<(ArrayRef, Option>)> { - use arrow::array::*; - let features: Vec> = records - .iter() - .map(|record| { - let features = record.features.as_ref().unwrap(); - features.feature.get(name) - }) - .collect(); - let (values, offsets): (ArrayRef, Option>) = match type_info { - // First, the Non-tensor leaf types - TypeInfo { - leaf_type: DataType::Int64, - fsl_size: None, - in_list, - } => { - let mut values = Int64Builder::with_capacity(features.len()); - for feature in features.iter() { - if let Some(Feature { - kind: Some(Kind::Int64List(list)), - }) = feature - { - values.append_slice(&list.value); - } else if !type_info.in_list { - values.append_null(); - } - } - let offsets = if *in_list { - Some(compute_offsets(&features, type_info)) - } else { - None - }; - (Arc::new(values.finish()), offsets) - } - TypeInfo { - leaf_type: DataType::Float32, - fsl_size: None, - in_list, - } => { - let mut values = Float32Builder::with_capacity(features.len()); - for feature in features.iter() { - if let Some(Feature { - kind: Some(Kind::FloatList(list)), - }) = feature - { - values.append_slice(&list.value); - } else if !type_info.in_list { - values.append_null(); - } - } - let offsets = if *in_list { - Some(compute_offsets(&features, type_info)) - } else { - None - }; - (Arc::new(values.finish()), offsets) - } - TypeInfo { - leaf_type: DataType::Binary, - fsl_size: None, - in_list, - } => { - let mut values = BinaryBuilder::with_capacity(features.len(), 1024); - for feature in features.iter() { - if let Some(Feature { - kind: Some(Kind::BytesList(list)), - }) = feature - { - for value in &list.value { - values.append_value(value); - } - } else if !type_info.in_list { - values.append_null(); - } - } - let offsets = if *in_list { - Some(compute_offsets(&features, type_info)) - } else { - None - }; - (Arc::new(values.finish()), offsets) - } - TypeInfo { - leaf_type: DataType::Utf8, - fsl_size: None, - in_list, - } => { - let mut values = StringBuilder::with_capacity(features.len(), 1024); - for feature in features.iter() { - if let Some(Feature { - kind: Some(Kind::BytesList(list)), - }) = feature - { - for value in &list.value { - values.append_value(String::from_utf8_lossy(value)); - } - } else if !type_info.in_list { - values.append_null(); - } - } - let offsets = if *in_list { - Some(compute_offsets(&features, type_info)) - } else { - None - }; - (Arc::new(values.finish()), offsets) - } - // Now, handle tensors - TypeInfo { - fsl_size: Some(_), .. - } => convert_fixedshape_tensor(&features, type_info)?, - _ => Err(Error::io( - format!("unsupported type {:?}", type_info.leaf_type), - location!(), - ))?, - }; - - Ok((values, offsets)) -} - -fn compute_offsets(features: &[Option<&Feature>], type_info: &TypeInfo) -> OffsetBuffer { - let mut offsets: Vec = Vec::with_capacity(features.len() + 1); - offsets.push(0); - - let mut current = 0; - for feature in features.iter() { - if let Some(feature) = feature { - match ( - type_info.fsl_size.is_some(), - &type_info.leaf_type, - feature.kind.as_ref().unwrap(), - ) { - (true, _, Kind::BytesList(list)) => { - current += list.value.len() as i32; - } - (false, DataType::Binary, Kind::BytesList(list)) => { - current += list.value.len() as i32; - } - (false, DataType::Utf8, Kind::BytesList(list)) => { - current += list.value.len() as i32; - } - (false, DataType::Float32, Kind::FloatList(list)) => { - current += list.value.len() as i32; - } - (false, DataType::Int64, Kind::Int64List(list)) => { - current += list.value.len() as i32; - } - _ => {} // Ignore mismatched types - } - } - offsets.push(current); - } - - OffsetBuffer::new(ScalarBuffer::from(offsets)) -} - -// /// Convert TensorProto message into an element of a FixedShapeTensor array and -// /// append it to the builder. -// /// -// /// TensorProto definition: -// /// https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/tensor.proto -// /// -// /// FixedShapeTensor definition: -// /// https://arrow.apache.org/docs/format/CanonicalExtensions.html#fixed-shape-tensor -fn convert_fixedshape_tensor( - features: &[Option<&Feature>], - type_info: &TypeInfo, -) -> Result<(ArrayRef, Option>)> { - use arrow::array::*; - let tensor_iter = features.iter().map(|maybe_feature| { - if let Some(feature) = maybe_feature { - if let Some(Kind::BytesList(list)) = &feature.kind { - list.value - .iter() - .map(|val| TensorProto::decode(val.as_slice())) - .collect::, _>>() - .map(Some) - } else { - Ok(None) - } - } else { - Ok(None) - } - }); - - let offsets = if type_info.in_list { - Some(compute_offsets(features, type_info)) - } else { - None - }; - - let list_size = type_info.fsl_size.unwrap() as usize; - - let values: ArrayRef = match type_info.leaf_type { - DataType::Float16 => { - let mut values = Float16Builder::with_capacity(features.len()); - for tensors in tensor_iter { - if let Some(tensors) = tensors.map_prost_err(location!())? { - for tensor in tensors { - validate_tensor(&tensor, type_info)?; - if tensor.half_val.is_empty() { - append_primitive_from_slice( - &mut values, - tensor.tensor_content.as_slice(), - |bytes| f16::from_le_bytes(bytes.try_into().unwrap()), - ) - } else { - // The individual values have padding (they are stored as i32) - // because protobuf has no 2-byte type - for value in tensor.half_val { - values.append_value(f16::from_bits(value as u16)); - } - } - } - } else { - values.append_nulls(list_size); - } - } - Arc::new(values.finish()) - } - // BFloat16 - DataType::FixedSizeBinary(2) => { - let mut values = FixedSizeBinaryBuilder::with_capacity(features.len(), 2); - - for tensors in tensor_iter { - if let Some(tensors) = tensors.map_prost_err(location!())? { - for tensor in tensors { - validate_tensor(&tensor, type_info)?; - if tensor.half_val.is_empty() { - // Just directly move the bytes - for bytes in tensor.tensor_content.as_slice().chunks_exact(2) { - values.append_value(bytes)?; - } - } else { - // The individual values have padding (they are stored as i32) - // because protobuf has no 2-byte type - for value in tensor.half_val { - let bf16_value = bf16::from_bits(value as u16); - values.append_value(bf16_value.to_le_bytes())?; - } - } - } - } else { - for _ in 0..list_size { - values.append_null(); - } - } - } - Arc::new(values.finish()) - } - DataType::Float32 => { - let mut values = Float32Builder::with_capacity(features.len()); - for tensors in tensor_iter { - if let Some(tensors) = tensors.map_prost_err(location!())? { - for tensor in tensors { - validate_tensor(&tensor, type_info)?; - if tensor.float_val.is_empty() { - append_primitive_from_slice( - &mut values, - tensor.tensor_content.as_slice(), - |bytes| f32::from_le_bytes(bytes.try_into().unwrap()), - ) - } else { - values.append_slice(tensor.float_val.as_slice()); - } - } - } else { - values.append_nulls(list_size); - } - } - Arc::new(values.finish()) - } - DataType::Float64 => { - let mut values = Float64Builder::with_capacity(features.len()); - for tensors in tensor_iter { - if let Some(tensors) = tensors.map_prost_err(location!())? { - for tensor in tensors { - validate_tensor(&tensor, type_info)?; - if tensor.float_val.is_empty() { - append_primitive_from_slice( - &mut values, - tensor.tensor_content.as_slice(), - |bytes| f64::from_le_bytes(bytes.try_into().unwrap()), - ) - } else { - values.append_slice(tensor.double_val.as_slice()) - }; - } - } else { - values.append_nulls(list_size); - } - } - Arc::new(values.finish()) - } - _ => Err(Error::io( - format!("unsupported type {:?}", type_info.leaf_type), - location!(), - ))?, - }; - - Ok((values, offsets)) -} - -fn validate_tensor(tensor: &TensorProto, type_info: &TypeInfo) -> Result<()> { - let tensor_shape = tensor.tensor_shape.as_ref().unwrap(); - let length = tensor_shape.dim.iter().map(|d| d.size as i32).product(); - if type_info.fsl_size != Some(length) { - return Err(Error::io( - format!( - "tensor length mismatch: expected {}, got {}", - type_info.fsl_size.unwrap(), - length - ), - location!(), - )); - } - - let data_type = tensor_dtype_to_arrow(&tensor.dtype())?; - if data_type != type_info.leaf_type { - return Err(Error::io( - format!( - "tensor type mismatch: expected {:?}, got {:?}", - type_info.leaf_type, - tensor.dtype() - ), - location!(), - )); - } - - Ok(()) -} - -/// Given a potentially unaligned slice, append the slice to the builder. -fn append_primitive_from_slice( - builder: &mut PrimitiveBuilder, - slice: &[u8], - parse_val: impl Fn(&[u8]) -> T::Native, -) where - T: arrow::datatypes::ArrowPrimitiveType, -{ - // Safety: we are trusting that the data in the buffer are valid for the - // datatype T::Native, as claimed by the file. There isn't anywhere for - // TensorProto to tell us the original endianness, so it's possible there - // could be a mismatch here. - let (prefix, middle, suffix) = unsafe { slice.align_to::() }; - for val in prefix.chunks_exact(T::Native::get_byte_width()) { - builder.append_value(parse_val(val)); - } - - builder.append_slice(middle); - - for val in suffix.chunks_exact(T::Native::get_byte_width()) { - builder.append_value(parse_val(val)); - } -}