diff --git a/Cargo.lock b/Cargo.lock index db08ef5..0a34bfe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -378,19 +378,14 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1" dependencies = [ - "bzip2 0.5.2", - "flate2", - "futures-core", - "memchr", + "compression-codecs", + "compression-core", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -545,15 +540,6 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - [[package]] name = "bzip2" version = "0.6.1" @@ -563,16 +549,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cc" version = "1.2.39" @@ -622,6 +598,27 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "compression-codecs" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + [[package]] name = "const-random" version = "0.1.18" @@ -764,15 +761,15 @@ dependencies = [ [[package]] name = "datafusion" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ba7cb113e9c0bedf9e9765926031e132fa05a1b09ba6e93a6d1a4d7044457b8" +checksum = "503f1f4a9060ae6e650d3dff5dc7a21266fea1302d890768d45b4b28586e830f" dependencies = [ "arrow", "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -802,27 +799,26 @@ dependencies = [ "flate2", "futures", "itertools", + "liblzma", "log", "object_store", "parking_lot", "parquet", "rand", "regex", - "rstest", "sqlparser", "tempfile", "tokio", "url", "uuid", - "xz2", "zstd", ] [[package]] name = "datafusion-catalog" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a3a799f914a59b1ea343906a0486f17061f39509af74e874a866428951130d" +checksum = "14417a3ee4ae3d092b56cd6c1d32e8ff3e2c9ec130ecb2276ec91c89fd599399" dependencies = [ "arrow", "async-trait", @@ -845,9 +841,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db1b113c80d7a0febcd901476a57aef378e717c54517a163ed51417d87621b0" +checksum = "9d0eba824adb45a4b3ac6f0251d40df3f6a9382371cad136f4f14ac9ebc6bc10" dependencies = [ "arrow", "async-trait", @@ -864,21 +860,20 @@ dependencies = [ "itertools", "log", "object_store", - "tokio", ] [[package]] name = "datafusion-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c10f7659e96127d25e8366be7c8be4109595d6a2c3eac70421f380a7006a1b0" +checksum = "0039deefbd00c56adf5168b7ca58568fb058e4ba4c5a03b09f8be371b4e434b6" dependencies = [ "ahash", "arrow", "arrow-ipc", "chrono", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", "libc", "log", @@ -893,9 +888,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b92065bbc6532c6651e2f7dd30b55cba0c7a14f860c7e1d15f165c41a1868d95" +checksum = "1ec7e3e60b813048331f8fb9673583173e5d2dd8fef862834ee871fc98b57ca7" dependencies = [ "futures", "log", @@ -904,15 +899,15 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde13794244bc7581cd82f6fff217068ed79cdc344cafe4ab2c3a1c3510b38d6" +checksum = "802068957f620302ecf05f84ff4019601aeafd36f5f3f1334984af2e34265129" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "datafusion-common", "datafusion-common-runtime", @@ -927,21 +922,21 @@ dependencies = [ "futures", "glob", "itertools", + "liblzma", "log", "object_store", "rand", "tokio", "tokio-util", "url", - "xz2", "zstd", ] [[package]] name = "datafusion-datasource-arrow" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804fa9b4ecf3157982021770617200ef7c1b2979d57bec9044748314775a9aea" +checksum = "90fc387d5067c62d494a6647d29c5ad4fcdd5a6e50ab4ea1d2568caa2d66f2cc" dependencies = [ "arrow", "arrow-ipc", @@ -963,9 +958,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a1641a40b259bab38131c5e6f48fac0717bedb7dc93690e604142a849e0568" +checksum = "efd5e20579bb6c8bd4e6c620253972fb723822030c280dd6aa047f660d09eeba" dependencies = [ "arrow", "async-trait", @@ -986,9 +981,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adeacdb00c1d37271176f8fb6a1d8ce096baba16ea7a4b2671840c5c9c64fe85" +checksum = "c0788b0d48fcef31880a02013ea3cc18e5a4e0eacc3b0abdd2cd0597b99dc96e" dependencies = [ "arrow", "async-trait", @@ -1008,9 +1003,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d0b60ffd66f28bfb026565d62b0a6cbc416da09814766a3797bba7d85a3cd9" +checksum = "66639b70f1f363f5f0950733170100e588f1acfacac90c1894e231194aa35957" dependencies = [ "arrow", "async-trait", @@ -1038,18 +1033,19 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b99e13947667b36ad713549237362afb054b2d8f8cc447751e23ec61202db07" +checksum = "e44b41f3e8267c6cf3eec982d63f34db9f1dd5f30abfd2e1f124f0871708952e" [[package]] name = "datafusion-execution" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63695643190679037bc946ad46a263b62016931547bf119859c511f7ff2f5178" +checksum = "9e456f60e5d38db45335e84617006d90af14a8c8c5b8e959add708b2daaa0e2c" dependencies = [ "arrow", "async-trait", + "chrono", "dashmap", "datafusion-common", "datafusion-expr", @@ -1064,9 +1060,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a4787cbf5feb1ab351f789063398f67654a6df75c4d37d7f637dc96f951a91" +checksum = "6507c719804265a58043134580c1c20767e7c23ba450724393f03ec982769ad9" dependencies = [ "arrow", "async-trait", @@ -1087,9 +1083,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce2fb1b8c15c9ac45b0863c30b268c69dc9ee7a1ee13ecf5d067738338173dc" +checksum = "a413caa9c5885072b539337aed68488f0291653e8edd7d676c92df2480f6cab0" dependencies = [ "arrow", "datafusion-common", @@ -1100,20 +1096,27 @@ dependencies = [ [[package]] name = "datafusion-ffi" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec510e7787641279b0336e8b79e4b7bd1385d5976875ff9b97f4269ce5231a67" +checksum = "3ca486e22de2bb1512dda751fb490c1cabafa9aec67b456cd4038e812be527f7" dependencies = [ "abi_stable", "arrow", "arrow-schema", "async-ffi", "async-trait", - "datafusion", + "datafusion-catalog", "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", "datafusion-proto", "datafusion-proto-common", + "datafusion-session", "futures", "log", "prost", @@ -1123,9 +1126,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "794a9db7f7b96b3346fc007ff25e994f09b8f0511b4cf7dff651fadfe3ebb28f" +checksum = "189256495dc9cbbb8e20dbcf161f60422e628d201a78df8207e44bd4baefadb6" dependencies = [ "arrow", "arrow-buffer", @@ -1133,6 +1136,7 @@ dependencies = [ "blake2", "blake3", "chrono", + "chrono-tz", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1153,9 +1157,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c25210520a9dcf9c2b2cbbce31ebd4131ef5af7fc60ee92b266dc7d159cb305" +checksum = "12e73dfee4cd67c4a507ffff4c5a711d39983adf544adbc09c09bf06f789f413" dependencies = [ "ahash", "arrow", @@ -1174,9 +1178,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f4a66f3b87300bb70f4124b55434d2ae3fe80455f3574701d0348da040b55d" +checksum = "87727bd9e65f4f9ac6d608c9810b7da9eaa3b18b26a4a4b76520592d49020acf" dependencies = [ "ahash", "arrow", @@ -1187,9 +1191,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae5c06eed03918dc7fe7a9f082a284050f0e9ecf95d72f57712d1496da03b8c4" +checksum = "2e5ef761359224b7c2b5a1bfad6296ac63225f8583d08ad18af9ba1a89ac3887" dependencies = [ "arrow", "arrow-ord", @@ -1210,9 +1214,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4fed1d71738fbe22e2712d71396db04c25de4111f1ec252b8f4c6d3b25d7f5" +checksum = "3b17dac25dfda2d2a90ff0ad1c054a11fb1523766226bec6e9bd8c410daee2ae" dependencies = [ "arrow", "async-trait", @@ -1226,9 +1230,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d92206aa5ae21892f1552b4d61758a862a70956e6fd7a95cb85db1de74bc6d1" +checksum = "c594a29ddb22cbdbce500e4d99b5b2392c5cecb4c1086298b41d1ffec14dbb77" dependencies = [ "arrow", "datafusion-common", @@ -1244,9 +1248,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53ae9bcc39800820d53a22d758b3b8726ff84a5a3e24cecef04ef4e5fdf1c7cc" +checksum = "9aa1b15ed81c7543f62264a30dd49dec4b1b0b698053b968f53be32dfba4f729" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1254,9 +1258,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1063ad4c9e094b3f798acee16d9a47bd7372d9699be2de21b05c3bd3f34ab848" +checksum = "c00c31c4795597aa25b74cab5174ac07a53051f27ce1e011ecaffa9eaeecef81" dependencies = [ "datafusion-doc", "quote", @@ -1265,9 +1269,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35f9ec5d08b87fd1893a30c2929f2559c2f9806ca072d8fefca5009dc0f06a" +checksum = "80ccf60767c09302b2e0fc3afebb3761a6d508d07316fab8c5e93312728a21bb" dependencies = [ "arrow", "chrono", @@ -1285,9 +1289,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c30cc8012e9eedcb48bbe112c6eff4ae5ed19cf3003cb0f505662e88b7014c5d" +checksum = "c64b7f277556944e4edd3558da01d9e9ff9f5416f1c0aa7fee088e57bd141a7e" dependencies = [ "ahash", "arrow", @@ -1297,19 +1301,21 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", "itertools", "parking_lot", "paste", "petgraph", + "recursive", + "tokio", ] [[package]] name = "datafusion-physical-expr-adapter" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9ff2dbd476221b1f67337699eff432781c4e6e1713d2aefdaa517dfbf79768" +checksum = "b7abaee372ea2d19c016ee9ef8629c4415257d291cdd152bc7f0b75f28af1b63" dependencies = [ "arrow", "datafusion-common", @@ -1322,23 +1328,26 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90da43e1ec550b172f34c87ec68161986ced70fd05c8d2a2add66eef9c276f03" +checksum = "42237efe621f92adc22d111b531fdbc2cc38ca9b5e02327535628fb103ae2157" dependencies = [ "ahash", "arrow", + "chrono", "datafusion-common", "datafusion-expr-common", - "hashbrown 0.14.5", + "hashbrown 0.16.1", + "indexmap", "itertools", + "parking_lot", ] [[package]] name = "datafusion-physical-optimizer" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce9804f799acd7daef3be7aaffe77c0033768ed8fdbf5fb82fc4c5f2e6bc14e6" +checksum = "fd093498bd1319c6e5c76e9dfa905e78486f01b34579ce97f2e3a49f84c37fac" dependencies = [ "arrow", "datafusion-common", @@ -1355,27 +1364,27 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0acf0ad6b6924c6b1aa7d213b181e012e2d3ec0a64ff5b10ee6282ab0f8532ac" +checksum = "7cbe61b12daf81a9f20ba03bd3541165d51f86e004ef37426b11881330eed261" dependencies = [ "ahash", "arrow", "arrow-ord", "arrow-schema", "async-trait", - "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.16.1", "indexmap", "itertools", "log", @@ -1386,9 +1395,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d368093a98a17d1449b1083ac22ed16b7128e4c67789991869480d8c4a40ecb9" +checksum = "33c055594ab7e3f5430aea1024055bc9bd29ba6479a9cae6fe29823a2f527470" dependencies = [ "arrow", "chrono", @@ -1413,9 +1422,9 @@ dependencies = [ [[package]] name = "datafusion-proto-common" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b6aef3d5e5c1d2bc3114c4876730cb76a9bdc5a8df31ef1b6db48f0c1671895" +checksum = "84b2523bb8e7269b943c9060a3ae91c5a61e6b1d800c014a9433547a9ce23e55" dependencies = [ "arrow", "datafusion-common", @@ -1424,9 +1433,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac2c2498a1f134a9e11a9f5ed202a2a7d7e9774bd9249295593053ea3be999db" +checksum = "0124331116db7f79df92ebfd2c3b11a8f90240f253555c9bb084f10b6fecf1dd" dependencies = [ "arrow", "datafusion-common", @@ -1441,9 +1450,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f96eebd17555386f459037c65ab73aae8df09f464524c709d6a3134ad4f4776" +checksum = "1673e3c58ba618a6ea0568672f00664087b8982c581e9afd5aa6c3c79c9b431f" dependencies = [ "async-trait", "datafusion-common", @@ -1455,9 +1464,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "51.0.0" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fc195fe60634b2c6ccfd131b487de46dc30eccae8a3c35a13f136e7f440414f" +checksum = "5272d256dab5347bb39d2040589f45d8c6b715b27edcb5fffe88cc8b9c3909cb" dependencies = [ "arrow", "bigdecimal", @@ -1566,6 +1575,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -1646,12 +1661,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" -[[package]] -name = "futures-timer" -version = "3.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" - [[package]] name = "futures-util" version = "0.3.31" @@ -1735,10 +1744,6 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" -dependencies = [ - "ahash", - "allocator-api2", -] [[package]] name = "hashbrown" @@ -1746,7 +1751,7 @@ version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "foldhash", + "foldhash 0.1.5", ] [[package]] @@ -1754,6 +1759,11 @@ name = "hashbrown" version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] [[package]] name = "heck" @@ -2051,6 +2061,26 @@ dependencies = [ "winapi", ] +[[package]] +name = "liblzma" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f2db66f3268487b5033077f266da6777d057949b8f93c8ad82e441df25e6186" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "libm" version = "0.2.15" @@ -2094,17 +2124,6 @@ dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "md-5" version = "0.10.6" @@ -2361,15 +2380,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "proc-macro-crate" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" -dependencies = [ - "toml_edit", -] - [[package]] name = "proc-macro2" version = "1.0.101" @@ -2574,12 +2584,6 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" -[[package]] -name = "relative-path" -version = "1.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" - [[package]] name = "repr_offset" version = "0.2.2" @@ -2589,35 +2593,6 @@ dependencies = [ "tstr", ] -[[package]] -name = "rstest" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49" -dependencies = [ - "futures-timer", - "futures-util", - "rstest_macros", -] - -[[package]] -name = "rstest_macros" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" -dependencies = [ - "cfg-if", - "glob", - "proc-macro-crate", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.114", - "unicode-ident", -] - [[package]] name = "rustc_version" version = "0.4.1" @@ -2959,36 +2934,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "toml_datetime" -version = "0.7.5+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" -dependencies = [ - "serde_core", -] - -[[package]] -name = "toml_edit" -version = "0.23.10+spec-1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" -dependencies = [ - "indexmap", - "toml_datetime", - "toml_parser", - "winnow", -] - -[[package]] -name = "toml_parser" -version = "1.0.6+spec-1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" -dependencies = [ - "winnow", -] - [[package]] name = "tracing" version = "0.1.41" @@ -3416,15 +3361,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winnow" -version = "0.7.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" -dependencies = [ - "memchr", -] - [[package]] name = "wit-bindgen" version = "0.46.0" @@ -3452,15 +3388,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yoke" version = "0.8.0" diff --git a/Cargo.toml b/Cargo.toml index 33bf581..a8de124 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,8 +21,8 @@ exclude = [ arrow = { version = "57.2.0", features = ["pyarrow"] } async-stream = "0.3" async-trait = "0.1" -datafusion = { version = "51.0.0" } -datafusion-ffi = { version = "51.0.0" } +datafusion = { version = "52.0.0" } +datafusion-ffi = { version = "52.0.0" } futures = { version = "0.3" } pyo3 = { version = "0.26.0", features = ["extension-module"] } tokio = { version = "1.46.1", features = ["rt"] } diff --git a/pyproject.toml b/pyproject.toml index ae1b3ff..6bfb1d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ classifiers = [ ] dependencies = [ "dask>=2024.8.0", - "datafusion==51.0.0", # This needs to match the cargo datafusion version!! + "datafusion==52.0.0", # This needs to match the cargo datafusion version!! "xarray>=2024.7.0", ] diff --git a/src/lib.rs b/src/lib.rs index cecba67..c489609 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,9 +27,7 @@ //! ## Parallel Execution //! //! Each xarray chunk becomes a separate partition, enabling parallel execution across -//! multiple cores. Due to a bug in DataFusion v51.0.0's `collect()` method, aggregation -//! queries should use `to_arrow_table()` instead to ensure complete results. -//! TODO(#107): Upgrading to the latest datafusion-python (52+) should fix this. +//! multiple cores. //! //! ## Filter Pushdown (Partition Pruning) //! @@ -66,10 +64,10 @@ use datafusion::logical_expr::{ use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::streaming::PartitionStream; use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; +use datafusion_ffi::proto::logical_extension_codec::FFI_LogicalExtensionCodec; use datafusion_ffi::table_provider::FFI_TableProvider; use pyo3::prelude::*; use pyo3::types::{PyCapsule, PyList}; -use tokio::runtime::Handle; // ============================================================================ // Partition Metadata Types for Filter Pushdown @@ -788,6 +786,58 @@ impl PartitionStream for PyArrowStreamPartition { } } +// ============================================================================ +// FFI Helpers +// ============================================================================ + +/// Extract an `FFI_LogicalExtensionCodec` from a Python session object. +/// +/// DataFusion 52 passes the `SessionContext` to `__datafusion_table_provider__` +/// so that the provider can obtain the codec needed for physical-plan +/// serialisation across the FFI boundary. The session exposes this via +/// `__datafusion_logical_extension_codec__()`, which returns a PyCapsule +/// named `"datafusion_logical_extension_codec"`. +/// +/// Mirrors the helper in the official datafusion-python FFI example +/// (`examples/datafusion-ffi-example/src/utils.rs`). +fn ffi_logical_codec_from_pycapsule( + session: Bound<'_, PyAny>, +) -> PyResult { + let attr = "__datafusion_logical_extension_codec__"; + let capsule = if session.hasattr(attr)? { + session.getattr(attr)?.call0()? + } else { + session + }; + + let capsule = capsule.downcast::().map_err(|e| { + pyo3::exceptions::PyValueError::new_err(format!( + "session did not produce a PyCapsule for the logical extension codec: {e}" + )) + })?; + + let capsule_name = capsule.name()?.ok_or_else(|| { + pyo3::exceptions::PyValueError::new_err( + "datafusion_logical_extension_codec PyCapsule has no name set", + ) + })?; + let capsule_name = capsule_name.to_str()?; + if capsule_name != "datafusion_logical_extension_codec" { + return Err(pyo3::exceptions::PyValueError::new_err(format!( + "expected capsule name 'datafusion_logical_extension_codec', got '{capsule_name}'" + ))); + } + + // SAFETY: The capsule was produced by datafusion-python and contains a + // valid FFI_LogicalExtensionCodec (#[repr(C)] StableAbi struct). + let codec = unsafe { capsule.reference::() }; + Ok(codec.clone()) +} + +// ============================================================================ +// Python-visible Table Class +// ============================================================================ + /// A lazy table provider that wraps Python stream factory functions. /// /// This class implements the `__datafusion_table_provider__` protocol, allowing @@ -802,11 +852,6 @@ impl PartitionStream for PyArrowStreamPartition { /// SQL filters on dimension columns (time, lat, lon, etc.) automatically prune /// partitions that can't contain matching rows when metadata is supplied. /// -/// # Note -/// -/// Due to a bug in DataFusion v51.0.0's `collect()` method, use `to_arrow_table()` -/// instead for aggregation queries to ensure complete results. -/// /// # Example /// /// ```python @@ -829,6 +874,7 @@ impl PartitionStream for PyArrowStreamPartition { /// ctx.register_table("air", table) /// result = ctx.sql("SELECT AVG(air) FROM air WHERE time > 500000000").to_arrow_table() /// ``` + #[pyclass(name = "LazyArrowStreamTable")] struct LazyArrowStreamTable { /// The underlying table provider with pruning support @@ -902,27 +948,22 @@ impl LazyArrowStreamTable { /// /// This method is called by DataFusion's `register_table()` to get a /// foreign table provider that can be used in queries. + /// + /// In DataFusion 52+, the caller passes `session` (a `SessionContext`) + /// so that the provider can access task-context and codec information + /// needed for physical plan serialisation across the FFI boundary. fn __datafusion_table_provider__<'py>( &self, py: Python<'py>, + session: Bound<'py, PyAny>, ) -> PyResult> { - // Create the FFI table provider + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider: Arc = self.table.clone(); - // Try to get the current tokio runtime handle (available when called from DataFusion context) - let runtime = Handle::try_current().ok(); + let ffi_provider = FFI_TableProvider::new_with_ffi_codec(provider, true, None, codec); - // Create FFI wrapper with filter pushdown ENABLED - let ffi_provider = FFI_TableProvider::new( - provider, true, // can_support_pushdown_filters = ENABLED - runtime, - ); - - // Create the capsule name let name = CString::new("datafusion_table_provider").unwrap(); - - // Create the PyCapsule without a destructor closure - // The PyCapsule takes ownership of the FFI_TableProvider PyCapsule::new(py, ffi_provider, Some(name)) } diff --git a/xarray_sql/df_test.py b/xarray_sql/df_test.py index abb2456..8eeba0d 100644 --- a/xarray_sql/df_test.py +++ b/xarray_sql/df_test.py @@ -473,9 +473,7 @@ def test_read_xarray_table_memory_bounds(large_ds): # --- Query phase --- ctx = SessionContext() ctx.register_table("weather", table) - ctx.sql( - "SELECT AVG(temperature), AVG(precipitation) FROM weather" - ).to_arrow_table() + ctx.sql("SELECT AVG(temperature), AVG(precipitation) FROM weather").collect() _, query_peak = tracemalloc.get_traced_memory() # tracemalloc measures Python-heap allocations, which include Arrow diff --git a/xarray_sql/reader.py b/xarray_sql/reader.py index 98a11e1..01bb024 100644 --- a/xarray_sql/reader.py +++ b/xarray_sql/reader.py @@ -208,15 +208,10 @@ def read_xarray_table( automatically prune partitions that can't contain matching rows. For example: # This query will skip loading partitions with time < '2020-02-01' - result = ctx.sql('SELECT * FROM air WHERE time > \"2020-02-01\"').to_arrow_table() + result = ctx.sql('SELECT * FROM air WHERE time > \"2020-02-01\"').collect() Supported operators: =, <, >, <=, >=, BETWEEN, IN, AND, OR. - Note: - Due to a bug in DataFusion v51.0.0's collect() method, use - `to_arrow_table()` instead of `collect()` for aggregation queries - to ensure complete results. This should be fixed in datafusion-python 52+. - Args: ds: An xarray Dataset. All data_vars must share the same dimensions. chunks: Xarray-like chunks specification. If not provided, uses @@ -243,7 +238,7 @@ def read_xarray_table( >>> >>> # Data is only read here, during query execution >>> # Filters on 'time' will prune partitions automatically! - >>> result = ctx.sql('SELECT AVG(air) FROM air').to_arrow_table() + >>> result = ctx.sql('SELECT AVG(air) FROM air').collect() """ from ._native import LazyArrowStreamTable diff --git a/xarray_sql/reader_test.py b/xarray_sql/reader_test.py index adfec27..6ab6406 100644 --- a/xarray_sql/reader_test.py +++ b/xarray_sql/reader_test.py @@ -739,12 +739,8 @@ def test_aggregation_with_many_batches(self): """Verify aggregation queries work correctly with many batches. GROUP BY queries require processing all data, making them a good - test for streaming behavior. - - Note: We use to_arrow_table() instead of collect() due to a bug in - DataFusion v51.0.0 where collect() returns partial results for - parallel aggregation queries. - # TODO(#107): Upgrade to latest datafusion-python, which has the fix. + test for streaming behavior. Uses collect() to verify that parallel + aggregation returns complete results (fixed in DataFusion 52+). """ np.random.seed(789) time_coord = pd.date_range("2020-01-01", periods=120, freq="h") @@ -770,14 +766,11 @@ def test_aggregation_with_many_batches(self): ctx = SessionContext() ctx.register_table("test_table", table) - # GROUP BY requires scanning all data - # Use to_arrow_table() to avoid DataFusion collect() bug - result = ctx.sql( + # GROUP BY requires scanning all data; collect() must return complete results + df = ctx.sql( "SELECT lat, AVG(temperature) as avg_temp FROM test_table GROUP BY lat" - ).to_arrow_table() + ).to_pandas() - # Should have result for each lat value - df = result.to_pandas() assert len(df) == 5, f"Expected 5 lat groups, got {len(df)}" # All partitions processed @@ -996,7 +989,7 @@ def test_time_gt_filter_prunes_early_partitions(self, time_chunked_ds): SELECT COUNT(*) as cnt FROM test WHERE time >= '2020-03-16' """ - ).to_arrow_table() + ).to_pandas() # Should read only 1 partition (the last one) assert ( @@ -1004,7 +997,7 @@ def test_time_gt_filter_prunes_early_partitions(self, time_chunked_ds): ), f"Expected 1 partition after filter pushdown, got {tracker.iteration_count}" # Verify data correctness - 25 days * 5 lat = 125 rows - count = result.to_pandas()["cnt"].iloc[0] + count = result["cnt"].iloc[0] assert count == 125, f"Expected 125 rows, got {count}" def test_time_lt_filter_prunes_late_partitions(self, time_chunked_ds): @@ -1026,7 +1019,7 @@ def test_time_lt_filter_prunes_late_partitions(self, time_chunked_ds): SELECT COUNT(*) as cnt FROM test WHERE time < '2020-01-26' """ - ).to_arrow_table() + ).to_pandas() # Should read only 1 partition (the first one) assert ( @@ -1034,7 +1027,7 @@ def test_time_lt_filter_prunes_late_partitions(self, time_chunked_ds): ), f"Expected 1 partition after filter pushdown, got {tracker.iteration_count}" # Verify correctness: Jan 1–25 (25 days) × 5 lat = 125 rows - count = result.to_pandas()["cnt"].iloc[0] + count = result["cnt"].iloc[0] assert count == 125, f"Expected 125 rows, got {count}" def test_time_between_filter_prunes_outside_range(self, time_chunked_ds): @@ -1056,7 +1049,7 @@ def test_time_between_filter_prunes_outside_range(self, time_chunked_ds): SELECT COUNT(*) as cnt FROM test WHERE time BETWEEN '2020-02-01' AND '2020-03-21' """ - ).to_arrow_table() + ).collect() # Partition 0 (Jan 1–25) ends before Feb 1 and is pruned. # Partitions 1, 2, 3 each overlap with Feb 1–Mar 21. @@ -1099,7 +1092,7 @@ def test_lat_filter_prunes_partitions(self): SELECT COUNT(*) as cnt FROM test WHERE lat < 0 """ - ).to_arrow_table() + ).collect() # np.linspace(-90, 90, 100) chunked by 25: # Partition 0: indices 0–24, lat -90.0 to -46.4 (all negative) @@ -1128,7 +1121,7 @@ def test_no_pruning_for_data_column_filters(self, time_chunked_ds): """ SELECT COUNT(*) FROM test WHERE temperature > 0.5 """ - ).to_arrow_table() + ).collect() # All 4 partitions should be read (can't prune on data column) assert ( @@ -1151,11 +1144,11 @@ def test_filter_correctness_preserved(self, time_chunked_ds): SELECT COUNT(*) as cnt FROM test WHERE time >= '2020-02-15' AND time <= '2020-03-15' """ - ).to_arrow_table() + ).to_pandas() # Manual calculation: Feb 15 (day 45) to Mar 15 (day 74) = 30 days # 30 days * 5 lat values = 150 rows - count = filtered.to_pandas()["cnt"].iloc[0] + count = filtered["cnt"].iloc[0] assert count == 150, f"Expected 150 rows, got {count}" def test_and_filter_combines_pruning(self, time_chunked_ds): @@ -1177,7 +1170,7 @@ def test_and_filter_combines_pruning(self, time_chunked_ds): SELECT * FROM test WHERE time >= '2020-03-20' AND time <= '2020-04-05' """ - ).to_arrow_table() + ).collect() # Should read only 1 partition assert ( @@ -1203,7 +1196,7 @@ def test_or_filter_is_conservative(self, time_chunked_ds): SELECT * FROM test WHERE time < '2020-01-10' OR time > '2020-03-30' """ - ).to_arrow_table() + ).collect() # Should read at least 2 partitions (first and last) assert ( @@ -1229,7 +1222,7 @@ def test_empty_result_from_impossible_filter(self, time_chunked_ds): SELECT COUNT(*) as cnt FROM test WHERE time > '2025-01-01' """ - ).to_arrow_table() + ).to_pandas() # Should read 0 partitions (all pruned) assert ( @@ -1237,7 +1230,7 @@ def test_empty_result_from_impossible_filter(self, time_chunked_ds): ), f"Expected 0 partitions for impossible filter, got {tracker.iteration_count}" # Result should be 0 rows - count = result.to_pandas()["cnt"].iloc[0] + count = result["cnt"].iloc[0] assert count == 0, f"Expected 0 rows, got {count}" @@ -1281,7 +1274,7 @@ def test_single_column_select_projects_only_that_variable(self, two_var_ds): ctx = SessionContext() ctx.register_table("data", table) - ctx.sql("SELECT AVG(temperature) FROM data").to_arrow_table() + ctx.sql("SELECT AVG(temperature) FROM data").collect() assert ( tracker.iteration_count > 0 @@ -1308,7 +1301,7 @@ def test_full_select_includes_all_variables(self, two_var_ds): ctx = SessionContext() ctx.register_table("data", table) - ctx.sql("SELECT * FROM data").to_arrow_table() + ctx.sql("SELECT * FROM data").collect() assert ( tracker.iteration_count > 0 @@ -1331,9 +1324,7 @@ def test_multi_column_select_includes_all_requested(self, two_var_ds): ctx = SessionContext() ctx.register_table("data", table) - ctx.sql( - "SELECT AVG(temperature), AVG(precipitation) FROM data" - ).to_arrow_table() + ctx.sql("SELECT AVG(temperature), AVG(precipitation) FROM data").collect() assert tracker.iteration_count > 0 for proj in tracker.projections_seen: @@ -1350,7 +1341,6 @@ def test_projection_result_correctness(self, two_var_ds): projected = ( ctx.sql("SELECT AVG(temperature) as avg_t FROM data") - .to_arrow_table() .to_pandas()["avg_t"] .iloc[0] ) @@ -1373,10 +1363,10 @@ def callback(block, projection_names): ) ctx = SessionContext() ctx.register_table("data", table) - result = ctx.sql("SELECT COUNT(*) FROM data").to_arrow_table() + result = ctx.sql("SELECT COUNT(*) FROM data").collect() total_rows = 10 * 5 # time=10, lat=5 - assert result[0][0].as_py() == total_rows + assert result[0][0][0].as_py() == total_rows assert all( p is None for p in projections_seen ), f"COUNT(*) should not push a projection, but got: {projections_seen}"