diff --git a/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml b/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
index f907f681b1..c1e94674f1 100644
--- a/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml
@@ -9,7 +9,8 @@ body:
       description: What Apache Iceberg version are you using?
       multiple: false
       options:
-        - "0.6.0 (latest release)"
+        - "0.6.1 (latest release)"
+        - "0.6.0"
         - "0.5.0"
         - "0.4.0"
         - "0.3.0"
diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml
index b8d9b5dae3..9519bad0b4 100644
--- a/.github/workflows/python-release.yml
+++ b/.github/workflows/python-release.yml
@@ -59,7 +59,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')

       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.18.1
+        uses: pypa/cibuildwheel@v2.19.1
         with:
           output-dir: wheelhouse
           config-file: "pyproject.toml"
diff --git a/dev/provision.py b/dev/provision.py
index 44086caf20..6c8fe366d7 100644
--- a/dev/provision.py
+++ b/dev/provision.py
@@ -342,3 +342,50 @@ (array(), map(), array(struct(1)))
         """
     )
+
+    spark.sql(
+        f"""
+        CREATE OR REPLACE TABLE {catalog_name}.default.test_table_snapshot_operations (
+            number integer
+        )
+        USING iceberg
+        TBLPROPERTIES (
+            'format-version'='2'
+        );
+        """
+    )
+
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_snapshot_operations
+        VALUES (1)
+        """
+    )
+
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_snapshot_operations
+        VALUES (2)
+        """
+    )
+
+    spark.sql(
+        f"""
+        DELETE FROM {catalog_name}.default.test_table_snapshot_operations
+        WHERE number = 2
+        """
+    )
+
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_snapshot_operations
+        VALUES (3)
+        """
+    )
+
+    spark.sql(
+        f"""
+        INSERT INTO {catalog_name}.default.test_table_snapshot_operations
+        VALUES (4)
+        """
+    )
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index 4751939a8e..6124258366 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -606,6 +606,100 @@ min_snapshots_to_keep: [[null,10]]
 max_snapshot_age_in_ms: [[null,604800000]]
 ```

+### Manifests
+
+To show a table's current file manifests:
+
+```python
+table.inspect.manifests()
+```
+
+```
+pyarrow.Table
+content: int8 not null
+path: string not null
+length: int64 not null
+partition_spec_id: int32 not null
+added_snapshot_id: int64 not null
+added_data_files_count: int32 not null
+existing_data_files_count: int32 not null
+deleted_data_files_count: int32 not null
+added_delete_files_count: int32 not null
+existing_delete_files_count: int32 not null
+deleted_delete_files_count: int32 not null
+partition_summaries: list<item: struct<contains_null: bool not null, contains_nan: bool, lower_bound: string, upper_bound: string>> not null
+  child 0, item: struct<contains_null: bool not null, contains_nan: bool, lower_bound: string, upper_bound: string>
+      child 0, contains_null: bool not null
+      child 1, contains_nan: bool
+      child 2, lower_bound: string
+      child 3, upper_bound: string
+----
+content: [[0]]
+path: [["s3://warehouse/default/table_metadata_manifests/metadata/3bf5b4c6-a7a4-4b43-a6ce-ca2b4887945a-m0.avro"]]
+length: [[6886]]
+partition_spec_id: [[0]]
+added_snapshot_id: [[3815834705531553721]]
+added_data_files_count: [[1]]
+existing_data_files_count: [[0]]
+deleted_data_files_count: [[0]]
+added_delete_files_count: [[0]]
+existing_delete_files_count: [[0]]
+deleted_delete_files_count: [[0]]
+partition_summaries: [[ -- is_valid: all not null
+ -- child 0 type: bool
+[false]
+ -- child 1 type: bool
+[false]
+ -- child 2 type: string
+["test"]
+ -- child 3 type: string
+["test"]]]
+```
+
+### Metadata Log Entries
+
+To show table metadata log entries:
+
+```python
+table.inspect.metadata_log_entries()
+```
+
+```
+pyarrow.Table
+timestamp: timestamp[ms] not null
+file: string not null
+latest_snapshot_id: int64
+latest_schema_id: int32
+latest_sequence_number: int64
+----
+timestamp: [[2024-04-28 17:03:00.214,2024-04-28 17:03:00.352,2024-04-28 17:03:00.445,2024-04-28 17:03:00.498]]
+file: [["s3://warehouse/default/table_metadata_log_entries/metadata/00000-0b3b643b-0f3a-4787-83ad-601ba57b7319.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00001-f74e4b2c-0f89-4f55-822d-23d099fd7d54.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00002-97e31507-e4d9-4438-aff1-3c0c5304d271.metadata.json","s3://warehouse/default/table_metadata_log_entries/metadata/00003-6c8b7033-6ad8-4fe4-b64d-d70381aeaddc.metadata.json"]]
+latest_snapshot_id: [[null,3958871664825505738,1289234307021405706,7640277914614648349]]
+latest_schema_id: [[null,0,0,0]]
+latest_sequence_number: [[null,0,0,0]]
+```
+
+### History
+
+To show a table's history:
+
+```python
+table.inspect.history()
+```
+
+```
+pyarrow.Table
+made_current_at: timestamp[ms] not null
+snapshot_id: int64 not null
+parent_id: int64
+is_current_ancestor: bool not null
+----
+made_current_at: [[2024-06-18 16:17:48.768,2024-06-18 16:17:49.240,2024-06-18 16:17:49.343,2024-06-18 16:17:49.511]]
+snapshot_id: [[4358109269873137077,3380769165026943338,4358109269873137077,3089420140651211776]]
+parent_id: [[null,4358109269873137077,null,4358109269873137077]]
+is_current_ancestor: [[true,false,true,true]]
+```
+
 ### Files

 Inspect the data files in the current snapshot of the table:

 ```python
 table.inspect.files()
 ```
@@ -994,6 +1088,28 @@ tbl.overwrite(df, snapshot_properties={"abc": "def"})
 assert tbl.metadata.snapshots[-1].summary["abc"] == "def"
 ```
+
+## Snapshot Management
+
+Manage snapshots with operations through the `Table` API:
+
+```python
+# To run a specific operation
+table.manage_snapshots().create_tag(snapshot_id, "tag123").commit()
+# To run multiple operations
+table.manage_snapshots() \
+    .create_tag(snapshot_id1, "tag123") \
+    .create_tag(snapshot_id2, "tag456") \
+    .commit()
+# Operations are applied on commit.
+```
+
+You can also use a context manager to make more changes:
+
+```python
+with table.manage_snapshots() as ms:
+    ms.create_branch(snapshot_id1, "Branch_A").create_tag(snapshot_id2, "tag789")
+```
+
 ## Query the data

 To query a table, a table scan is needed. A table scan accepts a filter, columns, optionally a limit and a snapshot ID:
@@ -1062,6 +1178,15 @@ tpep_dropoff_datetime: [[2021-04-01 00:47:59.000000,...,2021-05-01 00:14:47.0000
 This will only pull in the files that that might contain matching rows.
+
+One can also return a PyArrow RecordBatchReader if reading one record batch at a time is preferred:
+
+```python
+table.scan(
+    row_filter=GreaterThanOrEqual("trip_distance", 10.0),
+    selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
+).to_arrow_batch_reader()
+```
+
 ### Pandas

diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md
index 1ca071f009..f8a69119c8 100644
--- a/mkdocs/docs/configuration.md
+++ b/mkdocs/docs/configuration.md
@@ -89,6 +89,7 @@ For the FileIO there are several configuration options available:
 | s3.access-key-id | admin | Configure the static secret access key used to access the FileIO. |
 | s3.secret-access-key | password | Configure the static session token used to access the FileIO. |
 | s3.signer | bearer | Configure the signature version of the FileIO. |
+| s3.signer.uri | http://my.signer:8080/s3 | Configure the remote signing uri if it differs from the catalog uri. Remote signing is only implemented for `FsspecFileIO`. The final request is sent to `<s3.signer.uri>/v1/aws/s3/sign`. |
 | s3.region | us-west-2 | Sets the region of the bucket |
 | s3.proxy-uri | http://my.proxy.com:8080 | Configure the proxy server to be used by the FileIO. |
 | s3.connect-timeout | 60.0 | Configure socket connection timeout, in seconds. |
@@ -298,4 +299,4 @@ PyIceberg uses multiple threads to parallelize operations. The number of workers

 # Backward Compatibility

-Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue
+Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue.
diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md
index 2ecb635610..429af5a3b8 100644
--- a/mkdocs/docs/how-to-release.md
+++ b/mkdocs/docs/how-to-release.md
@@ -214,3 +214,7 @@ Thanks to everyone for contributing!
 ## Release the docs

 A committer triggers the [`Python Docs` Github Actions](https://github.com/apache/iceberg-python/actions/workflows/python-ci-docs.yml) through the UI by selecting the branch that just has been released. This will publish the new docs.
+
+## Update the GitHub template
+
+Make sure to create a PR to update the [GitHub issues template](https://github.com/apache/iceberg-python/blob/main/.github/ISSUE_TEMPLATE/iceberg_bug_report.yml) with the latest version.
diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml
index 90892ac73b..679aff2578 100644
--- a/mkdocs/mkdocs.yml
+++ b/mkdocs/mkdocs.yml
@@ -53,8 +53,11 @@ theme:
       toggle:
         icon: material/brightness-4
         name: Switch to light mode
+
 markdown_extensions:
   - admonition
   - pymdownx.highlight:
       anchor_linenums: true
   - pymdownx.superfences
+  - toc:
+      permalink: true
diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt
index ccf518ba9b..292036d9ef 100644
--- a/mkdocs/requirements.txt
+++ b/mkdocs/requirements.txt
@@ -16,13 +16,13 @@ # under the License.
mkdocs==1.6.0 -griffe==0.45.0 +griffe==0.47.0 jinja2==3.1.4 mkdocstrings==0.25.1 -mkdocstrings-python==1.10.2 +mkdocstrings-python==1.10.5 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.24 +mkdocs-material==9.5.27 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 diff --git a/poetry.lock b/poetry.lock index 3c075152af..db148196d8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -295,20 +295,20 @@ requests = ">=2.20.0" [[package]] name = "azure-identity" -version = "1.15.0" +version = "1.16.1" description = "Microsoft Azure Identity Library for Python" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "azure-identity-1.15.0.tar.gz", hash = "sha256:4c28fc246b7f9265610eb5261d65931183d019a23d4b0e99357facb2e6c227c8"}, - {file = "azure_identity-1.15.0-py3-none-any.whl", hash = "sha256:a14b1f01c7036f11f148f22cd8c16e05035293d714458d6b44ddf534d93eb912"}, + {file = "azure-identity-1.16.1.tar.gz", hash = "sha256:6d93f04468f240d59246d8afde3091494a5040d4f141cad0f49fc0c399d0d91e"}, + {file = "azure_identity-1.16.1-py3-none-any.whl", hash = "sha256:8fb07c25642cd4ac422559a8b50d3e77f73dcc2bbfaba419d06d6c9d7cff6726"}, ] [package.dependencies] -azure-core = ">=1.23.0,<2.0.0" +azure-core = ">=1.23.0" cryptography = ">=2.5" -msal = ">=1.24.0,<2.0.0" -msal-extensions = ">=0.3.0,<2.0.0" +msal = ">=1.24.0" +msal-extensions = ">=0.3.0" [[package]] name = "azure-storage-blob" @@ -652,63 +652,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.1" +version = "7.5.4" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0884920835a033b78d1c73b6d3bbcda8161a900f38a488829a83982925f6c2e"}, - {file = "coverage-7.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:39afcd3d4339329c5f58de48a52f6e4e50f6578dd6099961cf22228feb25f38f"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b0ceee8147444347da6a66be737c9d78f3353b0681715b668b72e79203e4a"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a9ca3f2fae0088c3c71d743d85404cec8df9be818a005ea065495bedc33da35"}, - {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd215c0c7d7aab005221608a3c2b46f58c0285a819565887ee0b718c052aa4e"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4bf0655ab60d754491004a5efd7f9cccefcc1081a74c9ef2da4735d6ee4a6223"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:61c4bf1ba021817de12b813338c9be9f0ad5b1e781b9b340a6d29fc13e7c1b5e"}, - {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db66fc317a046556a96b453a58eced5024af4582a8dbdc0c23ca4dbc0d5b3146"}, - {file = "coverage-7.5.1-cp310-cp310-win32.whl", hash = "sha256:b016ea6b959d3b9556cb401c55a37547135a587db0115635a443b2ce8f1c7228"}, - {file = "coverage-7.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:df4e745a81c110e7446b1cc8131bf986157770fa405fe90e15e850aaf7619bc8"}, - {file = "coverage-7.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:796a79f63eca8814ca3317a1ea443645c9ff0d18b188de470ed7ccd45ae79428"}, - {file = "coverage-7.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fc84a37bfd98db31beae3c2748811a3fa72bf2007ff7902f68746d9757f3746"}, 
- {file = "coverage-7.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6175d1a0559986c6ee3f7fccfc4a90ecd12ba0a383dcc2da30c2b9918d67d8a3"}, - {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fc81d5878cd6274ce971e0a3a18a8803c3fe25457165314271cf78e3aae3aa2"}, - {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:556cf1a7cbc8028cb60e1ff0be806be2eded2daf8129b8811c63e2b9a6c43bca"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9981706d300c18d8b220995ad22627647be11a4276721c10911e0e9fa44c83e8"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d7fed867ee50edf1a0b4a11e8e5d0895150e572af1cd6d315d557758bfa9c057"}, - {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef48e2707fb320c8f139424a596f5b69955a85b178f15af261bab871873bb987"}, - {file = "coverage-7.5.1-cp311-cp311-win32.whl", hash = "sha256:9314d5678dcc665330df5b69c1e726a0e49b27df0461c08ca12674bcc19ef136"}, - {file = "coverage-7.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fa567e99765fe98f4e7d7394ce623e794d7cabb170f2ca2ac5a4174437e90dd"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, - {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, - {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, - {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, - {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, - {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, - {file = "coverage-7.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e2213def81a50519d7cc56ed643c9e93e0247f5bbe0d1247d15fa520814a7cd7"}, - {file = "coverage-7.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5037f8fcc2a95b1f0e80585bd9d1ec31068a9bcb157d9750a172836e98bc7a90"}, - {file = "coverage-7.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3721c2c9e4c4953a41a26c14f4cef64330392a6d2d675c8b1db3b645e31f0e"}, - {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca498687ca46a62ae590253fba634a1fe9836bc56f626852fb2720f334c9e4e5"}, - {file = 
"coverage-7.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cdcbc320b14c3e5877ee79e649677cb7d89ef588852e9583e6b24c2e5072661"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:57e0204b5b745594e5bc14b9b50006da722827f0b8c776949f1135677e88d0b8"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fe7502616b67b234482c3ce276ff26f39ffe88adca2acf0261df4b8454668b4"}, - {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9e78295f4144f9dacfed4f92935fbe1780021247c2fabf73a819b17f0ccfff8d"}, - {file = "coverage-7.5.1-cp38-cp38-win32.whl", hash = "sha256:1434e088b41594baa71188a17533083eabf5609e8e72f16ce8c186001e6b8c41"}, - {file = "coverage-7.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:0646599e9b139988b63704d704af8e8df7fa4cbc4a1f33df69d97f36cb0a38de"}, - {file = "coverage-7.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4cc37def103a2725bc672f84bd939a6fe4522310503207aae4d56351644682f1"}, - {file = "coverage-7.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc0b4d8bfeabd25ea75e94632f5b6e047eef8adaed0c2161ada1e922e7f7cece"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d0a0f5e06881ecedfe6f3dd2f56dcb057b6dbeb3327fd32d4b12854df36bf26"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9735317685ba6ec7e3754798c8871c2f49aa5e687cc794a0b1d284b2389d1bd5"}, - {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d21918e9ef11edf36764b93101e2ae8cc82aa5efdc7c5a4e9c6c35a48496d601"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c3e757949f268364b96ca894b4c342b41dc6f8f8b66c37878aacef5930db61be"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:79afb6197e2f7f60c4824dd4b2d4c2ec5801ceb6ba9ce5d2c3080e5660d51a4f"}, - {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d1d0d98d95dd18fe29dc66808e1accf59f037d5716f86a501fc0256455219668"}, - {file = "coverage-7.5.1-cp39-cp39-win32.whl", hash = "sha256:1cc0fe9b0b3a8364093c53b0b4c0c2dd4bb23acbec4c9240b5f284095ccf7981"}, - {file = "coverage-7.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:dde0070c40ea8bb3641e811c1cfbf18e265d024deff6de52c5950677a8fb1e0f"}, - {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, - {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, + {file = "coverage-7.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cfb5a4f556bb51aba274588200a46e4dd6b505fb1a5f8c5ae408222eb416f99"}, + {file = "coverage-7.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2174e7c23e0a454ffe12267a10732c273243b4f2d50d07544a91198f05c48f47"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2214ee920787d85db1b6a0bd9da5f8503ccc8fcd5814d90796c2f2493a2f4d2e"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1137f46adb28e3813dec8c01fefadcb8c614f33576f672962e323b5128d9a68d"}, + {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b385d49609f8e9efc885790a5a0e89f2e3ae042cdf12958b6034cc442de428d3"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b4a474f799456e0eb46d78ab07303286a84a3140e9700b9e154cfebc8f527016"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5cd64adedf3be66f8ccee418473c2916492d53cbafbfcff851cbec5a8454b136"}, + {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e564c2cf45d2f44a9da56f4e3a26b2236504a496eb4cb0ca7221cd4cc7a9aca9"}, + {file = "coverage-7.5.4-cp310-cp310-win32.whl", hash = "sha256:7076b4b3a5f6d2b5d7f1185fde25b1e54eb66e647a1dfef0e2c2bfaf9b4c88c8"}, + {file = "coverage-7.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:018a12985185038a5b2bcafab04ab833a9a0f2c59995b3cec07e10074c78635f"}, + {file = "coverage-7.5.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:db14f552ac38f10758ad14dd7b983dbab424e731588d300c7db25b6f89e335b5"}, + {file = "coverage-7.5.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3257fdd8e574805f27bb5342b77bc65578e98cbc004a92232106344053f319ba"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a6612c99081d8d6134005b1354191e103ec9705d7ba2754e848211ac8cacc6b"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d45d3cbd94159c468b9b8c5a556e3f6b81a8d1af2a92b77320e887c3e7a5d080"}, + {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a892be37ca35eb5019ec85402c3371b0f7cda5ab5056023a7f13da0961e60da"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8192794d120167e2a64721d88dbd688584675e86e15d0569599257566dec9bf0"}, + {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:820bc841faa502e727a48311948e0461132a9c8baa42f6b2b84a29ced24cc078"}, + {file = "coverage-7.5.4-cp311-cp311-win32.whl", hash = "sha256:6aae5cce399a0f065da65c7bb1e8abd5c7a3043da9dceb429ebe1b289bc07806"}, + {file = "coverage-7.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:d2e344d6adc8ef81c5a233d3a57b3c7d5181f40e79e05e1c143da143ccb6377d"}, + {file = "coverage-7.5.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:54317c2b806354cbb2dc7ac27e2b93f97096912cc16b18289c5d4e44fc663233"}, + {file = "coverage-7.5.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:042183de01f8b6d531e10c197f7f0315a61e8d805ab29c5f7b51a01d62782747"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6bb74ed465d5fb204b2ec41d79bcd28afccf817de721e8a807d5141c3426638"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3d45ff86efb129c599a3b287ae2e44c1e281ae0f9a9bad0edc202179bcc3a2e"}, + {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555"}, + {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1014fbf665fef86cdfd6cb5b7371496ce35e4d2a00cda501cf9f5b9e6fced69f"}, + {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3684bc2ff328f935981847082ba4fdc950d58906a40eafa93510d1b54c08a66c"}, + {file = 
"coverage-7.5.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:581ea96f92bf71a5ec0974001f900db495488434a6928a2ca7f01eee20c23805"}, + {file = "coverage-7.5.4-cp312-cp312-win32.whl", hash = "sha256:73ca8fbc5bc622e54627314c1a6f1dfdd8db69788f3443e752c215f29fa87a0b"}, + {file = "coverage-7.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:cef4649ec906ea7ea5e9e796e68b987f83fa9a718514fe147f538cfeda76d7a7"}, + {file = "coverage-7.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdd31315fc20868c194130de9ee6bfd99755cc9565edff98ecc12585b90be882"}, + {file = "coverage-7.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:02ff6e898197cc1e9fa375581382b72498eb2e6d5fc0b53f03e496cfee3fac6d"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d05c16cf4b4c2fc880cb12ba4c9b526e9e5d5bb1d81313d4d732a5b9fe2b9d53"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5986ee7ea0795a4095ac4d113cbb3448601efca7f158ec7f7087a6c705304e4"}, + {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df54843b88901fdc2f598ac06737f03d71168fd1175728054c8f5a2739ac3e4"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ab73b35e8d109bffbda9a3e91c64e29fe26e03e49addf5b43d85fc426dde11f9"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:aea072a941b033813f5e4814541fc265a5c12ed9720daef11ca516aeacd3bd7f"}, + {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:16852febd96acd953b0d55fc842ce2dac1710f26729b31c80b940b9afcd9896f"}, + {file = "coverage-7.5.4-cp38-cp38-win32.whl", hash = "sha256:8f894208794b164e6bd4bba61fc98bf6b06be4d390cf2daacfa6eca0a6d2bb4f"}, + {file = "coverage-7.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:e2afe743289273209c992075a5a4913e8d007d569a406ffed0bd080ea02b0633"}, + {file = "coverage-7.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b95c3a8cb0463ba9f77383d0fa8c9194cf91f64445a63fc26fb2327e1e1eb088"}, + {file = "coverage-7.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7564cc09dd91b5a6001754a5b3c6ecc4aba6323baf33a12bd751036c998be4"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44da56a2589b684813f86d07597fdf8a9c6ce77f58976727329272f5a01f99f7"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e16f3d6b491c48c5ae726308e6ab1e18ee830b4cdd6913f2d7f77354b33f91c8"}, + {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a04e990a2a41740b02d6182b498ee9796cf60eefe40cf859b016650147908029"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ddbd2f9713a79e8e7242d7c51f1929611e991d855f414ca9996c20e44a895f7c"}, + {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b1ccf5e728ccf83acd313c89f07c22d70d6c375a9c6f339233dcf792094bcbf7"}, + {file = "coverage-7.5.4-cp39-cp39-win32.whl", hash = "sha256:56b4eafa21c6c175b3ede004ca12c653a88b6f922494b023aeb1e836df953ace"}, + {file = "coverage-7.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:65e528e2e921ba8fd67d9055e6b9f9e34b21ebd6768ae1c1723f4ea6ace1234d"}, + {file = 
"coverage-7.5.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:79b356f3dd5b26f3ad23b35c75dbdaf1f9e2450b6bcefc6d0825ea0aa3f86ca5"}, + {file = "coverage-7.5.4.tar.gz", hash = "sha256:a44963520b069e12789d0faea4e9fdb1e410cdc4aab89d94f7f55cbb7fef0353"}, ] [package.dependencies] @@ -1030,58 +1030,57 @@ files = [ [[package]] name = "duckdb" -version = "0.10.2" +version = "1.0.0" description = "DuckDB in-process database" optional = true python-versions = ">=3.7.0" files = [ - {file = "duckdb-0.10.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3891d3ac03e12a3e5c43afa3020fe701f64060f52d25f429a1ed7b5d914368d3"}, - {file = "duckdb-0.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f63877651f1fb940e049dc53038eb763856616319acf4f892b1c3ed074f5ab0"}, - {file = "duckdb-0.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:06e3a36f04f4d98d2c0bbdd63e517cfbe114a795306e26ec855e62e076af5043"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf5f95ad5b75c8e65c6508b4df02043dd0b9d97712b9a33236ad77c388ce7861"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ff62bc98278c98fecbd6eecec5d698ad41ebd654110feaadbf8ac8bb59b1ecf"}, - {file = "duckdb-0.10.2-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cceede13fde095c23cf9a53adf7c414c7bfb21b9a7aa6a4836014fdbecbfca70"}, - {file = "duckdb-0.10.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:acdfff60b7efccd7f731213a9795851256249dfacf80367074b2b2e144f716dd"}, - {file = "duckdb-0.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:4a5d5655cf0bdaf664a6f332afe465e02b08cef715548a0983bb7aef48da06a6"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a9d15842876d18763e085648656cccc7660a215d16254906db5c4471be2c7732"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c88cdcdc8452c910e4298223e7d9fca291534ff5aa36090aa49c9e6557550b13"}, - {file = "duckdb-0.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:364cd6f5dc8a1010d144d08c410ba9a74c521336ee5bda84fabc6616216a6d6a"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4c57c11d1060296f5e9ebfb5bb7e5521e0d77912e8f9ff43c90240c3311e9de9"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:186d86b8dda8e1076170eb770bb2bb73ea88ca907d92885c9695d6515207b205"}, - {file = "duckdb-0.10.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f65b62f31c6bff21afc0261cfe28d238b8f34ec78f339546b12f4740c39552a"}, - {file = "duckdb-0.10.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a860d7466a5c93714cdd94559ce9e1db2ab91914f0941c25e5e93d4ebe36a5fa"}, - {file = "duckdb-0.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:33308190e9c7f05a3a0a2d46008a043effd4eae77011869d7c18fb37acdd9215"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:3a8b2f1229b4aecb79cd28ffdb99032b1497f0a805d0da1136a9b6115e1afc70"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d23a6dea61963733a0f45a0d0bbb1361fb2a47410ed5ff308b4a1f869d4eeb6f"}, - {file = "duckdb-0.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:20ee0aa27e688aa52a40b434ec41a50431d0b06edeab88edc2feaca18d82c62c"}, - {file = "duckdb-0.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80a6d43d9044f0997a15a92e0c0ff3afd21151a1e572a92f439cc4f56b7090e1"}, - {file 
= "duckdb-0.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6934758cacd06029a5c9f54556a43bd277a86757e22bf8d0dd11ca15c1813d1c"}, - {file = "duckdb-0.10.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a11e2d68bd79044eea5486b1cddb5b915115f537e5c74eeb94c768ce30f9f4b"}, - {file = "duckdb-0.10.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0bf58385c43b8e448a2fea7e8729054934bf73ea616d1d7ef8184eda07f975e2"}, - {file = "duckdb-0.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:eae75c7014597ded6e7f6dc51e32d48362a31608acd73e9f795748ee94335a54"}, - {file = "duckdb-0.10.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:62e89deff778a7a86f651802b947a3466425f6cce41e9d7d412d39e492932943"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f87e555fd36ec6da316b727a39fb24c53124a797dfa9b451bdea87b2f20a351f"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41e8b34b1a944590ebcf82f8cc59d67b084fe99479f048892d60da6c1402c386"}, - {file = "duckdb-0.10.2-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c68c6dde2773774cf2371522a3959ea2716fc2b3a4891d4066f0e426455fe19"}, - {file = "duckdb-0.10.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ff6a8a0980d0f9398fa461deffa59465dac190d707468478011ea8a5fe1f2c81"}, - {file = "duckdb-0.10.2-cp37-cp37m-win_amd64.whl", hash = "sha256:728dd4ff0efda387a424754e5508d4f8c72a272c2d3ccb036a83286f60b46002"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c461d6b4619e80170044a9eb999bbf4097e330d3a4974ced0a7eaeb79c7c39f6"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:909351ff72eb3b50b89761251148d8a186594d8a438e12dcf5494794caff6693"}, - {file = "duckdb-0.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d9eeb8393d69abafd355b869669957eb85b89e4df677e420b9ef0693b7aa6cb4"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3102bcf5011e8f82ea3c2bde43108774fe5a283a410d292c0843610ea13e2237"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d64d443613e5f16caf7d67102733538c90f7715867c1a98597efd3babca068e3"}, - {file = "duckdb-0.10.2-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cb31398826d1b7473344e5ee8e0f826370c9752549469ba1327042ace9041f80"}, - {file = "duckdb-0.10.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d09dcec467cd6127d5cc1fb0ce4efbd77e761882d9d772b0f64fc2f79a2a1cde"}, - {file = "duckdb-0.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:82fab1a24faf7c33d8a7afed08b57ee36e8821a3a68a2f1574cd238ea440bba0"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38607e6e6618e8ea28c8d9b67aa9e22cfd6d6d673f2e8ab328bd6e867b697f69"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fb0c23bc8c09615bff38aebcf8e92e6ae74959c67b3c9e5b00edddc730bf22be"}, - {file = "duckdb-0.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:00576c11c78c83830ab483bad968e07cd9b5f730e7ffaf5aa5fadee5ac4f71e9"}, - {file = "duckdb-0.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077db692cdda50c4684ef87dc2a68507665804caa90e539dbe819116bda722ad"}, - {file = "duckdb-0.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca25984ad9f9a04e46e8359f852668c11569534e3bb8424b80be711303ad2314"}, - {file = 
"duckdb-0.10.2-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6a72cc40982c7b92cf555e574618fc711033b013bf258b611ba18d7654c89d8c"}, - {file = "duckdb-0.10.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d27b9efd6e788eb561535fdc0cbc7c74aca1ff39f748b7cfc27aa49b00e22da1"}, - {file = "duckdb-0.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:4800469489bc262dda61a7f1d40acedf67cf2454874e9d8bbf07920dc2b147e6"}, - {file = "duckdb-0.10.2.tar.gz", hash = "sha256:0f609c9d5f941f1ecde810f010dd9321cd406a552c1df20318a13fa64247f67f"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4a8ce2d1f9e1c23b9bab3ae4ca7997e9822e21563ff8f646992663f66d050211"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:19797670f20f430196e48d25d082a264b66150c264c1e8eae8e22c64c2c5f3f5"}, + {file = "duckdb-1.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:b71c342090fe117b35d866a91ad6bffce61cd6ff3e0cff4003f93fc1506da0d8"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dd69f44ad212c35ae2ea736b0e643ea2b70f204b8dff483af1491b0e2a4cec"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8da5f293ecb4f99daa9a9352c5fd1312a6ab02b464653a0c3a25ab7065c45d4d"}, + {file = "duckdb-1.0.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3207936da9967ddbb60644ec291eb934d5819b08169bc35d08b2dedbe7068c60"}, + {file = "duckdb-1.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1128d6c9c33e883b1f5df6b57c1eb46b7ab1baf2650912d77ee769aaa05111f9"}, + {file = "duckdb-1.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:02310d263474d0ac238646677feff47190ffb82544c018b2ff732a4cb462c6ef"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:75586791ab2702719c284157b65ecefe12d0cca9041da474391896ddd9aa71a4"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:83bb415fc7994e641344f3489e40430ce083b78963cb1057bf714ac3a58da3ba"}, + {file = "duckdb-1.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:bee2e0b415074e84c5a2cefd91f6b5ebeb4283e7196ba4ef65175a7cef298b57"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa5a4110d2a499312609544ad0be61e85a5cdad90e5b6d75ad16b300bf075b90"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fa389e6a382d4707b5f3d1bc2087895925ebb92b77e9fe3bfb23c9b98372fdc"}, + {file = "duckdb-1.0.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7ede6f5277dd851f1a4586b0c78dc93f6c26da45e12b23ee0e88c76519cbdbe0"}, + {file = "duckdb-1.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0b88cdbc0d5c3e3d7545a341784dc6cafd90fc035f17b2f04bf1e870c68456e5"}, + {file = "duckdb-1.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fd1693cdd15375156f7fff4745debc14e5c54928589f67b87fb8eace9880c370"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c65a7fe8a8ce21b985356ee3ec0c3d3b3b2234e288e64b4cfb03356dbe6e5583"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:e5a8eda554379b3a43b07bad00968acc14dd3e518c9fbe8f128b484cf95e3d16"}, + {file = "duckdb-1.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:a1b6acdd54c4a7b43bd7cb584975a1b2ff88ea1a31607a2b734b17960e7d3088"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a677bb1b6a8e7cab4a19874249d8144296e6e39dae38fce66a80f26d15e670df"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:752e9d412b0a2871bf615a2ede54be494c6dc289d076974eefbf3af28129c759"}, + {file = "duckdb-1.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3aadb99d098c5e32d00dc09421bc63a47134a6a0de9d7cd6abf21780b678663c"}, + {file = "duckdb-1.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83b7091d4da3e9301c4f9378833f5ffe934fb1ad2b387b439ee067b2c10c8bb0"}, + {file = "duckdb-1.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:6a8058d0148b544694cb5ea331db44f6c2a00a7b03776cc4dd1470735c3d5ff7"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e40cb20e5ee19d44bc66ec99969af791702a049079dc5f248c33b1c56af055f4"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7bce1bc0de9af9f47328e24e6e7e39da30093179b1c031897c042dd94a59c8e"}, + {file = "duckdb-1.0.0-cp37-cp37m-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8355507f7a04bc0a3666958f4414a58e06141d603e91c0fa5a7c50e49867fb6d"}, + {file = "duckdb-1.0.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:39f1a46f5a45ad2886dc9b02ce5b484f437f90de66c327f86606d9ba4479d475"}, + {file = "duckdb-1.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a6d29ba477b27ae41676b62c8fae8d04ee7cbe458127a44f6049888231ca58fa"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:1bea713c1925918714328da76e79a1f7651b2b503511498ccf5e007a7e67d49e"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_universal2.whl", hash = "sha256:bfe67f3bcf181edbf6f918b8c963eb060e6aa26697d86590da4edc5707205450"}, + {file = "duckdb-1.0.0-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:dbc6093a75242f002be1d96a6ace3fdf1d002c813e67baff52112e899de9292f"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba1881a2b11c507cee18f8fd9ef10100be066fddaa2c20fba1f9a664245cd6d8"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:445d0bb35087c522705c724a75f9f1c13f1eb017305b694d2686218d653c8142"}, + {file = "duckdb-1.0.0-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:224553432e84432ffb9684f33206572477049b371ce68cc313a01e214f2fbdda"}, + {file = "duckdb-1.0.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:d3914032e47c4e76636ad986d466b63fdea65e37be8a6dfc484ed3f462c4fde4"}, + {file = "duckdb-1.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:af9128a2eb7e1bb50cd2c2020d825fb2946fdad0a2558920cd5411d998999334"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dd2659a5dbc0df0de68f617a605bf12fe4da85ba24f67c08730984a0892087e8"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:ac5a4afb0bc20725e734e0b2c17e99a274de4801aff0d4e765d276b99dad6d90"}, + {file = "duckdb-1.0.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:2c5a53bee3668d6e84c0536164589d5127b23d298e4c443d83f55e4150fafe61"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b980713244d7708b25ee0a73de0c65f0e5521c47a0e907f5e1b933d79d972ef6"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21cbd4f9fe7b7a56eff96c3f4d6778770dd370469ca2212eddbae5dd63749db5"}, + {file = "duckdb-1.0.0-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:ed228167c5d49888c5ef36f6f9cbf65011c2daf9dcb53ea8aa7a041ce567b3e4"}, + {file = "duckdb-1.0.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:46d8395fbcea7231fd5032a250b673cc99352fef349b718a23dea2c0dd2b8dec"}, + {file = "duckdb-1.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:6ad1fc1a4d57e7616944166a5f9417bdbca1ea65c490797e3786e3a42e162d8a"}, + {file = "duckdb-1.0.0.tar.gz", hash = "sha256:a2a059b77bc7d5b76ae9d88e267372deff19c291048d59450c431e166233d453"}, ] [[package]] @@ -1344,17 +1343,17 @@ gcsfuse = ["fusepy"] [[package]] name = "getdaft" -version = "0.2.24" +version = "0.2.28" description = "Distributed Dataframes for Multimodal Data" optional = true python-versions = ">=3.8" files = [ - {file = "getdaft-0.2.24-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:6dbb2c25f14c008fe1323590dc86bbed9d0de8b470aa62c0844bb218864b42da"}, - {file = "getdaft-0.2.24-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:1c27ff4e3e00275db611c8fad5edefc1a24f8494093ce18f0b846b147b4d6cd6"}, - {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae0d0ae1238fa5eb2ddfbefbc52e47aa6f9d00e9621dde0ecbee70be43cee8e8"}, - {file = "getdaft-0.2.24-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:473881f9406d166dace7f12a3cb74915f8901b628f6d9f0900fdf69cf05b0031"}, - {file = "getdaft-0.2.24-cp38-abi3-win_amd64.whl", hash = "sha256:c77266e55245c95a5c972dd49a47a764cde1b2007cc30ab08c2f25f7a36d6697"}, - {file = "getdaft-0.2.24.tar.gz", hash = "sha256:1fa4eae81ab101bed544ee64e3128e2df4f267a87640cd1473e00f944c32a216"}, + {file = "getdaft-0.2.28-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:052632bf156dfabc61b00bc3e055f11c045ed1011818ed398e82bee549346510"}, + {file = "getdaft-0.2.28-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d120504f05dadac6fa0c170558f2635e5654d1e49ffcd95c20952847427e069"}, + {file = "getdaft-0.2.28-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:477d77f695129843d1bdfe3896d17cd5af43024e06c1956077f6afe2069e4dcf"}, + {file = "getdaft-0.2.28-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da7be6b900798061090ea99f474ad1d128fb493f958c17854248eacfad68a969"}, + {file = "getdaft-0.2.28-cp38-abi3-win_amd64.whl", hash = "sha256:679a9d26f76f695f4fa3c51c732c02f511eeb5a832b305bbd237c2e62333f815"}, + {file = "getdaft-0.2.28.tar.gz", hash = "sha256:1389ef47caa61f0daf3217b4bd5042b50e854bfb1315b104341110c09a6c072f"}, ] [package.dependencies] @@ -1364,7 +1363,7 @@ tqdm = "*" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.10\""} [package.extras] -all = ["getdaft[aws,azure,deltalake,gcp,iceberg,numpy,pandas,ray,sql]"] +all = ["getdaft[aws,azure,deltalake,gcp,iceberg,numpy,pandas,ray,sql,unity]"] aws = ["boto3"] deltalake = ["deltalake"] hudi = ["pyarrow (>=8.0.0)"] @@ -1374,6 +1373,7 @@ numpy = ["numpy"] pandas = ["pandas"] ray = ["packaging", "ray[client,data] (>=2.0.0)", "ray[client,data] (>=2.10.0)"] sql = ["connectorx", "sqlalchemy", "sqlglot"] +unity = ["unitycatalog"] [[package]] name = "google-api-core" @@ -2212,13 +2212,13 @@ test = ["mypy (>=1.0)", "pytest (>=7.0.0)"] [[package]] name = "moto" -version = "5.0.7" +version = "5.0.9" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.7-py2.py3-none-any.whl", hash = "sha256:c0214c1361fb1dc85f587d9ce17cd988c6f69ff0ed54d43789654022e0e744f2"}, - {file = "moto-5.0.7.tar.gz", hash = "sha256:f2cde691dc4bc675e318a65f018902ac7f89d61bf2646052f7df215d212f069e"}, + {file = 
"moto-5.0.9-py2.py3-none-any.whl", hash = "sha256:21a13e02f83d6a18cfcd99949c96abb2e889f4bd51c4c6a3ecc8b78765cb854e"}, + {file = "moto-5.0.9.tar.gz", hash = "sha256:eb71f1cba01c70fff1f16086acb24d6d9aeb32830d646d8989f98a29aeae24ba"}, ] [package.dependencies] @@ -2288,22 +2288,22 @@ tests = ["pytest (>=4.6)"] [[package]] name = "msal" -version = "1.26.0" +version = "1.28.0" description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." optional = true -python-versions = ">=2.7" +python-versions = ">=3.7" files = [ - {file = "msal-1.26.0-py2.py3-none-any.whl", hash = "sha256:be77ba6a8f49c9ff598bbcdc5dfcf1c9842f3044300109af738e8c3e371065b5"}, - {file = "msal-1.26.0.tar.gz", hash = "sha256:224756079fe338be838737682b49f8ebc20a87c1c5eeaf590daae4532b83de15"}, + {file = "msal-1.28.0-py3-none-any.whl", hash = "sha256:3064f80221a21cd535ad8c3fafbb3a3582cd9c7e9af0bb789ae14f726a0ca99b"}, + {file = "msal-1.28.0.tar.gz", hash = "sha256:80bbabe34567cb734efd2ec1869b2d98195c927455369d8077b3c542088c5c9d"}, ] [package.dependencies] -cryptography = ">=0.6,<44" +cryptography = ">=0.6,<45" PyJWT = {version = ">=1.0.0,<3", extras = ["crypto"]} requests = ">=2.0.0,<3" [package.extras] -broker = ["pymsalruntime (>=0.13.2,<0.14)"] +broker = ["pymsalruntime (>=0.13.2,<0.15)"] [[package]] name = "msal-extensions" @@ -2490,13 +2490,13 @@ files = [ [[package]] name = "mypy-boto3-glue" -version = "1.34.88" -description = "Type annotations for boto3.Glue 1.34.88 service generated with mypy-boto3-builder 7.23.2" +version = "1.34.131" +description = "Type annotations for boto3.Glue 1.34.131 service generated with mypy-boto3-builder 7.24.0" optional = true python-versions = ">=3.8" files = [ - {file = "mypy_boto3_glue-1.34.88-py3-none-any.whl", hash = "sha256:bb5c4ac3ac4806fb19ff3bebe2400635cf0d959e4a086a3de36b0eccbf04febc"}, - {file = "mypy_boto3_glue-1.34.88.tar.gz", hash = "sha256:7626368b66c92236f57008bf56303f3eda1ef2705ffe0d2cd845b1b877eb0596"}, + {file = "mypy_boto3_glue-1.34.131-py3-none-any.whl", hash = "sha256:b1c6cc67749f991f95106454eaad9061057d1381b8d290ed8020d15005e15071"}, + {file = "mypy_boto3_glue-1.34.131.tar.gz", hash = "sha256:4a67430aa68456817d0c1afa52ff529e0d8eccf8478a322020b143cc4be04ef5"}, ] [package.dependencies] @@ -3024,18 +3024,18 @@ files = [ [[package]] name = "pydantic" -version = "2.7.1" +version = "2.7.4" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.7.1-py3-none-any.whl", hash = "sha256:e029badca45266732a9a79898a15ae2e8b14840b1eabbb25844be28f0b33f3d5"}, - {file = "pydantic-2.7.1.tar.gz", hash = "sha256:e9dbb5eada8abe4d9ae5f46b9939aead650cd2b68f249bb3a8139dbe125803cc"}, + {file = "pydantic-2.7.4-py3-none-any.whl", hash = "sha256:ee8538d41ccb9c0a9ad3e0e5f07bf15ed8015b481ced539a1759d8cc89ae90d0"}, + {file = "pydantic-2.7.4.tar.gz", hash = "sha256:0c84efd9548d545f63ac0060c1e4d39bb9b14db8b3c0652338aecc07b5adec52"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.18.2" +pydantic-core = "2.18.4" typing-extensions = ">=4.6.1" [package.extras] @@ -3043,90 +3043,90 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.18.2" +version = "2.18.4" description = "Core functionality for Pydantic validation and serialization" optional = 
false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:9e08e867b306f525802df7cd16c44ff5ebbe747ff0ca6cf3fde7f36c05a59a81"}, - {file = "pydantic_core-2.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f0a21cbaa69900cbe1a2e7cad2aa74ac3cf21b10c3efb0fa0b80305274c0e8a2"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0680b1f1f11fda801397de52c36ce38ef1c1dc841a0927a94f226dea29c3ae3d"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:95b9d5e72481d3780ba3442eac863eae92ae43a5f3adb5b4d0a1de89d42bb250"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fcf5cd9c4b655ad666ca332b9a081112cd7a58a8b5a6ca7a3104bc950f2038"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b5155ff768083cb1d62f3e143b49a8a3432e6789a3abee8acd005c3c7af1c74"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553ef617b6836fc7e4df130bb851e32fe357ce36336d897fd6646d6058d980af"}, - {file = "pydantic_core-2.18.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b89ed9eb7d616ef5714e5590e6cf7f23b02d0d539767d33561e3675d6f9e3857"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:75f7e9488238e920ab6204399ded280dc4c307d034f3924cd7f90a38b1829563"}, - {file = "pydantic_core-2.18.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ef26c9e94a8c04a1b2924149a9cb081836913818e55681722d7f29af88fe7b38"}, - {file = "pydantic_core-2.18.2-cp310-none-win32.whl", hash = "sha256:182245ff6b0039e82b6bb585ed55a64d7c81c560715d1bad0cbad6dfa07b4027"}, - {file = "pydantic_core-2.18.2-cp310-none-win_amd64.whl", hash = "sha256:e23ec367a948b6d812301afc1b13f8094ab7b2c280af66ef450efc357d2ae543"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:219da3f096d50a157f33645a1cf31c0ad1fe829a92181dd1311022f986e5fbe3"}, - {file = "pydantic_core-2.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc1cfd88a64e012b74e94cd00bbe0f9c6df57049c97f02bb07d39e9c852e19a4"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05b7133a6e6aeb8df37d6f413f7705a37ab4031597f64ab56384c94d98fa0e90"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:224c421235f6102e8737032483f43c1a8cfb1d2f45740c44166219599358c2cd"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b14d82cdb934e99dda6d9d60dc84a24379820176cc4a0d123f88df319ae9c150"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2728b01246a3bba6de144f9e3115b532ee44bd6cf39795194fb75491824a1413"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:470b94480bb5ee929f5acba6995251ada5e059a5ef3e0dfc63cca287283ebfa6"}, - {file = "pydantic_core-2.18.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:997abc4df705d1295a42f95b4eec4950a37ad8ae46d913caeee117b6b198811c"}, - {file = "pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75250dbc5290e3f1a0f4618db35e51a165186f9034eff158f3d490b3fed9f8a0"}, - {file = 
"pydantic_core-2.18.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4456f2dca97c425231d7315737d45239b2b51a50dc2b6f0c2bb181fce6207664"}, - {file = "pydantic_core-2.18.2-cp311-none-win32.whl", hash = "sha256:269322dcc3d8bdb69f054681edff86276b2ff972447863cf34c8b860f5188e2e"}, - {file = "pydantic_core-2.18.2-cp311-none-win_amd64.whl", hash = "sha256:800d60565aec896f25bc3cfa56d2277d52d5182af08162f7954f938c06dc4ee3"}, - {file = "pydantic_core-2.18.2-cp311-none-win_arm64.whl", hash = "sha256:1404c69d6a676245199767ba4f633cce5f4ad4181f9d0ccb0577e1f66cf4c46d"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:fb2bd7be70c0fe4dfd32c951bc813d9fe6ebcbfdd15a07527796c8204bd36242"}, - {file = "pydantic_core-2.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6132dd3bd52838acddca05a72aafb6eab6536aa145e923bb50f45e78b7251043"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7d904828195733c183d20a54230c0df0eb46ec746ea1a666730787353e87182"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c9bd70772c720142be1020eac55f8143a34ec9f82d75a8e7a07852023e46617f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8ed04b3582771764538f7ee7001b02e1170223cf9b75dff0bc698fadb00cf3"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e6dac87ddb34aaec85f873d737e9d06a3555a1cc1a8e0c44b7f8d5daeb89d86f"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ca4ae5a27ad7a4ee5170aebce1574b375de390bc01284f87b18d43a3984df72"}, - {file = "pydantic_core-2.18.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:886eec03591b7cf058467a70a87733b35f44707bd86cf64a615584fd72488b7c"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ca7b0c1f1c983e064caa85f3792dd2fe3526b3505378874afa84baf662e12241"}, - {file = "pydantic_core-2.18.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b4356d3538c3649337df4074e81b85f0616b79731fe22dd11b99499b2ebbdf3"}, - {file = "pydantic_core-2.18.2-cp312-none-win32.whl", hash = "sha256:8b172601454f2d7701121bbec3425dd71efcb787a027edf49724c9cefc14c038"}, - {file = "pydantic_core-2.18.2-cp312-none-win_amd64.whl", hash = "sha256:b1bd7e47b1558ea872bd16c8502c414f9e90dcf12f1395129d7bb42a09a95438"}, - {file = "pydantic_core-2.18.2-cp312-none-win_arm64.whl", hash = "sha256:98758d627ff397e752bc339272c14c98199c613f922d4a384ddc07526c86a2ec"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:9fdad8e35f278b2c3eb77cbdc5c0a49dada440657bf738d6905ce106dc1de439"}, - {file = "pydantic_core-2.18.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1d90c3265ae107f91a4f279f4d6f6f1d4907ac76c6868b27dc7fb33688cfb347"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:390193c770399861d8df9670fb0d1874f330c79caaca4642332df7c682bf6b91"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:82d5d4d78e4448683cb467897fe24e2b74bb7b973a541ea1dcfec1d3cbce39fb"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4774f3184d2ef3e14e8693194f661dea5a4d6ca4e3dc8e39786d33a94865cefd"}, - {file = 
"pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d4d938ec0adf5167cb335acb25a4ee69a8107e4984f8fbd2e897021d9e4ca21b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0e8b1be28239fc64a88a8189d1df7fad8be8c1ae47fcc33e43d4be15f99cc70"}, - {file = "pydantic_core-2.18.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:868649da93e5a3d5eacc2b5b3b9235c98ccdbfd443832f31e075f54419e1b96b"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:78363590ef93d5d226ba21a90a03ea89a20738ee5b7da83d771d283fd8a56761"}, - {file = "pydantic_core-2.18.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:852e966fbd035a6468fc0a3496589b45e2208ec7ca95c26470a54daed82a0788"}, - {file = "pydantic_core-2.18.2-cp38-none-win32.whl", hash = "sha256:6a46e22a707e7ad4484ac9ee9f290f9d501df45954184e23fc29408dfad61350"}, - {file = "pydantic_core-2.18.2-cp38-none-win_amd64.whl", hash = "sha256:d91cb5ea8b11607cc757675051f61b3d93f15eca3cefb3e6c704a5d6e8440f4e"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:ae0a8a797a5e56c053610fa7be147993fe50960fa43609ff2a9552b0e07013e8"}, - {file = "pydantic_core-2.18.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:042473b6280246b1dbf530559246f6842b56119c2926d1e52b631bdc46075f2a"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a388a77e629b9ec814c1b1e6b3b595fe521d2cdc625fcca26fbc2d44c816804"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25add29b8f3b233ae90ccef2d902d0ae0432eb0d45370fe315d1a5cf231004b"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f459a5ce8434614dfd39bbebf1041952ae01da6bed9855008cb33b875cb024c0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eff2de745698eb46eeb51193a9f41d67d834d50e424aef27df2fcdee1b153845"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8309f67285bdfe65c372ea3722b7a5642680f3dba538566340a9d36e920b5f0"}, - {file = "pydantic_core-2.18.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f93a8a2e3938ff656a7c1bc57193b1319960ac015b6e87d76c76bf14fe0244b4"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:22057013c8c1e272eb8d0eebc796701167d8377441ec894a8fed1af64a0bf399"}, - {file = "pydantic_core-2.18.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfeecd1ac6cc1fb2692c3d5110781c965aabd4ec5d32799773ca7b1456ac636b"}, - {file = "pydantic_core-2.18.2-cp39-none-win32.whl", hash = "sha256:0d69b4c2f6bb3e130dba60d34c0845ba31b69babdd3f78f7c0c8fae5021a253e"}, - {file = "pydantic_core-2.18.2-cp39-none-win_amd64.whl", hash = "sha256:d9319e499827271b09b4e411905b24a426b8fb69464dfa1696258f53a3334641"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a1874c6dd4113308bd0eb568418e6114b252afe44319ead2b4081e9b9521fe75"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:ccdd111c03bfd3666bd2472b674c6899550e09e9f298954cfc896ab92b5b0e6d"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e18609ceaa6eed63753037fc06ebb16041d17d28199ae5aba0052c51449650a9"}, - {file = 
"pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5c584d357c4e2baf0ff7baf44f4994be121e16a2c88918a5817331fc7599d7"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43f0f463cf89ace478de71a318b1b4f05ebc456a9b9300d027b4b57c1a2064fb"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e1b395e58b10b73b07b7cf740d728dd4ff9365ac46c18751bf8b3d8cca8f625a"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:0098300eebb1c837271d3d1a2cd2911e7c11b396eac9661655ee524a7f10587b"}, - {file = "pydantic_core-2.18.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:36789b70d613fbac0a25bb07ab3d9dba4d2e38af609c020cf4d888d165ee0bf3"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3f9a801e7c8f1ef8718da265bba008fa121243dfe37c1cea17840b0944dfd72c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:3a6515ebc6e69d85502b4951d89131ca4e036078ea35533bb76327f8424531ce"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20aca1e2298c56ececfd8ed159ae4dde2df0781988c97ef77d5c16ff4bd5b400"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:223ee893d77a310a0391dca6df00f70bbc2f36a71a895cecd9a0e762dc37b349"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2334ce8c673ee93a1d6a65bd90327588387ba073c17e61bf19b4fd97d688d63c"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cbca948f2d14b09d20268cda7b0367723d79063f26c4ffc523af9042cad95592"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:b3ef08e20ec49e02d5c6717a91bb5af9b20f1805583cb0adfe9ba2c6b505b5ae"}, - {file = "pydantic_core-2.18.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6fdc8627910eed0c01aed6a390a252fe3ea6d472ee70fdde56273f198938374"}, - {file = "pydantic_core-2.18.2.tar.gz", hash = "sha256:2e29d20810dfc3043ee13ac7d9e25105799817683348823f305ab3f349b9386e"}, + {file = "pydantic_core-2.18.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:f76d0ad001edd426b92233d45c746fd08f467d56100fd8f30e9ace4b005266e4"}, + {file = "pydantic_core-2.18.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:59ff3e89f4eaf14050c8022011862df275b552caef8082e37b542b066ce1ff26"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a55b5b16c839df1070bc113c1f7f94a0af4433fcfa1b41799ce7606e5c79ce0a"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4d0dcc59664fcb8974b356fe0a18a672d6d7cf9f54746c05f43275fc48636851"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8951eee36c57cd128f779e641e21eb40bc5073eb28b2d23f33eb0ef14ffb3f5d"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4701b19f7e3a06ea655513f7938de6f108123bf7c86bbebb1196eb9bd35cf724"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e00a3f196329e08e43d99b79b286d60ce46bed10f2280d25a1718399457e06be"}, + {file = "pydantic_core-2.18.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:97736815b9cc893b2b7f663628e63f436018b75f44854c8027040e05230eeddb"}, + {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6891a2ae0e8692679c07728819b6e2b822fb30ca7445f67bbf6509b25a96332c"}, + {file = "pydantic_core-2.18.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bc4ff9805858bd54d1a20efff925ccd89c9d2e7cf4986144b30802bf78091c3e"}, + {file = "pydantic_core-2.18.4-cp310-none-win32.whl", hash = "sha256:1b4de2e51bbcb61fdebd0ab86ef28062704f62c82bbf4addc4e37fa4b00b7cbc"}, + {file = "pydantic_core-2.18.4-cp310-none-win_amd64.whl", hash = "sha256:6a750aec7bf431517a9fd78cb93c97b9b0c496090fee84a47a0d23668976b4b0"}, + {file = "pydantic_core-2.18.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:942ba11e7dfb66dc70f9ae66b33452f51ac7bb90676da39a7345e99ffb55402d"}, + {file = "pydantic_core-2.18.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b2ebef0e0b4454320274f5e83a41844c63438fdc874ea40a8b5b4ecb7693f1c4"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a642295cd0c8df1b86fc3dced1d067874c353a188dc8e0f744626d49e9aa51c4"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f09baa656c904807e832cf9cce799c6460c450c4ad80803517032da0cd062e2"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98906207f29bc2c459ff64fa007afd10a8c8ac080f7e4d5beff4c97086a3dabd"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:19894b95aacfa98e7cb093cd7881a0c76f55731efad31073db4521e2b6ff5b7d"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fbbdc827fe5e42e4d196c746b890b3d72876bdbf160b0eafe9f0334525119c8"}, + {file = "pydantic_core-2.18.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f85d05aa0918283cf29a30b547b4df2fbb56b45b135f9e35b6807cb28bc47951"}, + {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e85637bc8fe81ddb73fda9e56bab24560bdddfa98aa64f87aaa4e4b6730c23d2"}, + {file = "pydantic_core-2.18.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2f5966897e5461f818e136b8451d0551a2e77259eb0f73a837027b47dc95dab9"}, + {file = "pydantic_core-2.18.4-cp311-none-win32.whl", hash = "sha256:44c7486a4228413c317952e9d89598bcdfb06399735e49e0f8df643e1ccd0558"}, + {file = "pydantic_core-2.18.4-cp311-none-win_amd64.whl", hash = "sha256:8a7164fe2005d03c64fd3b85649891cd4953a8de53107940bf272500ba8a788b"}, + {file = "pydantic_core-2.18.4-cp311-none-win_arm64.whl", hash = "sha256:4e99bc050fe65c450344421017f98298a97cefc18c53bb2f7b3531eb39bc7805"}, + {file = "pydantic_core-2.18.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6f5c4d41b2771c730ea1c34e458e781b18cc668d194958e0112455fff4e402b2"}, + {file = "pydantic_core-2.18.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2fdf2156aa3d017fddf8aea5adfba9f777db1d6022d392b682d2a8329e087cef"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4748321b5078216070b151d5271ef3e7cc905ab170bbfd27d5c83ee3ec436695"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:847a35c4d58721c5dc3dba599878ebbdfd96784f3fb8bb2c356e123bdcd73f34"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3c40d4eaad41f78e3bbda31b89edc46a3f3dc6e171bf0ecf097ff7a0ffff7cb1"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:21a5e440dbe315ab9825fcd459b8814bb92b27c974cbc23c3e8baa2b76890077"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01dd777215e2aa86dfd664daed5957704b769e726626393438f9c87690ce78c3"}, + {file = "pydantic_core-2.18.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4b06beb3b3f1479d32befd1f3079cc47b34fa2da62457cdf6c963393340b56e9"}, + {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:564d7922e4b13a16b98772441879fcdcbe82ff50daa622d681dd682175ea918c"}, + {file = "pydantic_core-2.18.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0eb2a4f660fcd8e2b1c90ad566db2b98d7f3f4717c64fe0a83e0adb39766d5b8"}, + {file = "pydantic_core-2.18.4-cp312-none-win32.whl", hash = "sha256:8b8bab4c97248095ae0c4455b5a1cd1cdd96e4e4769306ab19dda135ea4cdb07"}, + {file = "pydantic_core-2.18.4-cp312-none-win_amd64.whl", hash = "sha256:14601cdb733d741b8958224030e2bfe21a4a881fb3dd6fbb21f071cabd48fa0a"}, + {file = "pydantic_core-2.18.4-cp312-none-win_arm64.whl", hash = "sha256:c1322d7dd74713dcc157a2b7898a564ab091ca6c58302d5c7b4c07296e3fd00f"}, + {file = "pydantic_core-2.18.4-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:823be1deb01793da05ecb0484d6c9e20baebb39bd42b5d72636ae9cf8350dbd2"}, + {file = "pydantic_core-2.18.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ebef0dd9bf9b812bf75bda96743f2a6c5734a02092ae7f721c048d156d5fabae"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae1d6df168efb88d7d522664693607b80b4080be6750c913eefb77e34c12c71a"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f9899c94762343f2cc2fc64c13e7cae4c3cc65cdfc87dd810a31654c9b7358cc"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99457f184ad90235cfe8461c4d70ab7dd2680e28821c29eca00252ba90308c78"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18f469a3d2a2fdafe99296a87e8a4c37748b5080a26b806a707f25a902c040a8"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7cdf28938ac6b8b49ae5e92f2735056a7ba99c9b110a474473fd71185c1af5d"}, + {file = "pydantic_core-2.18.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:938cb21650855054dc54dfd9120a851c974f95450f00683399006aa6e8abb057"}, + {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:44cd83ab6a51da80fb5adbd9560e26018e2ac7826f9626bc06ca3dc074cd198b"}, + {file = "pydantic_core-2.18.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:972658f4a72d02b8abfa2581d92d59f59897d2e9f7e708fdabe922f9087773af"}, + {file = "pydantic_core-2.18.4-cp38-none-win32.whl", hash = "sha256:1d886dc848e60cb7666f771e406acae54ab279b9f1e4143babc9c2258213daa2"}, + {file = "pydantic_core-2.18.4-cp38-none-win_amd64.whl", hash = "sha256:bb4462bd43c2460774914b8525f79b00f8f407c945d50881568f294c1d9b4443"}, + {file = "pydantic_core-2.18.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:44a688331d4a4e2129140a8118479443bd6f1905231138971372fcde37e43528"}, + {file = "pydantic_core-2.18.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a2fdd81edd64342c85ac7cf2753ccae0b79bf2dfa063785503cb85a7d3593223"}, + {file 
= "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86110d7e1907ab36691f80b33eb2da87d780f4739ae773e5fc83fb272f88825f"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:46387e38bd641b3ee5ce247563b60c5ca098da9c56c75c157a05eaa0933ed154"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:123c3cec203e3f5ac7b000bd82235f1a3eced8665b63d18be751f115588fea30"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1803ac5c32ec324c5261c7209e8f8ce88e83254c4e1aebdc8b0a39f9ddb443"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53db086f9f6ab2b4061958d9c276d1dbe3690e8dd727d6abf2321d6cce37fa94"}, + {file = "pydantic_core-2.18.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abc267fa9837245cc28ea6929f19fa335f3dc330a35d2e45509b6566dc18be23"}, + {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a0d829524aaefdebccb869eed855e2d04c21d2d7479b6cada7ace5448416597b"}, + {file = "pydantic_core-2.18.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:509daade3b8649f80d4e5ff21aa5673e4ebe58590b25fe42fac5f0f52c6f034a"}, + {file = "pydantic_core-2.18.4-cp39-none-win32.whl", hash = "sha256:ca26a1e73c48cfc54c4a76ff78df3727b9d9f4ccc8dbee4ae3f73306a591676d"}, + {file = "pydantic_core-2.18.4-cp39-none-win_amd64.whl", hash = "sha256:c67598100338d5d985db1b3d21f3619ef392e185e71b8d52bceacc4a7771ea7e"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:574d92eac874f7f4db0ca653514d823a0d22e2354359d0759e3f6a406db5d55d"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1f4d26ceb5eb9eed4af91bebeae4b06c3fb28966ca3a8fb765208cf6b51102ab"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77450e6d20016ec41f43ca4a6c63e9fdde03f0ae3fe90e7c27bdbeaece8b1ed4"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d323a01da91851a4f17bf592faf46149c9169d68430b3146dcba2bb5e5719abc"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43d447dd2ae072a0065389092a231283f62d960030ecd27565672bd40746c507"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:578e24f761f3b425834f297b9935e1ce2e30f51400964ce4801002435a1b41ef"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:81b5efb2f126454586d0f40c4d834010979cb80785173d1586df845a632e4e6d"}, + {file = "pydantic_core-2.18.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ab86ce7c8f9bea87b9d12c7f0af71102acbf5ecbc66c17796cff45dae54ef9a5"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:90afc12421df2b1b4dcc975f814e21bc1754640d502a2fbcc6d41e77af5ec312"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:51991a89639a912c17bef4b45c87bd83593aee0437d8102556af4885811d59f5"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:293afe532740370aba8c060882f7d26cfd00c94cae32fd2e212a3a6e3b7bc15e"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:b48ece5bde2e768197a2d0f6e925f9d7e3e826f0ad2271120f8144a9db18d5c8"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:eae237477a873ab46e8dd748e515c72c0c804fb380fbe6c85533c7de51f23a8f"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:834b5230b5dfc0c1ec37b2fda433b271cbbc0e507560b5d1588e2cc1148cf1ce"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e858ac0a25074ba4bce653f9b5d0a85b7456eaddadc0ce82d3878c22489fa4ee"}, + {file = "pydantic_core-2.18.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2fd41f6eff4c20778d717af1cc50eca52f5afe7805ee530a4fbd0bae284f16e9"}, + {file = "pydantic_core-2.18.4.tar.gz", hash = "sha256:ec3beeada09ff865c344ff3bc2f427f5e6c26401cc6113d77e372c3fdac73864"}, ] [package.dependencies] @@ -3304,13 +3304,13 @@ six = ">=1.5" [[package]] name = "python-snappy" -version = "0.7.1" +version = "0.7.2" description = "Python library for the snappy compression library from Google" optional = true python-versions = "*" files = [ - {file = "python-snappy-0.7.1.tar.gz", hash = "sha256:1bc29d36211d44bb9f04f3d7ccfbaeaebbc2f62b6d40f4fc4edd1fb16bc52c13"}, - {file = "python_snappy-0.7.1-py3-none-any.whl", hash = "sha256:7c9111be1ae1dcbf4ce32b752366d4a5d4f07898d517691c4003d41e04b03488"}, + {file = "python_snappy-0.7.2-py3-none-any.whl", hash = "sha256:b4b2c39142064925d5a554672a09de4188fc4f2b2494a2ecb35042930e129444"}, + {file = "python_snappy-0.7.2.tar.gz", hash = "sha256:04bf182f9d9f67b7a846dae2f1df36180ceeee8d3380e4b6799deff5272c4978"}, ] [package.dependencies] @@ -3584,13 +3584,13 @@ files = [ [[package]] name = "requests" -version = "2.32.1" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.1-py3-none-any.whl", hash = "sha256:21ac9465cdf8c1650fe1ecde8a71669a93d4e6f147550483a2967d08396a56a5"}, - {file = "requests-2.32.1.tar.gz", hash = "sha256:eb97e87e64c79e64e5b8ac75cee9dd1f97f49e289b083ee6be96268930725685"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -3904,64 +3904,64 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.30" +version = "2.0.31" description = "Database Abstraction Library" optional = true python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b48154678e76445c7ded1896715ce05319f74b1e73cf82d4f8b59b46e9c0ddc"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2753743c2afd061bb95a61a51bbb6a1a11ac1c44292fad898f10c9839a7f75b2"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a7bfc726d167f425d4c16269a9a10fe8630ff6d14b683d588044dcef2d0f6be7"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4f61ada6979223013d9ab83a3ed003ded6959eae37d0d685db2c147e9143797"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3a365eda439b7a00732638f11072907c1bc8e351c7665e7e5da91b169af794af"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bba002a9447b291548e8d66fd8c96a6a7ed4f2def0bb155f4f0a1309fd2735d5"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-win32.whl", hash = "sha256:0138c5c16be3600923fa2169532205d18891b28afa817cb49b50e08f62198bb8"}, - {file = "SQLAlchemy-2.0.30-cp310-cp310-win_amd64.whl", hash = "sha256:99650e9f4cf3ad0d409fed3eec4f071fadd032e9a5edc7270cd646a26446feeb"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:955991a09f0992c68a499791a753523f50f71a6885531568404fa0f231832aa0"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f69e4c756ee2686767eb80f94c0125c8b0a0b87ede03eacc5c8ae3b54b99dc46"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69c9db1ce00e59e8dd09d7bae852a9add716efdc070a3e2068377e6ff0d6fdaa"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1429a4b0f709f19ff3b0cf13675b2b9bfa8a7e79990003207a011c0db880a13"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:efedba7e13aa9a6c8407c48facfdfa108a5a4128e35f4c68f20c3407e4376aa9"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:16863e2b132b761891d6c49f0a0f70030e0bcac4fd208117f6b7e053e68668d0"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-win32.whl", hash = "sha256:2ecabd9ccaa6e914e3dbb2aa46b76dede7eadc8cbf1b8083c94d936bcd5ffb49"}, - {file = "SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl", hash = "sha256:0b3f4c438e37d22b83e640f825ef0f37b95db9aa2d68203f2c9549375d0b2260"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5a79d65395ac5e6b0c2890935bad892eabb911c4aa8e8015067ddb37eea3d56c"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9a5baf9267b752390252889f0c802ea13b52dfee5e369527da229189b8bd592e"}, - {file = 
"SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cb5a646930c5123f8461f6468901573f334c2c63c795b9af350063a736d0134"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:296230899df0b77dec4eb799bcea6fbe39a43707ce7bb166519c97b583cfcab3"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c62d401223f468eb4da32627bffc0c78ed516b03bb8a34a58be54d618b74d472"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3b69e934f0f2b677ec111b4d83f92dc1a3210a779f69bf905273192cf4ed433e"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-win32.whl", hash = "sha256:77d2edb1f54aff37e3318f611637171e8ec71472f1fdc7348b41dcb226f93d90"}, - {file = "SQLAlchemy-2.0.30-cp312-cp312-win_amd64.whl", hash = "sha256:b6c7ec2b1f4969fc19b65b7059ed00497e25f54069407a8701091beb69e591a5"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a8e3b0a7e09e94be7510d1661339d6b52daf202ed2f5b1f9f48ea34ee6f2d57"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b60203c63e8f984df92035610c5fb76d941254cf5d19751faab7d33b21e5ddc0"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1dc3eabd8c0232ee8387fbe03e0a62220a6f089e278b1f0aaf5e2d6210741ad"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:40ad017c672c00b9b663fcfcd5f0864a0a97828e2ee7ab0c140dc84058d194cf"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e42203d8d20dc704604862977b1470a122e4892791fe3ed165f041e4bf447a1b"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-win32.whl", hash = "sha256:2a4f4da89c74435f2bc61878cd08f3646b699e7d2eba97144030d1be44e27584"}, - {file = "SQLAlchemy-2.0.30-cp37-cp37m-win_amd64.whl", hash = "sha256:b6bf767d14b77f6a18b6982cbbf29d71bede087edae495d11ab358280f304d8e"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc0c53579650a891f9b83fa3cecd4e00218e071d0ba00c4890f5be0c34887ed3"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:311710f9a2ee235f1403537b10c7687214bb1f2b9ebb52702c5aa4a77f0b3af7"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:408f8b0e2c04677e9c93f40eef3ab22f550fecb3011b187f66a096395ff3d9fd"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37a4b4fb0dd4d2669070fb05b8b8824afd0af57587393015baee1cf9890242d9"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a943d297126c9230719c27fcbbeab57ecd5d15b0bd6bfd26e91bfcfe64220621"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0a089e218654e740a41388893e090d2e2c22c29028c9d1353feb38638820bbeb"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-win32.whl", hash = "sha256:fa561138a64f949f3e889eb9ab8c58e1504ab351d6cf55259dc4c248eaa19da6"}, - {file = "SQLAlchemy-2.0.30-cp38-cp38-win_amd64.whl", hash = "sha256:7d74336c65705b986d12a7e337ba27ab2b9d819993851b140efdf029248e818e"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ae8c62fe2480dd61c532ccafdbce9b29dacc126fe8be0d9a927ca3e699b9491a"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2383146973a15435e4717f94c7509982770e3e54974c71f76500a0136f22810b"}, - {file = 
"SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8409de825f2c3b62ab15788635ccaec0c881c3f12a8af2b12ae4910a0a9aeef6"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0094c5dc698a5f78d3d1539853e8ecec02516b62b8223c970c86d44e7a80f6c7"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:edc16a50f5e1b7a06a2dcc1f2205b0b961074c123ed17ebda726f376a5ab0953"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f7703c2010355dd28f53deb644a05fc30f796bd8598b43f0ba678878780b6e4c"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-win32.whl", hash = "sha256:1f9a727312ff6ad5248a4367358e2cf7e625e98b1028b1d7ab7b806b7d757513"}, - {file = "SQLAlchemy-2.0.30-cp39-cp39-win_amd64.whl", hash = "sha256:a0ef36b28534f2a5771191be6edb44cc2673c7b2edf6deac6562400288664221"}, - {file = "SQLAlchemy-2.0.30-py3-none-any.whl", hash = "sha256:7108d569d3990c71e26a42f60474b4c02c8586c4681af5fd67e51a044fdea86a"}, - {file = "SQLAlchemy-2.0.30.tar.gz", hash = "sha256:2b1708916730f4830bc69d6f49d37f7698b5bd7530aca7f04f785f8849e95255"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f2a213c1b699d3f5768a7272de720387ae0122f1becf0901ed6eaa1abd1baf6c"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9fea3d0884e82d1e33226935dac990b967bef21315cbcc894605db3441347443"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3ad7f221d8a69d32d197e5968d798217a4feebe30144986af71ada8c548e9fa"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f2bee229715b6366f86a95d497c347c22ddffa2c7c96143b59a2aa5cc9eebbc"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cd5b94d4819c0c89280b7c6109c7b788a576084bf0a480ae17c227b0bc41e109"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:750900a471d39a7eeba57580b11983030517a1f512c2cb287d5ad0fcf3aebd58"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-win32.whl", hash = "sha256:7bd112be780928c7f493c1a192cd8c5fc2a2a7b52b790bc5a84203fb4381c6be"}, + {file = "SQLAlchemy-2.0.31-cp310-cp310-win_amd64.whl", hash = "sha256:5a48ac4d359f058474fadc2115f78a5cdac9988d4f99eae44917f36aa1476327"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f68470edd70c3ac3b6cd5c2a22a8daf18415203ca1b036aaeb9b0fb6f54e8298"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e2c38c2a4c5c634fe6c3c58a789712719fa1bf9b9d6ff5ebfce9a9e5b89c1ca"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd15026f77420eb2b324dcb93551ad9c5f22fab2c150c286ef1dc1160f110203"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2196208432deebdfe3b22185d46b08f00ac9d7b01284e168c212919891289396"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:352b2770097f41bff6029b280c0e03b217c2dcaddc40726f8f53ed58d8a85da4"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:56d51ae825d20d604583f82c9527d285e9e6d14f9a5516463d9705dab20c3740"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-win32.whl", hash = "sha256:6e2622844551945db81c26a02f27d94145b561f9d4b0c39ce7bfd2fda5776dac"}, + {file = "SQLAlchemy-2.0.31-cp311-cp311-win_amd64.whl", hash = 
"sha256:ccaf1b0c90435b6e430f5dd30a5aede4764942a695552eb3a4ab74ed63c5b8d3"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3b74570d99126992d4b0f91fb87c586a574a5872651185de8297c6f90055ae42"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f77c4f042ad493cb8595e2f503c7a4fe44cd7bd59c7582fd6d78d7e7b8ec52c"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd1591329333daf94467e699e11015d9c944f44c94d2091f4ac493ced0119449"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74afabeeff415e35525bf7a4ecdab015f00e06456166a2eba7590e49f8db940e"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b9c01990d9015df2c6f818aa8f4297d42ee71c9502026bb074e713d496e26b67"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:66f63278db425838b3c2b1c596654b31939427016ba030e951b292e32b99553e"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-win32.whl", hash = "sha256:0b0f658414ee4e4b8cbcd4a9bb0fd743c5eeb81fc858ca517217a8013d282c96"}, + {file = "SQLAlchemy-2.0.31-cp312-cp312-win_amd64.whl", hash = "sha256:fa4b1af3e619b5b0b435e333f3967612db06351217c58bfb50cee5f003db2a5a"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f43e93057cf52a227eda401251c72b6fbe4756f35fa6bfebb5d73b86881e59b0"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d337bf94052856d1b330d5fcad44582a30c532a2463776e1651bd3294ee7e58b"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c06fb43a51ccdff3b4006aafee9fcf15f63f23c580675f7734245ceb6b6a9e05"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:b6e22630e89f0e8c12332b2b4c282cb01cf4da0d26795b7eae16702a608e7ca1"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:79a40771363c5e9f3a77f0e28b3302801db08040928146e6808b5b7a40749c88"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-win32.whl", hash = "sha256:501ff052229cb79dd4c49c402f6cb03b5a40ae4771efc8bb2bfac9f6c3d3508f"}, + {file = "SQLAlchemy-2.0.31-cp37-cp37m-win_amd64.whl", hash = "sha256:597fec37c382a5442ffd471f66ce12d07d91b281fd474289356b1a0041bdf31d"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:dc6d69f8829712a4fd799d2ac8d79bdeff651c2301b081fd5d3fe697bd5b4ab9"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:23b9fbb2f5dd9e630db70fbe47d963c7779e9c81830869bd7d137c2dc1ad05fb"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a21c97efcbb9f255d5c12a96ae14da873233597dfd00a3a0c4ce5b3e5e79704"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26a6a9837589c42b16693cf7bf836f5d42218f44d198f9343dd71d3164ceeeac"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc251477eae03c20fae8db9c1c23ea2ebc47331bcd73927cdcaecd02af98d3c3"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:2fd17e3bb8058359fa61248c52c7b09a97cf3c820e54207a50af529876451808"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-win32.whl", hash = "sha256:c76c81c52e1e08f12f4b6a07af2b96b9b15ea67ccdd40ae17019f1c373faa227"}, + {file = "SQLAlchemy-2.0.31-cp38-cp38-win_amd64.whl", hash = 
"sha256:4b600e9a212ed59355813becbcf282cfda5c93678e15c25a0ef896b354423238"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b6cf796d9fcc9b37011d3f9936189b3c8074a02a4ed0c0fbbc126772c31a6d4"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:78fe11dbe37d92667c2c6e74379f75746dc947ee505555a0197cfba9a6d4f1a4"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2fc47dc6185a83c8100b37acda27658fe4dbd33b7d5e7324111f6521008ab4fe"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a41514c1a779e2aa9a19f67aaadeb5cbddf0b2b508843fcd7bafdf4c6864005"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:afb6dde6c11ea4525318e279cd93c8734b795ac8bb5dda0eedd9ebaca7fa23f1"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3f9faef422cfbb8fd53716cd14ba95e2ef655400235c3dfad1b5f467ba179c8c"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-win32.whl", hash = "sha256:fc6b14e8602f59c6ba893980bea96571dd0ed83d8ebb9c4479d9ed5425d562e9"}, + {file = "SQLAlchemy-2.0.31-cp39-cp39-win_amd64.whl", hash = "sha256:3cb8a66b167b033ec72c3812ffc8441d4e9f5f78f5e31e54dcd4c90a4ca5bebc"}, + {file = "SQLAlchemy-2.0.31-py3-none-any.whl", hash = "sha256:69f3e3c08867a8e4856e92d7afb618b95cdee18e0bc1647b77599722c9a28911"}, + {file = "SQLAlchemy-2.0.31.tar.gz", hash = "sha256:b607489dd4a54de56984a0c7656247504bd5523d9d0ba799aef59d4add009484"}, ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "python_version < \"3.13\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} typing-extensions = ">=4.6.0" [package.extras] @@ -4019,13 +4019,13 @@ mpmath = ">=0.19" [[package]] name = "tenacity" -version = "8.3.0" +version = "8.4.2" description = "Retry code until it succeeds" optional = false python-versions = ">=3.8" files = [ - {file = "tenacity-8.3.0-py3-none-any.whl", hash = "sha256:3649f6443dbc0d9b01b9d8020a9c4ec7a1ff5f6f3c6c8a036ef371f573fe9185"}, - {file = "tenacity-8.3.0.tar.gz", hash = "sha256:953d4e6ad24357bceffbc9707bc74349aca9d245f68eb65419cf0c249a1949a2"}, + {file = "tenacity-8.4.2-py3-none-any.whl", hash = "sha256:9e6f7cf7da729125c7437222f8a522279751cdfbe6b67bfe64f75d3a348661b2"}, + {file = "tenacity-8.4.2.tar.gz", hash = "sha256:cd80a53a79336edba8489e767f729e4f391c896956b57140b5d7511a64bbd3ef"}, ] [package.extras] @@ -4083,13 +4083,13 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.11.0" +version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, - {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = 
"sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] [[package]] @@ -4105,13 +4105,13 @@ files = [ [[package]] name = "urllib3" -version = "1.26.18" +version = "1.26.19" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, - {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, + {file = "urllib3-1.26.19-py2.py3-none-any.whl", hash = "sha256:37a0344459b199fce0e80b0d3569837ec6b6937435c5244e7fd73fa6006830f3"}, + {file = "urllib3-1.26.19.tar.gz", hash = "sha256:3e3d753a8618b86d7de333b4223005f68720bcd6a7d2bcb9fbd2229ec7c1e429"}, ] [package.extras] @@ -4119,23 +4119,6 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "urllib3" -version = "2.0.7" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = false -python-versions = ">=3.7" -files = [ - {file = "urllib3-2.0.7-py3-none-any.whl", hash = "sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e"}, - {file = "urllib3-2.0.7.tar.gz", hash = "sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - [[package]] name = "virtualenv" version = "20.25.0" @@ -4462,4 +4445,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "2c019a99dfec370111ef19bae1ca7e00f434cec159296f5fcf4aee1b4552ba06" +content-hash = "6e68bbd21368ac70baa311ed9567b5ad971b134207972549b1835718f76402a6" diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 0b70fe32e1..9a951b5c8e 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -588,7 +588,7 @@ def identifier_to_tuple(identifier: Union[str, Identifier]) -> Identifier: If the identifier is a string, it is split into a tuple on '.'. If it is a tuple, it is used as-is. Args: - identifier (str | Identifier: an identifier, either a string or tuple of strings. + identifier (str | Identifier): an identifier, either a string or tuple of strings. Returns: Identifier: a tuple of strings. @@ -619,6 +619,29 @@ def namespace_from(identifier: Union[str, Identifier]) -> Identifier: """ return Catalog.identifier_to_tuple(identifier)[:-1] + @staticmethod + def namespace_to_string( + identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError + ) -> str: + """Transform a namespace identifier into a string. + + Args: + identifier (Union[str, Identifier]): a namespace identifier. 
+ err (Union[Type[ValueError], Type[NoSuchNamespaceError]]): the error type to raise when the identifier is empty. + + Returns: + str: Namespace identifier joined into a dot-separated string. + """ + tuple_identifier = Catalog.identifier_to_tuple(identifier) + if len(tuple_identifier) < 1: + raise err("Empty namespace identifier") + + # Check if any segment of the tuple is an empty string + if any(segment.strip() == "" for segment in tuple_identifier): + raise err("Namespace identifier contains an empty segment or a segment with only whitespace") + + return ".".join(segment.strip() for segment in tuple_identifier) + @staticmethod def identifier_to_database( identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError @@ -738,7 +761,7 @@ def _create_staged_table( metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties ) - io = load_file_io(properties=self.properties, location=metadata_location) + io = self._load_file_io(properties=properties, location=metadata_location) return StagedTable( identifier=(self.name, database_name, table_name), metadata=metadata, diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 275cda7ed0..8819c2e266 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -417,7 +417,14 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: Raises: TableAlreadyExistsError: If the table already exists """ - raise NotImplementedError + database_name, table_name = self.identifier_to_database_and_table(identifier) + properties = EMPTY_DICT + io = self._load_file_io(location=metadata_location) + file = io.new_input(metadata_location) + metadata = FromInputFile.table_metadata(file) + table_input = _construct_table_input(table_name, metadata_location, properties, metadata) + self._create_glue_table(database_name=database_name, table_name=table_name, table_input=table_input) + return self.load_table(identifier=identifier) def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse: """Update the table.
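For readers of this change, a minimal usage sketch of the new `Catalog.namespace_to_string` helper added above; the namespace values are made up for illustration, and the error type defaults to `ValueError` as in the signature:

```python
from pyiceberg.catalog import Catalog

# Tuple and dotted-string forms normalize to the same dot-separated string.
assert Catalog.namespace_to_string(("ns1", "ns2")) == "ns1.ns2"
assert Catalog.namespace_to_string("ns1.ns2") == "ns1.ns2"

# Empty identifiers and blank segments raise the configured error type (ValueError by default).
try:
    Catalog.namespace_to_string(("ns1", " "))
except ValueError as err:
    print(err)  # Namespace identifier contains an empty segment or a segment with only whitespace
```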
diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 708ae8c9d4..83bbd50779 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -70,11 +70,11 @@ NamespaceNotEmptyError, NoSuchIcebergTableError, NoSuchNamespaceError, + NoSuchPropertyException, NoSuchTableError, TableAlreadyExistsError, WaitingForLockException, ) -from pyiceberg.io import FileIO, load_file_io from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema, SchemaVisitor, visit from pyiceberg.serializers import FromInputFile @@ -82,11 +82,10 @@ CommitTableRequest, CommitTableResponse, PropertyUtil, + StagedTable, Table, TableProperties, - update_table_metadata, ) -from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties from pyiceberg.types import ( @@ -146,7 +145,7 @@ def __init__(self, uri: str, ugi: Optional[str] = None): protocol = TBinaryProtocol.TBinaryProtocol(transport) self._client = Client(protocol) - self._ugi = ugi.split(':') if ugi else None + self._ugi = ugi.split(":") if ugi else None def __enter__(self) -> Client: self._transport.open() @@ -272,10 +271,12 @@ def __init__(self, name: str, **properties: str): DEFAULT_LOCK_CHECK_RETRIES, ) - def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: + def _convert_hive_into_iceberg(self, table: HiveTable) -> Table: properties: Dict[str, str] = table.parameters if TABLE_TYPE not in properties: - raise NoSuchTableError(f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}") + raise NoSuchPropertyException( + f"Property table_type missing, could not determine type: {table.dbName}.{table.tableName}" + ) table_type = properties[TABLE_TYPE] if table_type.lower() != ICEBERG: @@ -286,8 +287,9 @@ def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: if prop_metadata_location := properties.get(METADATA_LOCATION): metadata_location = prop_metadata_location else: - raise NoSuchTableError(f"Table property {METADATA_LOCATION} is missing") + raise NoSuchPropertyException(f"Table property {METADATA_LOCATION} is missing") + io = self._load_file_io(location=metadata_location) file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) return Table( @@ -298,6 +300,38 @@ def _convert_hive_into_iceberg(self, table: HiveTable, io: FileIO) -> Table: catalog=self, ) + def _convert_iceberg_into_hive(self, table: Table) -> HiveTable: + identifier_tuple = self.identifier_to_tuple_without_catalog(table.identifier) + database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + current_time_millis = int(time.time() * 1000) + + return HiveTable( + dbName=database_name, + tableName=table_name, + owner=table.properties[OWNER] if table.properties and OWNER in table.properties else getpass.getuser(), + createTime=current_time_millis // 1000, + lastAccessTime=current_time_millis // 1000, + sd=_construct_hive_storage_descriptor( + table.schema(), + table.location(), + PropertyUtil.property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT), + ), + tableType=EXTERNAL_TABLE, + parameters=_construct_parameters(table.metadata_location), + ) + + def _create_hive_table(self, open_client: Client, hive_table: HiveTable) -> None: + try: + open_client.create_table(hive_table) + except AlreadyExistsException as e: + raise 
TableAlreadyExistsError(f"Table {hive_table.dbName}.{hive_table.tableName} already exists") from e + + def _get_hive_table(self, open_client: Client, database_name: str, table_name: str) -> HiveTable: + try: + return open_client.get_table(dbname=database_name, tbl_name=table_name) + except NoSuchObjectException as e: + raise NoSuchTableError(f"Table does not exists: {table_name}") from e + def create_table( self, identifier: Union[str, Identifier], @@ -324,45 +358,25 @@ def create_table( AlreadyExistsError: If a table with the name already exists. ValueError: If the identifier is invalid. """ - schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - properties = {**DEFAULT_PROPERTIES, **properties} - database_name, table_name = self.identifier_to_database_and_table(identifier) - current_time_millis = int(time.time() * 1000) - - location = self._resolve_table_location(location, database_name, table_name) - - metadata_location = self._get_metadata_location(location=location) - metadata = new_table_metadata( - location=location, + staged_table = self._create_staged_table( + identifier=identifier, schema=schema, + location=location, partition_spec=partition_spec, sort_order=sort_order, properties=properties, ) - io = load_file_io({**self.properties, **properties}, location=location) - self._write_metadata(metadata, io, metadata_location) + database_name, table_name = self.identifier_to_database_and_table(identifier) - tbl = HiveTable( - dbName=database_name, - tableName=table_name, - owner=properties[OWNER] if properties and OWNER in properties else getpass.getuser(), - createTime=current_time_millis // 1000, - lastAccessTime=current_time_millis // 1000, - sd=_construct_hive_storage_descriptor( - schema, location, PropertyUtil.property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT) - ), - tableType=EXTERNAL_TABLE, - parameters=_construct_parameters(metadata_location), - ) - try: - with self._client as open_client: - open_client.create_table(tbl) - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - except AlreadyExistsException as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + self._write_metadata(staged_table.metadata, staged_table.io, staged_table.metadata_location) + tbl = self._convert_iceberg_into_hive(staged_table) + + with self._client as open_client: + self._create_hive_table(open_client, tbl) + hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - return self._convert_hive_into_iceberg(hive_table, io) + return self._convert_hive_into_iceberg(hive_table) def register_table(self, identifier: Union[str, Identifier], metadata_location: str) -> Table: """Register a new table using existing metadata. 
@@ -437,36 +451,52 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons else: raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - io = load_file_io({**self.properties, **hive_table.parameters}, hive_table.sd.location) - current_table = self._convert_hive_into_iceberg(hive_table, io) - - base_metadata = current_table.metadata - for requirement in table_request.requirements: - requirement.validate(base_metadata) - - updated_metadata = update_table_metadata(base_metadata, table_request.updates) - if updated_metadata == base_metadata: + hive_table: Optional[HiveTable] + current_table: Optional[Table] + try: + hive_table = self._get_hive_table(open_client, database_name, table_name) + current_table = self._convert_hive_into_iceberg(hive_table) + except NoSuchTableError: + hive_table = None + current_table = None + + updated_staged_table = self._update_and_stage_table(current_table, table_request) + if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing - return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location) - - # write new metadata - new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 - new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version) - self._write_metadata(updated_metadata, current_table.io, new_metadata_location) - - hive_table.parameters = _construct_parameters( - metadata_location=new_metadata_location, previous_metadata_location=current_table.metadata_location + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, ) - open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) - except NoSuchObjectException as e: - raise NoSuchTableError(f"Table does not exist: {table_name}") from e + + if hive_table and current_table: + # Table exists, update it. + hive_table.parameters = _construct_parameters( + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=current_table.metadata_location, + ) + open_client.alter_table(dbname=database_name, tbl_name=table_name, new_tbl=hive_table) + else: + # Table does not exist, create it. + hive_table = self._convert_iceberg_into_hive( + StagedTable( + identifier=(self.name, database_name, table_name), + metadata=updated_staged_table.metadata, + metadata_location=updated_staged_table.metadata_location, + io=updated_staged_table.io, + catalog=self, + ) + ) + self._create_hive_table(open_client, hive_table) except WaitingForLockException as e: raise CommitFailedException(f"Failed to acquire lock for {table_request.identifier}, state: {lock.state}") from e finally: open_client.unlock(UnlockRequest(lockid=lock.lockid)) - return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def load_table(self, identifier: Union[str, Identifier]) -> Table: """Load the table's metadata and return the table instance. 
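As a hedged sketch of the user-facing effect of the refactor above: `create_table` now stages the table first (metadata is written, then the Hive table is created from the staged copy), and `_commit_table` creates the Hive table when it does not yet exist. The Metastore URI, warehouse, and schema below are assumptions for illustration, not part of this change:

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField, StringType

# Hypothetical Hive Metastore endpoint and warehouse location.
catalog = load_catalog(
    "hive",
    **{"uri": "thrift://localhost:9083", "warehouse": "s3://example-bucket/warehouse"},
)

schema = Schema(
    NestedField(field_id=1, name="id", field_type=LongType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=False),
)

# Goes through the staged-table path: metadata is written, then the Hive table is created from it.
table = catalog.create_table("default.users", schema=schema)
print(table.metadata_location)
```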
@@ -485,14 +515,11 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: """ identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) - try: - with self._client as open_client: - hive_table = open_client.get_table(dbname=database_name, tbl_name=table_name) - except NoSuchObjectException as e: - raise NoSuchTableError(f"Table does not exists: {table_name}") from e - io = load_file_io({**self.properties, **hive_table.parameters}, hive_table.sd.location) - return self._convert_hive_into_iceberg(hive_table, io) + with self._client as open_client: + hive_table = self._get_hive_table(open_client, database_name, table_name) + + return self._convert_hive_into_iceberg(hive_table) def drop_table(self, identifier: Union[str, Identifier]) -> None: """Drop a table. diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index 7259f9fa38..2474b89853 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -152,7 +152,7 @@ class CreateTableRequest(IcebergBaseModel): properties: Dict[str, str] = Field(default_factory=dict) # validators - @field_validator('properties', mode='before') + @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) @@ -790,4 +790,4 @@ def table_exists(self, identifier: Union[str, Identifier]) -> bool: response = self._session.head( self.url(Endpoints.load_table, prefixed=True, **self._split_identifier_for_path(identifier_tuple)) ) - return response.status_code == 200 + return response.status_code in (200, 204) diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 978109b2a3..ff7831d77f 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -43,6 +43,7 @@ from pyiceberg.catalog import ( METADATA_LOCATION, + Catalog, MetastoreCatalog, PropertiesUpdateSummary, ) @@ -59,7 +60,7 @@ from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.serializers import FromInputFile -from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, update_table_metadata +from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table from pyiceberg.table.metadata import new_table_metadata from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties @@ -94,6 +95,16 @@ class IcebergNamespaceProperties(SqlCatalogBaseTable): class SqlCatalog(MetastoreCatalog): + """Implementation of a SQL based catalog. + + In the `JDBCCatalog` implementation, a `Namespace` is composed of a list of strings separated by dots: `'ns1.ns2.ns3'`. + And you can have as many levels as you want, but you need at least one. The `SqlCatalog` honors the same convention. + + In the `JDBCCatalog` implementation, a `TableIdentifier` is composed of an optional `Namespace` and a table name. + When a `Namespace` is present, the full name will be `'ns1.ns2.ns3.table'`. A valid `TableIdentifier` could be `'name'` (no namespace). + The `SqlCatalog` has a different convention where a `TableIdentifier` requires a `Namespace`. 
+ """ + def __init__(self, name: str, **properties: str): super().__init__(name, **properties) @@ -136,7 +147,7 @@ def _convert_orm_to_iceberg(self, orm_table: IcebergTables) -> Table: file = io.new_input(metadata_location) metadata = FromInputFile.table_metadata(file) return Table( - identifier=(self.name, table_namespace, table_name), + identifier=(self.name,) + Catalog.identifier_to_tuple(table_namespace) + (table_name,), metadata=metadata, metadata_location=metadata_location, io=self._load_file_io(metadata.properties, metadata_location), @@ -173,11 +184,14 @@ def create_table( """ schema: Schema = self._convert_schema_if_needed(schema) # type: ignore - database_name, table_name = self.identifier_to_database_and_table(identifier) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") + identifier_nocatalog = self.identifier_to_tuple_without_catalog(identifier) + namespace_identifier = Catalog.namespace_from(identifier_nocatalog) + table_name = Catalog.table_name_from(identifier_nocatalog) + if not self._namespace_exists(namespace_identifier): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace_identifier}") - location = self._resolve_table_location(location, database_name, table_name) + namespace = Catalog.namespace_to_string(namespace_identifier) + location = self._resolve_table_location(location, namespace, table_name) metadata_location = self._get_metadata_location(location=location) metadata = new_table_metadata( location=location, schema=schema, partition_spec=partition_spec, sort_order=sort_order, properties=properties @@ -190,7 +204,7 @@ def create_table( session.add( IcebergTables( catalog_name=self.name, - table_namespace=database_name, + table_namespace=namespace, table_name=table_name, metadata_location=metadata_location, previous_metadata_location=None, @@ -198,7 +212,7 @@ def create_table( ) session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {namespace}.{table_name} already exists") from e return self.load_table(identifier=identifier) @@ -216,16 +230,19 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: TableAlreadyExistsError: If the table already exists NoSuchNamespaceError: If namespace does not exist """ - database_name, table_name = self.identifier_to_database_and_table(identifier) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {database_name}") + identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) + if not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace}") with Session(self.engine) as session: try: session.add( IcebergTables( catalog_name=self.name, - table_namespace=database_name, + table_namespace=namespace, table_name=table_name, metadata_location=metadata_location, previous_metadata_location=None, @@ -233,7 +250,7 @@ def register_table(self, identifier: Union[str, Identifier], metadata_location: ) session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {database_name}.{table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {namespace}.{table_name} already exists") from e 
return self.load_table(identifier=identifier) @@ -253,17 +270,19 @@ def load_table(self, identifier: Union[str, Identifier]) -> Table: NoSuchTableError: If a table with the name does not exist. """ identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) with Session(self.engine) as session: stmt = select(IcebergTables).where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) result = session.scalar(stmt) if result: return self._convert_orm_to_iceberg(result) - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") def drop_table(self, identifier: Union[str, Identifier]) -> None: """Drop a table. @@ -275,18 +294,20 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: NoSuchTableError: If a table with the name does not exist. """ identifier_tuple = self.identifier_to_tuple_without_catalog(identifier) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) with Session(self.engine) as session: if self.engine.dialect.supports_sane_rowcount: res = session.execute( delete(IcebergTables).where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) ) if res.rowcount < 1: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") else: try: tbl = ( @@ -294,14 +315,14 @@ def drop_table(self, identifier: Union[str, Identifier]) -> None: .with_for_update(of=IcebergTables) .filter( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, ) .one() ) session.delete(tbl) except NoResultFound as e: - raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name}") from e + raise NoSuchTableError(f"Table does not exist: {namespace}.{table_name}") from e session.commit() def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table: @@ -320,10 +341,15 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U NoSuchNamespaceError: If the target namespace does not exist. 
""" from_identifier_tuple = self.identifier_to_tuple_without_catalog(from_identifier) - from_database_name, from_table_name = self.identifier_to_database_and_table(from_identifier_tuple, NoSuchTableError) - to_database_name, to_table_name = self.identifier_to_database_and_table(to_identifier) - if not self._namespace_exists(to_database_name): - raise NoSuchNamespaceError(f"Namespace does not exist: {to_database_name}") + to_identifier_tuple = self.identifier_to_tuple_without_catalog(to_identifier) + from_namespace_tuple = Catalog.namespace_from(from_identifier_tuple) + from_namespace = Catalog.namespace_to_string(from_namespace_tuple) + from_table_name = Catalog.table_name_from(from_identifier_tuple) + to_namespace_tuple = Catalog.namespace_from(to_identifier_tuple) + to_namespace = Catalog.namespace_to_string(to_namespace_tuple) + to_table_name = Catalog.table_name_from(to_identifier_tuple) + if not self._namespace_exists(to_namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {to_namespace}") with Session(self.engine) as session: try: if self.engine.dialect.supports_sane_rowcount: @@ -331,10 +357,10 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U update(IcebergTables) .where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == from_database_name, + IcebergTables.table_namespace == from_namespace, IcebergTables.table_name == from_table_name, ) - .values(table_namespace=to_database_name, table_name=to_table_name) + .values(table_namespace=to_namespace, table_name=to_table_name) ) result = session.execute(stmt) if result.rowcount < 1: @@ -346,18 +372,18 @@ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: U .with_for_update(of=IcebergTables) .filter( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == from_database_name, + IcebergTables.table_namespace == from_namespace, IcebergTables.table_name == from_table_name, ) .one() ) - tbl.table_namespace = to_database_name + tbl.table_namespace = to_namespace tbl.table_name = to_table_name except NoResultFound as e: raise NoSuchTableError(f"Table does not exist: {from_table_name}") from e session.commit() except IntegrityError as e: - raise TableAlreadyExistsError(f"Table {to_database_name}.{to_table_name} already exists") from e + raise TableAlreadyExistsError(f"Table {to_namespace}.{to_table_name} already exists") from e return self.load_table(to_identifier) def _commit_table(self, table_request: CommitTableRequest) -> CommitTableResponse: @@ -376,60 +402,87 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) ) - current_table = self.load_table(identifier_tuple) - database_name, table_name = self.identifier_to_database_and_table(identifier_tuple, NoSuchTableError) - base_metadata = current_table.metadata - for requirement in table_request.requirements: - requirement.validate(base_metadata) - - updated_metadata = update_table_metadata(base_metadata, table_request.updates) - if updated_metadata == base_metadata: + namespace_tuple = Catalog.namespace_from(identifier_tuple) + namespace = Catalog.namespace_to_string(namespace_tuple) + table_name = Catalog.table_name_from(identifier_tuple) + + current_table: Optional[Table] + try: + current_table = self.load_table(identifier_tuple) + except NoSuchTableError: + current_table = None + + 
updated_staged_table = self._update_and_stage_table(current_table, table_request) + if current_table and updated_staged_table.metadata == current_table.metadata: # no changes, do nothing - return CommitTableResponse(metadata=base_metadata, metadata_location=current_table.metadata_location) - - # write new metadata - new_metadata_version = self._parse_metadata_version(current_table.metadata_location) + 1 - new_metadata_location = self._get_metadata_location(current_table.metadata.location, new_metadata_version) - self._write_metadata(updated_metadata, current_table.io, new_metadata_location) + return CommitTableResponse(metadata=current_table.metadata, metadata_location=current_table.metadata_location) + self._write_metadata( + metadata=updated_staged_table.metadata, + io=updated_staged_table.io, + metadata_path=updated_staged_table.metadata_location, + ) with Session(self.engine) as session: - if self.engine.dialect.supports_sane_rowcount: - stmt = ( - update(IcebergTables) - .where( - IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, - IcebergTables.table_name == table_name, - IcebergTables.metadata_location == current_table.metadata_location, - ) - .values(metadata_location=new_metadata_location, previous_metadata_location=current_table.metadata_location) - ) - result = session.execute(stmt) - if result.rowcount < 1: - raise CommitFailedException(f"Table has been updated by another process: {database_name}.{table_name}") - else: - try: - tbl = ( - session.query(IcebergTables) - .with_for_update(of=IcebergTables) - .filter( + if current_table: + # table exists, update it + if self.engine.dialect.supports_sane_rowcount: + stmt = ( + update(IcebergTables) + .where( IcebergTables.catalog_name == self.name, - IcebergTables.table_namespace == database_name, + IcebergTables.table_namespace == namespace, IcebergTables.table_name == table_name, IcebergTables.metadata_location == current_table.metadata_location, ) - .one() + .values( + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=current_table.metadata_location, + ) ) - tbl.metadata_location = new_metadata_location - tbl.previous_metadata_location = current_table.metadata_location - except NoResultFound as e: - raise CommitFailedException(f"Table has been updated by another process: {database_name}.{table_name}") from e - session.commit() + result = session.execute(stmt) + if result.rowcount < 1: + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") + else: + try: + tbl = ( + session.query(IcebergTables) + .with_for_update(of=IcebergTables) + .filter( + IcebergTables.catalog_name == self.name, + IcebergTables.table_namespace == namespace, + IcebergTables.table_name == table_name, + IcebergTables.metadata_location == current_table.metadata_location, + ) + .one() + ) + tbl.metadata_location = updated_staged_table.metadata_location + tbl.previous_metadata_location = current_table.metadata_location + except NoResultFound as e: + raise CommitFailedException(f"Table has been updated by another process: {namespace}.{table_name}") from e + session.commit() + else: + # table does not exist, create it + try: + session.add( + IcebergTables( + catalog_name=self.name, + table_namespace=namespace, + table_name=table_name, + metadata_location=updated_staged_table.metadata_location, + previous_metadata_location=None, + ) + ) + session.commit() + except IntegrityError as e: + raise TableAlreadyExistsError(f"Table 
{namespace}.{table_name} already exists") from e - return CommitTableResponse(metadata=updated_metadata, metadata_location=new_metadata_location) + return CommitTableResponse( + metadata=updated_staged_table.metadata, metadata_location=updated_staged_table.metadata_location + ) def _namespace_exists(self, identifier: Union[str, Identifier]) -> bool: - namespace = self.identifier_to_database(identifier) + namespace_tuple = Catalog.identifier_to_tuple(identifier) + namespace = Catalog.namespace_to_string(namespace_tuple, NoSuchNamespaceError) with Session(self.engine) as session: stmt = ( select(IcebergTables) @@ -462,18 +515,20 @@ def create_namespace(self, namespace: Union[str, Identifier], properties: Proper Raises: NamespaceAlreadyExistsError: If a namespace with the given name already exists. """ + if self._namespace_exists(namespace): + raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists") + if not properties: properties = IcebergNamespaceProperties.NAMESPACE_MINIMAL_PROPERTIES - database_name = self.identifier_to_database(namespace) - if self._namespace_exists(database_name): - raise NamespaceAlreadyExistsError(f"Database {database_name} already exists") - create_properties = properties if properties else IcebergNamespaceProperties.NAMESPACE_MINIMAL_PROPERTIES with Session(self.engine) as session: for key, value in create_properties.items(): session.add( IcebergNamespaceProperties( - catalog_name=self.name, namespace=database_name, property_key=key, property_value=value + catalog_name=self.name, + namespace=Catalog.namespace_to_string(namespace, NoSuchNamespaceError), + property_key=key, + property_value=value, ) ) session.commit() @@ -488,16 +543,16 @@ def drop_namespace(self, namespace: Union[str, Identifier]) -> None: NoSuchNamespaceError: If a namespace with the given name does not exist. NamespaceNotEmptyError: If the namespace is not empty. """ - database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) - if self._namespace_exists(database_name): - if tables := self.list_tables(database_name): - raise NamespaceNotEmptyError(f"Database {database_name} is not empty. {len(tables)} tables exist.") + if self._namespace_exists(namespace): + namespace_str = Catalog.namespace_to_string(namespace) + if tables := self.list_tables(namespace): + raise NamespaceNotEmptyError(f"Namespace {namespace_str} is not empty. {len(tables)} tables exist.") with Session(self.engine) as session: session.execute( delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, ) ) session.commit() @@ -516,14 +571,14 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]: Raises: NoSuchNamespaceError: If a namespace with the given name does not exist. 
""" - database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) + if namespace and not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace does not exist: {namespace}") - stmt = select(IcebergTables).where( - IcebergTables.catalog_name == self.name, IcebergTables.table_namespace == database_name - ) + namespace = Catalog.namespace_to_string(namespace) + stmt = select(IcebergTables).where(IcebergTables.catalog_name == self.name, IcebergTables.table_namespace == namespace) with Session(self.engine) as session: result = session.scalars(stmt) - return [(table.table_namespace, table.table_name) for table in result] + return [(Catalog.identifier_to_tuple(table.table_namespace) + (table.table_name,)) for table in result] def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]: """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog. @@ -543,15 +598,15 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi table_stmt = select(IcebergTables.table_namespace).where(IcebergTables.catalog_name == self.name) namespace_stmt = select(IcebergNamespaceProperties.namespace).where(IcebergNamespaceProperties.catalog_name == self.name) if namespace: - database_name = self.identifier_to_database(namespace, NoSuchNamespaceError) - table_stmt = table_stmt.where(IcebergTables.table_namespace.like(database_name)) - namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(database_name)) + namespace_str = Catalog.namespace_to_string(namespace, NoSuchNamespaceError) + table_stmt = table_stmt.where(IcebergTables.table_namespace.like(namespace_str)) + namespace_stmt = namespace_stmt.where(IcebergNamespaceProperties.namespace.like(namespace_str)) stmt = union( table_stmt, namespace_stmt, ) with Session(self.engine) as session: - return [self.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] + return [Catalog.identifier_to_tuple(namespace_col) for namespace_col in session.execute(stmt).scalars()] def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties: """Get properties for a namespace. @@ -565,12 +620,12 @@ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Proper Raises: NoSuchNamespaceError: If a namespace with the given name does not exist. """ - database_name = self.identifier_to_database(namespace) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Database {database_name} does not exists") + namespace_str = Catalog.namespace_to_string(namespace) + if not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace {namespace_str} does not exists") stmt = select(IcebergNamespaceProperties).where( - IcebergNamespaceProperties.catalog_name == self.name, IcebergNamespaceProperties.namespace == database_name + IcebergNamespaceProperties.catalog_name == self.name, IcebergNamespaceProperties.namespace == namespace_str ) with Session(self.engine) as session: result = session.scalars(stmt) @@ -590,9 +645,9 @@ def update_namespace_properties( NoSuchNamespaceError: If a namespace with the given name does not exist. ValueError: If removals and updates have overlapping keys. 
""" - database_name = self.identifier_to_database(namespace) - if not self._namespace_exists(database_name): - raise NoSuchNamespaceError(f"Database {database_name} does not exists") + namespace_str = Catalog.namespace_to_string(namespace) + if not self._namespace_exists(namespace): + raise NoSuchNamespaceError(f"Namespace {namespace_str} does not exists") current_properties = self.load_namespace_properties(namespace=namespace) properties_update_summary = self._get_updated_props_and_update_summary( @@ -603,7 +658,7 @@ def update_namespace_properties( if removals: delete_stmt = delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, IcebergNamespaceProperties.property_key.in_(removals), ) session.execute(delete_stmt) @@ -614,14 +669,14 @@ def update_namespace_properties( # This is not a problem since it runs in a single transaction delete_stmt = delete(IcebergNamespaceProperties).where( IcebergNamespaceProperties.catalog_name == self.name, - IcebergNamespaceProperties.namespace == database_name, + IcebergNamespaceProperties.namespace == namespace_str, IcebergNamespaceProperties.property_key.in_(set(updates.keys())), ) session.execute(delete_stmt) insert_stmt = insert(IcebergNamespaceProperties) for property_key, property_value in updates.items(): insert_stmt = insert_stmt.values( - catalog_name=self.name, namespace=database_name, property_key=property_key, property_value=property_value + catalog_name=self.name, namespace=namespace_str, property_key=property_key, property_value=property_value ) session.execute(insert_stmt) session.commit() diff --git a/pyiceberg/cli/console.py b/pyiceberg/cli/console.py index 0fbda10960..d1833df081 100644 --- a/pyiceberg/cli/console.py +++ b/pyiceberg/cli/console.py @@ -112,9 +112,13 @@ def list(ctx: Context, parent: Optional[str]) -> None: # pylint: disable=redefi """List tables or namespaces.""" catalog, output = _catalog_and_output(ctx) - identifiers = catalog.list_namespaces(parent or ()) - if not identifiers and parent: + identifiers = [] + if parent: + # Do we have tables under parent namespace? identifiers = catalog.list_tables(parent) + if not identifiers: + # List hierarchical namespaces if parent, root namespaces otherwise. + identifiers = catalog.list_namespaces(parent or ()) output.identifiers(identifiers) diff --git a/pyiceberg/expressions/parser.py b/pyiceberg/expressions/parser.py index 8873907813..107d2349db 100644 --- a/pyiceberg/expressions/parser.py +++ b/pyiceberg/expressions/parser.py @@ -78,7 +78,7 @@ identifier = Word(alphas, alphanums + "_$").set_results_name("identifier") column = DelimitedList(identifier, delim=".", combine=False).set_results_name("column") -like_regex = r'(?P(?(?(?(? 
BooleanExpression: match = re.search(like_regex, literal_like.value) - if match and match.groupdict()['invalid_wildcard']: + if match and match.groupdict()["invalid_wildcard"]: raise ValueError("LIKE expressions only supports wildcard, '%', at the end of a string") - elif match and match.groupdict()['valid_wildcard']: - return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace('\\%', '%'))) + elif match and match.groupdict()["valid_wildcard"]: + return StartsWith(result.column, StringLiteral(literal_like.value[:-1].replace("\\%", "%"))) else: - return EqualTo(result.column, StringLiteral(literal_like.value.replace('\\%', '%'))) + return EqualTo(result.column, StringLiteral(literal_like.value.replace("\\%", "%"))) predicate = (comparison | in_check | null_check | nan_check | starts_check | boolean).set_results_name("predicate") diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 4b5e99d336..36c3e625c8 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -53,10 +53,18 @@ S3_REGION = "s3.region" S3_PROXY_URI = "s3.proxy-uri" S3_CONNECT_TIMEOUT = "s3.connect-timeout" +S3_SIGNER_URI = "s3.signer.uri" HDFS_HOST = "hdfs.host" HDFS_PORT = "hdfs.port" HDFS_USER = "hdfs.user" HDFS_KERB_TICKET = "hdfs.kerberos_ticket" +ADLFS_CONNECTION_STRING = "adlfs.connection-string" +ADLFS_ACCOUNT_NAME = "adlfs.account-name" +ADLFS_ACCOUNT_KEY = "adlfs.account-key" +ADLFS_SAS_TOKEN = "adlfs.sas-token" +ADLFS_TENANT_ID = "adlfs.tenant-id" +ADLFS_CLIENT_ID = "adlfs.client-id" +ADLFS_ClIENT_SECRET = "adlfs.client-secret" GCS_TOKEN = "gcs.oauth2.token" GCS_TOKEN_EXPIRES_AT_MS = "gcs.oauth2.token-expires-at" GCS_PROJECT_ID = "gcs.project-id" @@ -277,6 +285,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: "gs": [ARROW_FILE_IO], "file": [ARROW_FILE_IO, FSSPEC_FILE_IO], "hdfs": [ARROW_FILE_IO], + "viewfs": [ARROW_FILE_IO], "abfs": [FSSPEC_FILE_IO], "abfss": [FSSPEC_FILE_IO], } diff --git a/pyiceberg/io/fsspec.py b/pyiceberg/io/fsspec.py index ee97829c2e..bb76f043c9 100644 --- a/pyiceberg/io/fsspec.py +++ b/pyiceberg/io/fsspec.py @@ -40,6 +40,12 @@ from pyiceberg.catalog import TOKEN from pyiceberg.exceptions import SignError from pyiceberg.io import ( + ADLFS_ACCOUNT_KEY, + ADLFS_ACCOUNT_NAME, + ADLFS_CLIENT_ID, + ADLFS_CONNECTION_STRING, + ADLFS_SAS_TOKEN, + ADLFS_TENANT_ID, GCS_ACCESS, GCS_CACHE_TIMEOUT, GCS_CONSISTENCY, @@ -57,6 +63,8 @@ S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + S3_SIGNER_URI, + ADLFS_ClIENT_SECRET, FileIO, InputFile, InputStream, @@ -72,7 +80,7 @@ def s3v4_rest_signer(properties: Properties, request: AWSRequest, **_: Any) -> A if TOKEN not in properties: raise SignError("Signer set, but token is not available") - signer_url = properties["uri"].rstrip("/") + signer_url = properties.get(S3_SIGNER_URI, properties["uri"]).rstrip("/") signer_headers = {"Authorization": f"Bearer {properties[TOKEN]}"} signer_body = { "method": request.method, @@ -163,13 +171,13 @@ def _adlfs(properties: Properties) -> AbstractFileSystem: from adlfs import AzureBlobFileSystem return AzureBlobFileSystem( - connection_string=properties.get("adlfs.connection-string"), - account_name=properties.get("adlfs.account-name"), - account_key=properties.get("adlfs.account-key"), - sas_token=properties.get("adlfs.sas-token"), - tenant_id=properties.get("adlfs.tenant-id"), - client_id=properties.get("adlfs.client-id"), - client_secret=properties.get("adlfs.client-secret"), + connection_string=properties.get(ADLFS_CONNECTION_STRING), + 
account_name=properties.get(ADLFS_ACCOUNT_NAME), + account_key=properties.get(ADLFS_ACCOUNT_KEY), + sas_token=properties.get(ADLFS_SAS_TOKEN), + tenant_id=properties.get(ADLFS_TENANT_ID), + client_id=properties.get(ADLFS_CLIENT_ID), + client_secret=properties.get(ADLFS_ClIENT_SECRET), ) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 9216c37f15..e6490ae156 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -332,7 +332,7 @@ def parse_location(location: str) -> Tuple[str, str, str]: uri = urlparse(location) if not uri.scheme: return "file", uri.netloc, os.path.abspath(location) - elif uri.scheme == "hdfs": + elif uri.scheme in ("hdfs", "viewfs"): return uri.scheme, uri.netloc, uri.path else: return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" @@ -356,12 +356,12 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste client_kwargs["connect_timeout"] = float(connect_timeout) return S3FileSystem(**client_kwargs) - elif scheme == "hdfs": + elif scheme in ("hdfs", "viewfs"): from pyarrow.fs import HadoopFileSystem hdfs_kwargs: Dict[str, Any] = {} if netloc: - return HadoopFileSystem.from_uri(f"hdfs://{netloc}") + return HadoopFileSystem.from_uri(f"{scheme}://{netloc}") if host := self.properties.get(HDFS_HOST): hdfs_kwargs["host"] = host if port := self.properties.get(HDFS_PORT): @@ -469,15 +469,18 @@ def __setstate__(self, state: Dict[str, Any]) -> None: self.fs_by_scheme = lru_cache(self._initialize_fs) -def schema_to_pyarrow(schema: Union[Schema, IcebergType], metadata: Dict[bytes, bytes] = EMPTY_DICT) -> pa.schema: - return visit(schema, _ConvertToArrowSchema(metadata)) +def schema_to_pyarrow( + schema: Union[Schema, IcebergType], metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True +) -> pa.schema: + return visit(schema, _ConvertToArrowSchema(metadata, include_field_ids)) class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]): _metadata: Dict[bytes, bytes] - def __init__(self, metadata: Dict[bytes, bytes] = EMPTY_DICT) -> None: + def __init__(self, metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True) -> None: self._metadata = metadata + self._include_field_ids = include_field_ids def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema: return pa.schema(list(struct_result), metadata=self._metadata) @@ -486,18 +489,22 @@ def struct(self, _: StructType, field_results: List[pa.DataType]) -> pa.DataType return pa.struct(field_results) def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field: + metadata = {} + if field.doc: + metadata[PYARROW_FIELD_DOC_KEY] = field.doc + if self._include_field_ids: + metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id) + return pa.field( name=field.name, type=field_result, nullable=field.optional, - metadata={PYARROW_FIELD_DOC_KEY: field.doc, PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)} - if field.doc - else {PYARROW_PARQUET_FIELD_ID_KEY: str(field.field_id)}, + metadata=metadata, ) def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType: element_field = self.field(list_type.element_field, element_result) - return pa.list_(value_type=element_field) + return pa.large_list(value_type=element_field) def map(self, map_type: MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType: key_field = self.field(map_type.key_field, key_result) @@ -541,7 +548,7 @@ def visit_timestamptz(self, _: TimestamptzType) -> pa.DataType: return pa.timestamp(unit="us", 
tz="UTC") def visit_string(self, _: StringType) -> pa.DataType: - return pa.string() + return pa.large_string() def visit_uuid(self, _: UUIDType) -> pa.DataType: return pa.binary(16) @@ -648,12 +655,12 @@ def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedAr } -def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows: int) -> pa.Array: +def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array: if len(positional_deletes) == 1: all_chunks = positional_deletes[0] else: all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes])) - return np.setdiff1d(np.arange(rows), all_chunks, assume_unique=False) + return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index) def pyarrow_to_schema(schema: pa.Schema, name_mapping: Optional[NameMapping] = None) -> Schema: @@ -673,6 +680,10 @@ def _pyarrow_to_schema_without_ids(schema: pa.Schema) -> Schema: return visit_pyarrow(schema, _ConvertToIcebergWithoutIDs()) +def _pyarrow_schema_ensure_large_types(schema: pa.Schema) -> pa.Schema: + return visit_pyarrow(schema, _ConvertToLargeTypes()) + + @singledispatch def visit_pyarrow(obj: Union[pa.DataType, pa.Schema], visitor: PyArrowSchemaVisitor[T]) -> T: """Apply a pyarrow schema visitor to any point within a schema. @@ -945,6 +956,30 @@ def after_map_value(self, element: pa.Field) -> None: self._field_names.pop() +class _ConvertToLargeTypes(PyArrowSchemaVisitor[Union[pa.DataType, pa.Schema]]): + def schema(self, schema: pa.Schema, struct_result: pa.StructType) -> pa.Schema: + return pa.schema(struct_result) + + def struct(self, struct: pa.StructType, field_results: List[pa.Field]) -> pa.StructType: + return pa.struct(field_results) + + def field(self, field: pa.Field, field_result: pa.DataType) -> pa.Field: + return field.with_type(field_result) + + def list(self, list_type: pa.ListType, element_result: pa.DataType) -> pa.DataType: + return pa.large_list(element_result) + + def map(self, map_type: pa.MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType: + return pa.map_(key_result, value_result) + + def primitive(self, primitive: pa.DataType) -> pa.DataType: + if primitive == pa.string(): + return pa.large_string() + elif primitive == pa.binary(): + return pa.large_binary() + return primitive + + class _ConvertToIcebergWithoutIDs(_ConvertToIceberg): """ Converts PyArrowSchema to Iceberg Schema with all -1 ids. 
@@ -960,7 +995,7 @@ def _field_id(self, field: pa.Field) -> int: return -1 -def _task_to_table( +def _task_to_record_batches( fs: FileSystem, task: FileScanTask, bound_row_filter: BooleanExpression, @@ -968,9 +1003,8 @@ def _task_to_table( projected_field_ids: Set[int], positional_deletes: Optional[List[ChunkedArray]], case_sensitive: bool, - limit: Optional[int] = None, name_mapping: Optional[NameMapping] = None, -) -> Optional[pa.Table]: +) -> Iterator[pa.RecordBatch]: _, _, path = PyArrowFileIO.parse_location(task.file.file_path) arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8)) with fs.open_input_file(path) as fin: @@ -991,43 +1025,48 @@ def _task_to_table( fragment_scanner = ds.Scanner.from_fragment( fragment=fragment, - schema=physical_schema, + # We always use large types in memory as it uses larger offsets + # That can chunk more row values into the buffers + schema=_pyarrow_schema_ensure_large_types(physical_schema), # This will push down the query to Arrow. # But in case there are positional deletes, we have to apply them first filter=pyarrow_filter if not positional_deletes else None, columns=[col.name for col in file_project_schema.columns], ) - if positional_deletes: - # Create the mask of indices that we're interested in - indices = _combine_positional_deletes(positional_deletes, fragment.count_rows()) - - if limit: - if pyarrow_filter is not None: - # In case of the filter, we don't exactly know how many rows - # we need to fetch upfront, can be optimized in the future: - # https://github.com/apache/arrow/issues/35301 - arrow_table = fragment_scanner.take(indices) - arrow_table = arrow_table.filter(pyarrow_filter) - arrow_table = arrow_table.slice(0, limit) - else: - arrow_table = fragment_scanner.take(indices[0:limit]) - else: - arrow_table = fragment_scanner.take(indices) + current_index = 0 + batches = fragment_scanner.to_batches() + for batch in batches: + if positional_deletes: + # Create the mask of indices that we're interested in + indices = _combine_positional_deletes(positional_deletes, current_index, current_index + len(batch)) + batch = batch.take(indices) # Apply the user filter if pyarrow_filter is not None: + # we need to switch back and forth between RecordBatch and Table + # as Expression filter isn't yet supported in RecordBatch + # https://github.com/apache/arrow/issues/39220 + arrow_table = pa.Table.from_batches([batch]) arrow_table = arrow_table.filter(pyarrow_filter) - else: - # If there are no deletes, we can just take the head - # and the user-filter is already applied - if limit: - arrow_table = fragment_scanner.head(limit) - else: - arrow_table = fragment_scanner.to_table() + batch = arrow_table.to_batches()[0] + yield to_requested_schema(projected_schema, file_project_schema, batch) + current_index += len(batch) - if len(arrow_table) < 1: - return None - return to_requested_schema(projected_schema, file_project_schema, arrow_table) + +def _task_to_table( + fs: FileSystem, + task: FileScanTask, + bound_row_filter: BooleanExpression, + projected_schema: Schema, + projected_field_ids: Set[int], + positional_deletes: Optional[List[ChunkedArray]], + case_sensitive: bool, + name_mapping: Optional[NameMapping] = None, +) -> pa.Table: + batches = _task_to_record_batches( + fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping + ) + return pa.Table.from_batches(batches, schema=schema_to_pyarrow(projected_schema, include_field_ids=False)) def 
_read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: @@ -1106,7 +1145,6 @@ def project_table( projected_field_ids, deletes_per_file.get(task.file.file_path), case_sensitive, - limit, table_metadata.name_mapping(), ) for task in tasks @@ -1130,7 +1168,7 @@ def project_table( tables = [f.result() for f in completed_futures if f.result()] if len(tables) < 1: - return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema)) + return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema, include_field_ids=False)) result = pa.concat_tables(tables) @@ -1140,8 +1178,78 @@ def project_table( return result -def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table: - struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema)) +def project_batches( + tasks: Iterable[FileScanTask], + table_metadata: TableMetadata, + io: FileIO, + row_filter: BooleanExpression, + projected_schema: Schema, + case_sensitive: bool = True, + limit: Optional[int] = None, +) -> Iterator[pa.RecordBatch]: + """Resolve the right columns based on the identifier. + + Args: + tasks (Iterable[FileScanTask]): A URI or a path to a local file. + table_metadata (TableMetadata): The table metadata of the table that's being queried + io (FileIO): A FileIO to open streams to the object store + row_filter (BooleanExpression): The expression for filtering rows. + projected_schema (Schema): The output schema. + case_sensitive (bool): Case sensitivity when looking up column names. + limit (Optional[int]): Limit the number of records. + + Raises: + ResolveError: When an incompatible query is done. + """ + scheme, netloc, _ = PyArrowFileIO.parse_location(table_metadata.location) + if isinstance(io, PyArrowFileIO): + fs = io.fs_by_scheme(scheme, netloc) + else: + try: + from pyiceberg.io.fsspec import FsspecFileIO + + if isinstance(io, FsspecFileIO): + from pyarrow.fs import PyFileSystem + + fs = PyFileSystem(FSSpecHandler(io.get_fs(scheme))) + else: + raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") + except ModuleNotFoundError as e: + # When FsSpec is not installed + raise ValueError(f"Expected PyArrowFileIO or FsspecFileIO, got: {io}") from e + + bound_row_filter = bind(table_metadata.schema(), row_filter, case_sensitive=case_sensitive) + + projected_field_ids = { + id for id in projected_schema.field_ids if not isinstance(projected_schema.find_type(id), (MapType, ListType)) + }.union(extract_field_ids(bound_row_filter)) + + deletes_per_file = _read_all_delete_files(fs, tasks) + + total_row_count = 0 + + for task in tasks: + batches = _task_to_record_batches( + fs, + task, + bound_row_filter, + projected_schema, + projected_field_ids, + deletes_per_file.get(task.file.file_path), + case_sensitive, + table_metadata.name_mapping(), + ) + for batch in batches: + if limit is not None: + if total_row_count + len(batch) >= limit: + yield batch.slice(0, limit - total_row_count) + break + yield batch + total_row_count += len(batch) + + +def to_requested_schema(requested_schema: Schema, file_schema: Schema, batch: pa.RecordBatch) -> pa.RecordBatch: + struct_array = visit_with_partner(requested_schema, batch, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema)) arrays = [] fields = [] @@ -1149,7 +1257,7 @@ def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa array = struct_array.field(pos) arrays.append(array) 
fields.append(pa.field(field.name, array.type, field.optional)) - return pa.Table.from_arrays(arrays, schema=pa.schema(fields)) + return pa.RecordBatch.from_arrays(arrays, schema=pa.schema(fields)) class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]): @@ -1160,8 +1268,14 @@ def __init__(self, file_schema: Schema): def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: file_field = self.file_schema.find_field(field.field_id) - if field.field_type.is_primitive and field.field_type != file_field.field_type: - return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type))) + if field.field_type.is_primitive: + if field.field_type != file_field.field_type: + return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False)) + elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type: + # if file_field and field_type (e.g. String) are the same + # but the pyarrow type of the array is different from the expected type + # (e.g. string vs larger_string), we want to cast the array to the larger type + return values.cast(target_type) return values def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field: @@ -1188,7 +1302,7 @@ def struct( field_arrays.append(array) fields.append(self._construct_field(field, array.type)) elif field.optional: - arrow_type = schema_to_pyarrow(field.field_type) + arrow_type = schema_to_pyarrow(field.field_type, include_field_ids=False) field_arrays.append(pa.nulls(len(struct_array), type=arrow_type)) fields.append(self._construct_field(field, arrow_type)) else: @@ -1200,13 +1314,13 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional return field_array def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]: - if isinstance(list_array, pa.ListArray) and value_array is not None: + if isinstance(list_array, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)) and value_array is not None: if isinstance(value_array, pa.StructArray): # This can be removed once this has been fixed: # https://github.com/apache/arrow/issues/38809 - list_array = pa.ListArray.from_arrays(list_array.offsets, value_array) + list_array = pa.LargeListArray.from_arrays(list_array.offsets, value_array) - arrow_field = pa.list_(self._construct_field(list_type.element_field, value_array.type)) + arrow_field = pa.large_list(self._construct_field(list_type.element_field, value_array.type)) return list_array.cast(arrow_field) else: return None @@ -1250,13 +1364,15 @@ def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: st if isinstance(partner_struct, pa.StructArray): return partner_struct.field(name) - elif isinstance(partner_struct, pa.Table): - return partner_struct.column(name).combine_chunks() + elif isinstance(partner_struct, pa.RecordBatch): + return partner_struct.column(name) + else: + raise ValueError(f"Cannot find {name} in expected partner_struct type {type(partner_struct)}") return None def list_element_partner(self, partner_list: Optional[pa.Array]) -> Optional[pa.Array]: - return partner_list.values if isinstance(partner_list, pa.ListArray) else None + return partner_list.values if isinstance(partner_list, (pa.ListArray, pa.LargeListArray, pa.FixedSizeListArray)) else None def map_key_partner(self, partner_map: Optional[pa.Array]) -> Optional[pa.Array]: return partner_map.keys if 
isinstance(partner_map, pa.MapArray) else None @@ -1788,15 +1904,19 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT def write_parquet(task: WriteTask) -> DataFile: table_schema = task.schema - arrow_table = pa.Table.from_batches(task.record_batches) + # if schema needs to be transformed, use the transformed schema and adjust the arrow table accordingly # otherwise use the original schema if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema: file_schema = sanitized_schema - arrow_table = to_requested_schema(requested_schema=file_schema, file_schema=table_schema, table=arrow_table) else: file_schema = table_schema + batches = [ + to_requested_schema(requested_schema=file_schema, file_schema=table_schema, batch=batch) + for batch in task.record_batches + ] + arrow_table = pa.Table.from_batches(batches) file_path = f'{table_metadata.location}/data/{task.generate_data_file_path("parquet")}' fo = io.new_output(file_path) with fo.create(overwrite=True) as fos: diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 3b8138b61a..bf5749ce9b 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -18,6 +18,7 @@ import math from abc import ABC, abstractmethod +from copy import copy from enum import Enum from types import TracebackType from typing import ( @@ -30,13 +31,15 @@ Type, ) +from pydantic_core import to_json + from pyiceberg.avro.file import AvroFile, AvroOutputFile from pyiceberg.conversions import to_bytes from pyiceberg.exceptions import ValidationError from pyiceberg.io import FileIO, InputFile, OutputFile from pyiceberg.partitioning import PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.typedef import EMPTY_DICT, Record, TableVersion +from pyiceberg.typedef import Record, TableVersion from pyiceberg.types import ( BinaryType, BooleanType, @@ -644,7 +647,6 @@ class ManifestWriter(ABC): _output_file: OutputFile _writer: AvroOutputFile[ManifestEntry] _snapshot_id: int - _meta: Dict[str, str] _added_files: int _added_rows: int _existing_files: int @@ -654,15 +656,12 @@ class ManifestWriter(ABC): _min_data_sequence_number: Optional[int] _partitions: List[Record] - def __init__( - self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int, meta: Dict[str, str] = EMPTY_DICT - ) -> None: + def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None: self.closed = False self._spec = spec self._schema = schema self._output_file = output_file self._snapshot_id = snapshot_id - self._meta = meta self._added_files = 0 self._added_rows = 0 @@ -696,6 +695,15 @@ def content(self) -> ManifestContent: ... @abstractmethod def version(self) -> TableVersion: ... 
+ @property + def _meta(self) -> Dict[str, str]: + return { + "schema": self._schema.model_dump_json(), + "partition-spec": to_json(self._spec.fields).decode("utf-8"), + "partition-spec-id": str(self._spec.spec_id), + "format-version": str(self.version), + } + def _with_partition(self, format_version: TableVersion) -> Schema: data_file_type = data_file_with_partition( format_version=format_version, partition_type=self._spec.partition_type(self._schema) @@ -770,12 +778,6 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, schema, output_file, snapshot_id, - { - "schema": schema.model_dump_json(), - "partition-spec": spec.model_dump_json(), - "partition-spec-id": str(spec.spec_id), - "format-version": "1", - }, ) def content(self) -> ManifestContent: @@ -791,19 +793,7 @@ def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: class ManifestWriterV2(ManifestWriter): def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int): - super().__init__( - spec, - schema, - output_file, - snapshot_id, - meta={ - "schema": schema.model_dump_json(), - "partition-spec": spec.model_dump_json(), - "partition-spec-id": str(spec.spec_id), - "format-version": "2", - "content": "data", - }, - ) + super().__init__(spec, schema, output_file, snapshot_id) def content(self) -> ManifestContent: return ManifestContent.DATA @@ -812,6 +802,13 @@ def content(self) -> ManifestContent: def version(self) -> TableVersion: return 2 + @property + def _meta(self) -> Dict[str, str]: + return { + **super()._meta, + "content": "data", + } + def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: if entry.data_sequence_number is None: if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id: @@ -909,7 +906,7 @@ def __init__(self, output_file: OutputFile, snapshot_id: int, parent_snapshot_id self._sequence_number = sequence_number def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: - wrapped_manifest_file = ManifestFile(*manifest_file.record_fields()) + wrapped_manifest_file = copy(manifest_file) if wrapped_manifest_file.sequence_number == UNASSIGNED_SEQ: # if the sequence number is being assigned here, then the manifest must be created by the current operation. 
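Instead of freezing the manifest metadata in `__init__`, the writers above now derive it on demand: the base `_meta` property serialises the current schema and the partition-spec fields (previously the whole spec object), and `ManifestWriterV2` layers `"content": "data"` on top. A sketch of what that evaluates to for an unpartitioned v2 writer, reusing the same `Schema`, `PartitionSpec` and `pydantic_core.to_json` calls as the hunk above (the schema itself is made up):

```python
from pydantic_core import to_json

from pyiceberg.partitioning import PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.types import LongType, NestedField

schema = Schema(NestedField(field_id=1, name="id", field_type=LongType(), required=True))
spec = PartitionSpec()  # unpartitioned

meta = {
    "schema": schema.model_dump_json(),
    "partition-spec": to_json(spec.fields).decode("utf-8"),  # "[]" for an unpartitioned spec
    "partition-spec-id": str(spec.spec_id),                  # "0"
    "format-version": "2",
    "content": "data",                                       # added only by the v2 writer
}
print(meta["partition-spec"], meta["partition-spec-id"])
```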
diff --git a/pyiceberg/partitioning.py b/pyiceberg/partitioning.py index a3cf255341..da52d5df8e 100644 --- a/pyiceberg/partitioning.py +++ b/pyiceberg/partitioning.py @@ -229,11 +229,11 @@ def partition_to_path(self, data: Record, schema: Schema) -> str: field_strs = [] value_strs = [] - for pos, value in enumerate(data.record_fields()): + for pos in range(len(self.fields)): partition_field = self.fields[pos] - value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=value) + value_str = partition_field.transform.to_human_string(field_types[pos].field_type, value=data[pos]) - value_str = quote(value_str, safe='') + value_str = quote(value_str, safe="") value_strs.append(value_str) field_strs.append(partition_field.name) @@ -387,7 +387,7 @@ def partition(self) -> Record: # partition key transformed with iceberg interna for raw_partition_field_value in self.raw_partition_field_values: partition_fields = self.partition_spec.source_id_to_fields_map[raw_partition_field_value.field.source_id] if len(partition_fields) != 1: - raise ValueError("partition_fields must contain exactly one field.") + raise ValueError(f"Cannot have redundant partitions: {partition_fields}") partition_field = partition_fields[0] iceberg_typed_key_values[partition_field.name] = partition_record_value( partition_field=partition_field, diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index b2739d8618..77f1addbf5 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1311,11 +1311,11 @@ def _valid_avro_name(name: str) -> bool: length = len(name) assert length > 0, ValueError("Can not validate empty avro name") first = name[0] - if not (first.isalpha() or first == '_'): + if not (first.isalpha() or first == "_"): return False for character in name[1:]: - if not (character.isalnum() or character == '_'): + if not (character.isalnum() or character == "_"): return False return True @@ -1323,17 +1323,17 @@ def _valid_avro_name(name: str) -> bool: def _sanitize_name(name: str) -> str: sb = [] first = name[0] - if not (first.isalpha() or first == '_'): + if not (first.isalpha() or first == "_"): sb.append(_sanitize_char(first)) else: sb.append(first) for character in name[1:]: - if not (character.isalnum() or character == '_'): + if not (character.isalnum() or character == "_"): sb.append(_sanitize_char(character)) else: sb.append(character) - return ''.join(sb) + return "".join(sb) def _sanitize_char(character: str) -> str: diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 2a7b4d9ab7..b6305455b9 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -71,6 +71,7 @@ ManifestEntry, ManifestEntryStatus, ManifestFile, + PartitionFieldSummary, write_manifest, write_manifest_list, ) @@ -112,6 +113,7 @@ SnapshotLogEntry, SnapshotSummaryCollector, Summary, + ancestors_of, update_snapshot_summaries, ) from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder @@ -137,6 +139,7 @@ ) from pyiceberg.utils.concurrent import ExecutorFactory from pyiceberg.utils.datetime import datetime_to_millis +from pyiceberg.utils.deprecated import deprecated from pyiceberg.utils.singleton import _convert_to_hashable_type if TYPE_CHECKING: @@ -350,6 +353,88 @@ def set_properties(self, properties: Properties = EMPTY_DICT, **kwargs: Any) -> updates = properties or kwargs return self._apply((SetPropertiesUpdate(updates=updates),)) + @deprecated( + deprecated_in="0.7.0", + removed_in="0.8.0", + help_message="Please use one of the functions in ManageSnapshots 
instead", + ) + def add_snapshot(self, snapshot: Snapshot) -> Transaction: + """Add a new snapshot to the table. + + Returns: + The transaction with the add-snapshot staged. + """ + updates = (AddSnapshotUpdate(snapshot=snapshot),) + + return self._apply(updates, ()) + + @deprecated( + deprecated_in="0.7.0", + removed_in="0.8.0", + help_message="Please use one of the functions in ManageSnapshots instead", + ) + def set_ref_snapshot( + self, + snapshot_id: int, + parent_snapshot_id: Optional[int], + ref_name: str, + type: str, + max_ref_age_ms: Optional[int] = None, + max_snapshot_age_ms: Optional[int] = None, + min_snapshots_to_keep: Optional[int] = None, + ) -> Transaction: + """Update a ref to a snapshot. + + Returns: + The transaction with the set-snapshot-ref staged + """ + updates = ( + SetSnapshotRefUpdate( + snapshot_id=snapshot_id, + ref_name=ref_name, + type=type, + max_ref_age_ms=max_ref_age_ms, + max_snapshot_age_ms=max_snapshot_age_ms, + min_snapshots_to_keep=min_snapshots_to_keep, + ), + ) + + requirements = (AssertRefSnapshotId(snapshot_id=parent_snapshot_id, ref="main"),) + return self._apply(updates, requirements) + + def _set_ref_snapshot( + self, + snapshot_id: int, + ref_name: str, + type: str, + max_ref_age_ms: Optional[int] = None, + max_snapshot_age_ms: Optional[int] = None, + min_snapshots_to_keep: Optional[int] = None, + ) -> UpdatesAndRequirements: + """Update a ref to a snapshot. + + Returns: + The updates and requirements for the set-snapshot-ref staged + """ + updates = ( + SetSnapshotRefUpdate( + snapshot_id=snapshot_id, + ref_name=ref_name, + type=type, + max_ref_age_ms=max_ref_age_ms, + max_snapshot_age_ms=max_snapshot_age_ms, + min_snapshots_to_keep=min_snapshots_to_keep, + ), + ) + requirements = ( + AssertRefSnapshotId( + snapshot_id=self.table_metadata.refs[ref_name].snapshot_id if ref_name in self.table_metadata.refs else None, + ref=ref_name, + ), + ) + + return updates, requirements + def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema: """Create a new UpdateSchema to alter the columns of this table. @@ -391,10 +476,11 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) if not isinstance(df, pa.Table): raise ValueError(f"Expected PyArrow table, got: {df}") - supported_transforms = {IdentityTransform} - if not all(type(field.transform) in supported_transforms for field in self.table_metadata.spec().fields): + if unsupported_partitions := [ + field for field in self.table_metadata.spec().fields if not field.transform.supports_pyarrow_transform + ]: raise ValueError( - f"All transforms are not supported, expected: {supported_transforms}, but get: {[str(field) for field in self.table_metadata.spec().fields if field.transform not in supported_transforms]}." + f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}." ) _check_schema_compatible(self._table.schema(), other_schema=df.schema) @@ -509,6 +595,7 @@ def commit_transaction(self) -> Table: The table with the updates applied. """ if len(self._updates) > 0: + self._requirements += (AssertTableUUID(uuid=self.table_metadata.table_uuid),) self._table._do_commit( # pylint: disable=W0212 updates=self._updates, requirements=self._requirements, @@ -563,21 +650,25 @@ def commit_transaction(self) -> Table: The table with the updates applied. 
""" self._requirements = (AssertCreate(),) - return super().commit_transaction() + self._table._do_commit( # pylint: disable=W0212 + updates=self._updates, + requirements=self._requirements, + ) + return self._table class AssignUUIDUpdate(IcebergBaseModel): - action: Literal['assign-uuid'] = Field(default="assign-uuid") + action: Literal["assign-uuid"] = Field(default="assign-uuid") uuid: uuid.UUID class UpgradeFormatVersionUpdate(IcebergBaseModel): - action: Literal['upgrade-format-version'] = Field(default="upgrade-format-version") + action: Literal["upgrade-format-version"] = Field(default="upgrade-format-version") format_version: int = Field(alias="format-version") class AddSchemaUpdate(IcebergBaseModel): - action: Literal['add-schema'] = Field(default="add-schema") + action: Literal["add-schema"] = Field(default="add-schema") schema_: Schema = Field(alias="schema") # This field is required: https://github.com/apache/iceberg/pull/7445 last_column_id: int = Field(alias="last-column-id") @@ -586,47 +677,47 @@ class AddSchemaUpdate(IcebergBaseModel): class SetCurrentSchemaUpdate(IcebergBaseModel): - action: Literal['set-current-schema'] = Field(default="set-current-schema") + action: Literal["set-current-schema"] = Field(default="set-current-schema") schema_id: int = Field( alias="schema-id", description="Schema ID to set as current, or -1 to set last added schema", default=-1 ) class AddPartitionSpecUpdate(IcebergBaseModel): - action: Literal['add-spec'] = Field(default="add-spec") + action: Literal["add-spec"] = Field(default="add-spec") spec: PartitionSpec initial_change: bool = Field(default=False, exclude=True) class SetDefaultSpecUpdate(IcebergBaseModel): - action: Literal['set-default-spec'] = Field(default="set-default-spec") + action: Literal["set-default-spec"] = Field(default="set-default-spec") spec_id: int = Field( alias="spec-id", description="Partition spec ID to set as the default, or -1 to set last added spec", default=-1 ) class AddSortOrderUpdate(IcebergBaseModel): - action: Literal['add-sort-order'] = Field(default="add-sort-order") + action: Literal["add-sort-order"] = Field(default="add-sort-order") sort_order: SortOrder = Field(alias="sort-order") initial_change: bool = Field(default=False, exclude=True) class SetDefaultSortOrderUpdate(IcebergBaseModel): - action: Literal['set-default-sort-order'] = Field(default="set-default-sort-order") + action: Literal["set-default-sort-order"] = Field(default="set-default-sort-order") sort_order_id: int = Field( alias="sort-order-id", description="Sort order ID to set as the default, or -1 to set last added sort order", default=-1 ) class AddSnapshotUpdate(IcebergBaseModel): - action: Literal['add-snapshot'] = Field(default="add-snapshot") + action: Literal["add-snapshot"] = Field(default="add-snapshot") snapshot: Snapshot class SetSnapshotRefUpdate(IcebergBaseModel): - action: Literal['set-snapshot-ref'] = Field(default="set-snapshot-ref") + action: Literal["set-snapshot-ref"] = Field(default="set-snapshot-ref") ref_name: str = Field(alias="ref-name") type: Literal["tag", "branch"] snapshot_id: int = Field(alias="snapshot-id") @@ -636,31 +727,31 @@ class SetSnapshotRefUpdate(IcebergBaseModel): class RemoveSnapshotsUpdate(IcebergBaseModel): - action: Literal['remove-snapshots'] = Field(default="remove-snapshots") + action: Literal["remove-snapshots"] = Field(default="remove-snapshots") snapshot_ids: List[int] = Field(alias="snapshot-ids") class RemoveSnapshotRefUpdate(IcebergBaseModel): - action: Literal['remove-snapshot-ref'] 
= Field(default="remove-snapshot-ref") + action: Literal["remove-snapshot-ref"] = Field(default="remove-snapshot-ref") ref_name: str = Field(alias="ref-name") class SetLocationUpdate(IcebergBaseModel): - action: Literal['set-location'] = Field(default="set-location") + action: Literal["set-location"] = Field(default="set-location") location: str class SetPropertiesUpdate(IcebergBaseModel): - action: Literal['set-properties'] = Field(default="set-properties") + action: Literal["set-properties"] = Field(default="set-properties") updates: Dict[str, str] - @field_validator('updates', mode='before') + @field_validator("updates", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) class RemovePropertiesUpdate(IcebergBaseModel): - action: Literal['remove-properties'] = Field(default="remove-properties") + action: Literal["remove-properties"] = Field(default="remove-properties") removals: List[str] @@ -682,7 +773,7 @@ class RemovePropertiesUpdate(IcebergBaseModel): SetPropertiesUpdate, RemovePropertiesUpdate, ], - Field(discriminator='action'), + Field(discriminator="action"), ] @@ -1141,7 +1232,7 @@ def validate(self, base_metadata: Optional[TableMetadata]) -> None: AssertDefaultSpecId, AssertDefaultSortOrderId, ], - Field(discriminator='type'), + Field(discriminator="type"), ] UpdatesAndRequirements = Tuple[Tuple[TableUpdate, ...], Tuple[TableRequirement, ...]] @@ -1152,7 +1243,7 @@ class Namespace(IcebergRootModel[List[str]]): root: List[str] = Field( ..., - description='Reference to one or more levels of a namespace', + description="Reference to one or more levels of a namespace", ) @@ -1300,10 +1391,37 @@ def snapshot_by_name(self, name: str) -> Optional[Snapshot]: return self.snapshot_by_id(ref.snapshot_id) return None + def snapshot_as_of_timestamp(self, timestamp_ms: int, inclusive: bool = True) -> Optional[Snapshot]: + """Get the snapshot that was current as of or right before the given timestamp, or None if there is no matching snapshot. + + Args: + timestamp_ms: Find snapshot that was current at/before this timestamp + inclusive: Includes timestamp_ms in search when True. Excludes timestamp_ms when False + """ + for log_entry in reversed(self.history()): + if (inclusive and log_entry.timestamp_ms <= timestamp_ms) or log_entry.timestamp_ms < timestamp_ms: + return self.snapshot_by_id(log_entry.snapshot_id) + return None + def history(self) -> List[SnapshotLogEntry]: """Get the snapshot history of this table.""" return self.metadata.snapshot_log + def manage_snapshots(self) -> ManageSnapshots: + """ + Shorthand to run snapshot management operations like create branch, create tag, etc. + + Use table.manage_snapshots().().commit() to run a specific operation. + Use table.manage_snapshots().().().commit() to run multiple operations. + Pending changes are applied on commit. + + We can also use context managers to make more changes. For example, + + with table.manage_snapshots() as ms: + ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B") + """ + return ManageSnapshots(transaction=Transaction(self, autocommit=True)) + def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema: """Create a new UpdateSchema to alter the columns of this table. 
@@ -1761,6 +1879,24 @@ def to_arrow(self) -> pa.Table: limit=self.limit, ) + def to_arrow_batch_reader(self) -> pa.RecordBatchReader: + import pyarrow as pa + + from pyiceberg.io.pyarrow import project_batches, schema_to_pyarrow + + return pa.RecordBatchReader.from_batches( + schema_to_pyarrow(self.projection()), + project_batches( + self.plan_files(), + self.table_metadata, + self.io, + self.row_filter, + self.projection(), + case_sensitive=self.case_sensitive, + limit=self.limit, + ), + ) + def to_pandas(self, **kwargs: Any) -> pd.DataFrame: return self.to_arrow().to_pandas(**kwargs) @@ -1792,7 +1928,7 @@ class Move: other_field_id: Optional[int] = None -U = TypeVar('U') +U = TypeVar("U") class UpdateTableMetadata(ABC, Generic[U]): @@ -1816,6 +1952,84 @@ def __enter__(self) -> U: return self # type: ignore +class ManageSnapshots(UpdateTableMetadata["ManageSnapshots"]): + """ + Run snapshot management operations using APIs. + + APIs include create branch, create tag, etc. + + Use table.manage_snapshots().().commit() to run a specific operation. + Use table.manage_snapshots().().().commit() to run multiple operations. + Pending changes are applied on commit. + + We can also use context managers to make more changes. For example, + + with table.manage_snapshots() as ms: + ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B") + """ + + _updates: Tuple[TableUpdate, ...] = () + _requirements: Tuple[TableRequirement, ...] = () + + def _commit(self) -> UpdatesAndRequirements: + """Apply the pending changes and commit.""" + return self._updates, self._requirements + + def create_tag(self, snapshot_id: int, tag_name: str, max_ref_age_ms: Optional[int] = None) -> ManageSnapshots: + """ + Create a new tag pointing to the given snapshot id. + + Args: + snapshot_id (int): snapshot id of the existing snapshot to tag + tag_name (str): name of the tag + max_ref_age_ms (Optional[int]): max ref age in milliseconds + + Returns: + This for method chaining + """ + update, requirement = self._transaction._set_ref_snapshot( + snapshot_id=snapshot_id, + ref_name=tag_name, + type="tag", + max_ref_age_ms=max_ref_age_ms, + ) + self._updates += update + self._requirements += requirement + return self + + def create_branch( + self, + snapshot_id: int, + branch_name: str, + max_ref_age_ms: Optional[int] = None, + max_snapshot_age_ms: Optional[int] = None, + min_snapshots_to_keep: Optional[int] = None, + ) -> ManageSnapshots: + """ + Create a new branch pointing to the given snapshot id. + + Args: + snapshot_id (int): snapshot id of existing snapshot at which the branch is created. 
+ branch_name (str): name of the new branch + max_ref_age_ms (Optional[int]): max ref age in milliseconds + max_snapshot_age_ms (Optional[int]): max age of snapshots to keep in milliseconds + min_snapshots_to_keep (Optional[int]): min number of snapshots to keep in milliseconds + Returns: + This for method chaining + """ + update, requirement = self._transaction._set_ref_snapshot( + snapshot_id=snapshot_id, + ref_name=branch_name, + type="branch", + max_ref_age_ms=max_ref_age_ms, + max_snapshot_age_ms=max_snapshot_age_ms, + min_snapshots_to_keep=min_snapshots_to_keep, + ) + self._updates += update + self._requirements += requirement + return self + + class UpdateSchema(UpdateTableMetadata["UpdateSchema"]): _schema: Schema _last_column_id: itertools.count[int] @@ -2681,13 +2895,13 @@ class AddFileTask: def _new_manifest_path(location: str, num: int, commit_uuid: uuid.UUID) -> str: - return f'{location}/metadata/{commit_uuid}-m{num}.avro' + return f"{location}/metadata/{commit_uuid}-m{num}.avro" def _generate_manifest_list_path(location: str, snapshot_id: int, attempt: int, commit_uuid: uuid.UUID) -> str: # Mimics the behavior in Java: # https://github.com/apache/iceberg/blob/c862b9177af8e2d83122220764a056f3b96fd00c/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L491 - return f'{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro' + return f"{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" def _dataframe_to_data_files( @@ -2905,10 +3119,7 @@ def _commit(self) -> UpdatesAndRequirements: snapshot_id=self._snapshot_id, parent_snapshot_id=self._parent_snapshot_id, ref_name="main", type="branch" ), ), - ( - AssertTableUUID(uuid=self._transaction.table_metadata.table_uuid), - AssertRefSnapshotId(snapshot_id=self._parent_snapshot_id, ref="main"), - ), + (AssertRefSnapshotId(snapshot_id=self._parent_snapshot_id, ref="main"),), ) @@ -3241,7 +3452,7 @@ def _partition_field(self, transform_key: Tuple[int, Transform[Any, Any]], name: new_field_id = self._new_field_id() if name is None: - tmp_field = PartitionField(transform_key[0], new_field_id, transform_key[1], 'unassigned_field_name') + tmp_field = PartitionField(transform_key[0], new_field_id, transform_key[1], "unassigned_field_name") name = _visit_partition_field(self._transaction.table_metadata.schema(), tmp_field, _PartitionNameGenerator()) return PartitionField(transform_key[0], new_field_id, transform_key[1], name) @@ -3280,12 +3491,12 @@ def snapshots(self) -> "pa.Table": import pyarrow as pa snapshots_schema = pa.schema([ - pa.field('committed_at', pa.timestamp(unit='ms'), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('parent_id', pa.int64(), nullable=True), - pa.field('operation', pa.string(), nullable=True), - pa.field('manifest_list', pa.string(), nullable=False), - pa.field('summary', pa.map_(pa.string(), pa.string()), nullable=True), + pa.field("committed_at", pa.timestamp(unit="ms"), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("parent_id", pa.int64(), nullable=True), + pa.field("operation", pa.string(), nullable=True), + pa.field("manifest_list", pa.string(), nullable=False), + pa.field("summary", pa.map_(pa.string(), pa.string()), nullable=True), ]) snapshots = [] for snapshot in self.tbl.metadata.snapshots: @@ -3297,12 +3508,12 @@ def snapshots(self) -> "pa.Table": additional_properties = None snapshots.append({ - 'committed_at': datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0), - 'snapshot_id': 
snapshot.snapshot_id, - 'parent_id': snapshot.parent_snapshot_id, - 'operation': str(operation), - 'manifest_list': snapshot.manifest_list, - 'summary': additional_properties, + "committed_at": datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0), + "snapshot_id": snapshot.snapshot_id, + "parent_id": snapshot.parent_snapshot_id, + "operation": str(operation), + "manifest_list": snapshot.manifest_list, + "summary": additional_properties, }) return pa.Table.from_pylist( @@ -3339,33 +3550,33 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: pa_record_struct = schema_to_pyarrow(partition_record) entries_schema = pa.schema([ - pa.field('status', pa.int8(), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('sequence_number', pa.int64(), nullable=False), - pa.field('file_sequence_number', pa.int64(), nullable=False), + pa.field("status", pa.int8(), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("sequence_number", pa.int64(), nullable=False), + pa.field("file_sequence_number", pa.int64(), nullable=False), pa.field( - 'data_file', + "data_file", pa.struct([ - pa.field('content', pa.int8(), nullable=False), - pa.field('file_path', pa.string(), nullable=False), - pa.field('file_format', pa.string(), nullable=False), - pa.field('partition', pa_record_struct, nullable=False), - pa.field('record_count', pa.int64(), nullable=False), - pa.field('file_size_in_bytes', pa.int64(), nullable=False), - pa.field('column_sizes', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('null_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('nan_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True), - pa.field('lower_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field('upper_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True), - pa.field('key_metadata', pa.binary(), nullable=True), - pa.field('split_offsets', pa.list_(pa.int64()), nullable=True), - pa.field('equality_ids', pa.list_(pa.int32()), nullable=True), - pa.field('sort_order_id', pa.int32(), nullable=True), + pa.field("content", pa.int8(), nullable=False), + pa.field("file_path", pa.string(), nullable=False), + pa.field("file_format", pa.string(), nullable=False), + pa.field("partition", pa_record_struct, nullable=False), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_size_in_bytes", pa.int64(), nullable=False), + pa.field("column_sizes", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("null_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("nan_value_counts", pa.map_(pa.int32(), pa.int64()), nullable=True), + pa.field("lower_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("upper_bounds", pa.map_(pa.int32(), pa.binary()), nullable=True), + pa.field("key_metadata", pa.binary(), nullable=True), + pa.field("split_offsets", pa.list_(pa.int64()), nullable=True), + pa.field("equality_ids", pa.list_(pa.int32()), nullable=True), + pa.field("sort_order_id", pa.int32(), nullable=True), ]), nullable=False, ), - pa.field('readable_metrics', pa.struct(readable_metrics_struct), nullable=True), + pa.field("readable_metrics", pa.struct(readable_metrics_struct), nullable=True), ]) entries = [] @@ -3402,11 +3613,11 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> 
pa.StructType: } entries.append({ - 'status': entry.status.value, - 'snapshot_id': entry.snapshot_id, - 'sequence_number': entry.data_sequence_number, - 'file_sequence_number': entry.file_sequence_number, - 'data_file': { + "status": entry.status.value, + "snapshot_id": entry.snapshot_id, + "sequence_number": entry.data_sequence_number, + "file_sequence_number": entry.file_sequence_number, + "data_file": { "content": entry.data_file.content, "file_path": entry.data_file.file_path, "file_format": entry.data_file.file_format, @@ -3425,7 +3636,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: "sort_order_id": entry.data_file.sort_order_id, "spec_id": entry.data_file.spec_id, }, - 'readable_metrics': readable_metrics, + "readable_metrics": readable_metrics, }) return pa.Table.from_pylist( @@ -3437,24 +3648,24 @@ def refs(self) -> "pa.Table": import pyarrow as pa ref_schema = pa.schema([ - pa.field('name', pa.string(), nullable=False), - pa.field('type', pa.dictionary(pa.int32(), pa.string()), nullable=False), - pa.field('snapshot_id', pa.int64(), nullable=False), - pa.field('max_reference_age_in_ms', pa.int64(), nullable=True), - pa.field('min_snapshots_to_keep', pa.int32(), nullable=True), - pa.field('max_snapshot_age_in_ms', pa.int64(), nullable=True), + pa.field("name", pa.string(), nullable=False), + pa.field("type", pa.dictionary(pa.int32(), pa.string()), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("max_reference_age_in_ms", pa.int64(), nullable=True), + pa.field("min_snapshots_to_keep", pa.int32(), nullable=True), + pa.field("max_snapshot_age_in_ms", pa.int64(), nullable=True), ]) ref_results = [] for ref in self.tbl.metadata.refs: if snapshot_ref := self.tbl.metadata.refs.get(ref): ref_results.append({ - 'name': ref, - 'type': snapshot_ref.snapshot_ref_type.upper(), - 'snapshot_id': snapshot_ref.snapshot_id, - 'max_reference_age_in_ms': snapshot_ref.max_ref_age_ms, - 'min_snapshots_to_keep': snapshot_ref.min_snapshots_to_keep, - 'max_snapshot_age_in_ms': snapshot_ref.max_snapshot_age_ms, + "name": ref, + "type": snapshot_ref.snapshot_ref_type.upper(), + "snapshot_id": snapshot_ref.snapshot_id, + "max_reference_age_in_ms": snapshot_ref.max_ref_age_ms, + "min_snapshots_to_keep": snapshot_ref.min_snapshots_to_keep, + "max_snapshot_age_in_ms": snapshot_ref.max_snapshot_age_ms, }) return pa.Table.from_pylist(ref_results, schema=ref_schema) @@ -3465,15 +3676,15 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": from pyiceberg.io.pyarrow import schema_to_pyarrow table_schema = pa.schema([ - pa.field('record_count', pa.int64(), nullable=False), - pa.field('file_count', pa.int32(), nullable=False), - pa.field('total_data_file_size_in_bytes', pa.int64(), nullable=False), - pa.field('position_delete_record_count', pa.int64(), nullable=False), - pa.field('position_delete_file_count', pa.int32(), nullable=False), - pa.field('equality_delete_record_count', pa.int64(), nullable=False), - pa.field('equality_delete_file_count', pa.int32(), nullable=False), - pa.field('last_updated_at', pa.timestamp(unit='ms'), nullable=True), - pa.field('last_updated_snapshot_id', pa.int64(), nullable=True), + pa.field("record_count", pa.int64(), nullable=False), + pa.field("file_count", pa.int32(), nullable=False), + pa.field("total_data_file_size_in_bytes", pa.int64(), nullable=False), + pa.field("position_delete_record_count", pa.int64(), nullable=False), + pa.field("position_delete_file_count", pa.int32(), nullable=False), + 
pa.field("equality_delete_record_count", pa.int64(), nullable=False), + pa.field("equality_delete_file_count", pa.int32(), nullable=False), + pa.field("last_updated_at", pa.timestamp(unit="ms"), nullable=True), + pa.field("last_updated_snapshot_id", pa.int64(), nullable=True), ]) partition_record = self.tbl.metadata.specs_struct() @@ -3482,8 +3693,8 @@ def partitions(self, snapshot_id: Optional[int] = None) -> "pa.Table": if has_partitions: pa_record_struct = schema_to_pyarrow(partition_record) partitions_schema = pa.schema([ - pa.field('partition', pa_record_struct, nullable=False), - pa.field('spec_id', pa.int32(), nullable=False), + pa.field("partition", pa_record_struct, nullable=False), + pa.field("spec_id", pa.int32(), nullable=False), ]) table_schema = pa.unify_schemas([partitions_schema, table_schema]) @@ -3547,7 +3758,155 @@ def update_partitions_map( schema=table_schema, ) - def files(self, snapshot_id: Optional[int] = None) -> "pa.Table": + def manifests(self) -> "pa.Table": + import pyarrow as pa + from pyiceberg.conversions import from_bytes + + partition_summary_schema = pa.struct([ + pa.field("contains_null", pa.bool_(), nullable=False), + pa.field("contains_nan", pa.bool_(), nullable=True), + pa.field("lower_bound", pa.string(), nullable=True), + pa.field("upper_bound", pa.string(), nullable=True), + ]) + + manifest_schema = pa.schema([ + pa.field("content", pa.int8(), nullable=False), + pa.field("path", pa.string(), nullable=False), + pa.field("length", pa.int64(), nullable=False), + pa.field("partition_spec_id", pa.int32(), nullable=False), + pa.field("added_snapshot_id", pa.int64(), nullable=False), + pa.field("added_data_files_count", pa.int32(), nullable=False), + pa.field("existing_data_files_count", pa.int32(), nullable=False), + pa.field("deleted_data_files_count", pa.int32(), nullable=False), + pa.field("added_delete_files_count", pa.int32(), nullable=False), + pa.field("existing_delete_files_count", pa.int32(), nullable=False), + pa.field("deleted_delete_files_count", pa.int32(), nullable=False), + pa.field("partition_summaries", pa.list_(partition_summary_schema), nullable=False), + ]) + + def _partition_summaries_to_rows( + spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary] + ) -> List[Dict[str, Any]]: + rows = [] + for i, field_summary in enumerate(partition_summaries): + field = spec.fields[i] + partition_field_type = spec.partition_type(self.tbl.schema()).fields[i].field_type + lower_bound = ( + ( + field.transform.to_human_string( + partition_field_type, from_bytes(partition_field_type, field_summary.lower_bound) + ) + ) + if field_summary.lower_bound + else None + ) + upper_bound = ( + ( + field.transform.to_human_string( + partition_field_type, from_bytes(partition_field_type, field_summary.upper_bound) + ) + ) + if field_summary.upper_bound + else None + ) + rows.append({ + "contains_null": field_summary.contains_null, + "contains_nan": field_summary.contains_nan, + "lower_bound": lower_bound, + "upper_bound": upper_bound, + }) + return rows + + specs = self.tbl.metadata.specs() + manifests = [] + if snapshot := self.tbl.metadata.current_snapshot(): + for manifest in snapshot.manifests(self.tbl.io): + is_data_file = manifest.content == ManifestContent.DATA + is_delete_file = manifest.content == ManifestContent.DELETES + manifests.append({ + "content": manifest.content, + "path": manifest.manifest_path, + "length": manifest.manifest_length, + "partition_spec_id": manifest.partition_spec_id, + "added_snapshot_id": 
manifest.added_snapshot_id, + "added_data_files_count": manifest.added_files_count if is_data_file else 0, + "existing_data_files_count": manifest.existing_files_count if is_data_file else 0, + "deleted_data_files_count": manifest.deleted_files_count if is_data_file else 0, + "added_delete_files_count": manifest.added_files_count if is_delete_file else 0, + "existing_delete_files_count": manifest.existing_files_count if is_delete_file else 0, + "deleted_delete_files_count": manifest.deleted_files_count if is_delete_file else 0, + "partition_summaries": _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions) + if manifest.partitions + else [], + }) + + return pa.Table.from_pylist( + manifests, + schema=manifest_schema, + ) + + def metadata_log_entries(self) -> "pa.Table": + import pyarrow as pa + + from pyiceberg.table.snapshots import MetadataLogEntry + + table_schema = pa.schema([ + pa.field("timestamp", pa.timestamp(unit="ms"), nullable=False), + pa.field("file", pa.string(), nullable=False), + pa.field("latest_snapshot_id", pa.int64(), nullable=True), + pa.field("latest_schema_id", pa.int32(), nullable=True), + pa.field("latest_sequence_number", pa.int64(), nullable=True), + ]) + + def metadata_log_entry_to_row(metadata_entry: MetadataLogEntry) -> Dict[str, Any]: + latest_snapshot = self.tbl.snapshot_as_of_timestamp(metadata_entry.timestamp_ms) + return { + "timestamp": metadata_entry.timestamp_ms, + "file": metadata_entry.metadata_file, + "latest_snapshot_id": latest_snapshot.snapshot_id if latest_snapshot else None, + "latest_schema_id": latest_snapshot.schema_id if latest_snapshot else None, + "latest_sequence_number": latest_snapshot.sequence_number if latest_snapshot else None, + } + + # similar to MetadataLogEntriesTable in Java + # https://github.com/apache/iceberg/blob/8a70fe0ff5f241aec8856f8091c77fdce35ad256/core/src/main/java/org/apache/iceberg/MetadataLogEntriesTable.java#L62-L66 + metadata_log_entries = self.tbl.metadata.metadata_log + [ + MetadataLogEntry(metadata_file=self.tbl.metadata_location, timestamp_ms=self.tbl.metadata.last_updated_ms) + ] + + return pa.Table.from_pylist( + [metadata_log_entry_to_row(entry) for entry in metadata_log_entries], + schema=table_schema, + ) + + def history(self) -> "pa.Table": + import pyarrow as pa + + history_schema = pa.schema([ + pa.field("made_current_at", pa.timestamp(unit="ms"), nullable=False), + pa.field("snapshot_id", pa.int64(), nullable=False), + pa.field("parent_id", pa.int64(), nullable=True), + pa.field("is_current_ancestor", pa.bool_(), nullable=False), + ]) + + ancestors_ids = {snapshot.snapshot_id for snapshot in ancestors_of(self.tbl.current_snapshot(), self.tbl.metadata)} + + history = [] + metadata = self.tbl.metadata + + for snapshot_entry in metadata.snapshot_log: + snapshot = metadata.snapshot_by_id(snapshot_entry.snapshot_id) + + history.append({ + "made_current_at": datetime.utcfromtimestamp(snapshot_entry.timestamp_ms / 1000.0), + "snapshot_id": snapshot_entry.snapshot_id, + "parent_id": snapshot.parent_snapshot_id if snapshot else None, + "is_current_ancestor": snapshot_entry.snapshot_id in ancestors_ids, + }) + + return pa.Table.from_pylist(history, schema=history_schema) + + def files(self, snapshot_id: Optional[int] = None) -> "pa.Table": import pyarrow as pa from pyiceberg.io.pyarrow import schema_to_pyarrow @@ -3652,40 +4011,13 @@ class TablePartition: arrow_table_partition: pa.Table -def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> 
dict[str, Any]: - order = 'ascending' if not reverse else 'descending' - null_placement = 'at_start' if reverse else 'at_end' - return {'sort_keys': [(column_name, order) for column_name in partition_columns], 'null_placement': null_placement} - - -def group_by_partition_scheme(arrow_table: pa.Table, partition_columns: list[str]) -> pa.Table: - """Given a table, sort it by current partition scheme.""" - # only works for identity for now - sort_options = _get_partition_sort_order(partition_columns, reverse=False) - sorted_arrow_table = arrow_table.sort_by(sorting=sort_options['sort_keys'], null_placement=sort_options['null_placement']) - return sorted_arrow_table - - -def get_partition_columns( - spec: PartitionSpec, - schema: Schema, -) -> list[str]: - partition_cols = [] - for partition_field in spec.fields: - column_name = schema.find_column_name(partition_field.source_id) - if not column_name: - raise ValueError(f"{partition_field=} could not be found in {schema}.") - partition_cols.append(column_name) - return partition_cols - - def _get_table_partitions( arrow_table: pa.Table, partition_spec: PartitionSpec, schema: Schema, slice_instructions: list[dict[str, Any]], ) -> list[TablePartition]: - sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x['offset']) + sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x["offset"]) partition_fields = partition_spec.fields @@ -3733,13 +4065,30 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T """ import pyarrow as pa - partition_columns = get_partition_columns(spec=spec, schema=schema) - arrow_table = group_by_partition_scheme(arrow_table, partition_columns) - - reversing_sort_order_options = _get_partition_sort_order(partition_columns, reverse=True) - reversed_indices = pa.compute.sort_indices(arrow_table, **reversing_sort_order_options).to_pylist() - - slice_instructions: list[dict[str, Any]] = [] + partition_columns: List[Tuple[PartitionField, NestedField]] = [ + (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields + ] + partition_values_table = pa.table({ + str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) + for partition, field in partition_columns + }) + + # Sort by partitions + sort_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "ascending") for col in partition_values_table.column_names], + null_placement="at_end", + ).to_pylist() + arrow_table = arrow_table.take(sort_indices) + + # Get slice_instructions to group by partitions + partition_values_table = partition_values_table.take(sort_indices) + reversed_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "descending") for col in partition_values_table.column_names], + null_placement="at_start", + ).to_pylist() + slice_instructions: List[Dict[str, Any]] = [] last = len(reversed_indices) reversed_indices_size = len(reversed_indices) ptr = 0 @@ -3750,6 +4099,6 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T last = reversed_indices[ptr] ptr = ptr + group_size - table_partitions: list[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) + table_partitions: List[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) return table_partitions diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index ba0c885758..1fea33010c 100644 --- 
a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -222,7 +222,7 @@ class TableMetadataCommonFields(IcebergBaseModel): current-snapshot-id even if the refs map is null.""" # validators - @field_validator('properties', mode='before') + @field_validator("properties", mode="before") def transform_properties_dict_value_to_str(cls, properties: Properties) -> Dict[str, str]: return transform_dict_value_to_str(properties) @@ -305,12 +305,19 @@ def sort_order_by_id(self, sort_order_id: int) -> Optional[SortOrder]: """Get the sort order by sort_order_id.""" return next((sort_order for sort_order in self.sort_orders if sort_order.order_id == sort_order_id), None) - @field_serializer('current_snapshot_id') + @field_serializer("current_snapshot_id") def serialize_current_snapshot_id(self, current_snapshot_id: Optional[int]) -> Optional[int]: if current_snapshot_id is None and Config().get_bool("legacy-current-snapshot-id"): return -1 return current_snapshot_id + @field_serializer("snapshots") + def serialize_snapshots(self, snapshots: List[Snapshot]) -> List[Snapshot]: + # Snapshot field `sequence-number` should not be written for v1 metadata + if self.format_version == 1: + return [snapshot.model_copy(update={"sequence_number": None}) for snapshot in snapshots] + return snapshots + def _generate_snapshot_id() -> int: """Generate a new Snapshot ID from a UUID. @@ -319,7 +326,7 @@ def _generate_snapshot_id() -> int: """ rnd_uuid = uuid.uuid4() snapshot_id = int.from_bytes( - bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder='little', signed=True + bytes(lhs ^ rhs for lhs, rhs in zip(rnd_uuid.bytes[0:8], rnd_uuid.bytes[8:16])), byteorder="little", signed=True ) snapshot_id = snapshot_id if snapshot_id >= 0 else snapshot_id * -1 diff --git a/pyiceberg/table/name_mapping.py b/pyiceberg/table/name_mapping.py index baa15f168d..5a4e769003 100644 --- a/pyiceberg/table/name_mapping.py +++ b/pyiceberg/table/name_mapping.py @@ -40,12 +40,12 @@ class MappedField(IcebergBaseModel): names: List[str] = conlist(str, min_length=1) fields: List[MappedField] = Field(default_factory=list) - @field_validator('fields', mode='before') + @field_validator("fields", mode="before") @classmethod def convert_null_to_empty_List(cls, v: Any) -> Any: return v or [] - @field_validator('names', mode='after') + @field_validator("names", mode="after") @classmethod def check_at_least_one(cls, v: List[str]) -> Any: """ @@ -60,10 +60,10 @@ def check_at_least_one(cls, v: List[str]) -> Any: @model_serializer def ser_model(self) -> Dict[str, Any]: """Set custom serializer to leave out the field when it is empty.""" - fields = {'fields': self.fields} if len(self.fields) > 0 else {} + fields = {"fields": self.fields} if len(self.fields) > 0 else {} return { - 'field-id': self.field_id, - 'names': self.names, + "field-id": self.field_id, + "names": self.names, **fields, } @@ -87,7 +87,7 @@ def _field_by_name(self) -> Dict[str, MappedField]: return visit_name_mapping(self, _IndexByName()) def find(self, *names: str) -> MappedField: - name = '.'.join(names) + name = ".".join(names) try: return self._field_by_name[name] except KeyError as e: @@ -109,7 +109,7 @@ def __str__(self) -> str: return "[\n " + "\n ".join([str(e) for e in self.root]) + "\n]" -S = TypeVar('S') +S = TypeVar("S") T = TypeVar("T") diff --git a/pyiceberg/table/refs.py b/pyiceberg/table/refs.py index df18fadd31..d87a319a16 100644 --- a/pyiceberg/table/refs.py +++ b/pyiceberg/table/refs.py @@ -46,14 +46,14 @@ class 
SnapshotRef(IcebergBaseModel): max_snapshot_age_ms: Annotated[Optional[int], Field(alias="max-snapshot-age-ms", default=None, gt=0)] max_ref_age_ms: Annotated[Optional[int], Field(alias="max-ref-age-ms", default=None, gt=0)] - @model_validator(mode='after') - def check_min_snapshots_to_keep(self) -> 'SnapshotRef': + @model_validator(mode="after") + def check_min_snapshots_to_keep(self) -> "SnapshotRef": if self.min_snapshots_to_keep is not None and self.snapshot_ref_type == SnapshotRefType.TAG: raise ValidationError("Tags do not support setting minSnapshotsToKeep") return self - @model_validator(mode='after') - def check_max_snapshot_age_ms(self) -> 'SnapshotRef': + @model_validator(mode="after") + def check_max_snapshot_age_ms(self) -> "SnapshotRef": if self.max_snapshot_age_ms is not None and self.snapshot_ref_type == SnapshotRefType.TAG: raise ValidationError("Tags do not support setting maxSnapshotAgeMs") return self diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index f74ac4b7d4..842d42522a 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -14,10 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from __future__ import annotations + import time from collections import defaultdict from enum import Enum -from typing import Any, DefaultDict, Dict, List, Mapping, Optional +from typing import TYPE_CHECKING, Any, DefaultDict, Dict, Iterable, List, Mapping, Optional from pydantic import Field, PrivateAttr, model_serializer @@ -25,34 +27,39 @@ from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, read_manifest_list from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema + +if TYPE_CHECKING: + from pyiceberg.table.metadata import TableMetadata from pyiceberg.typedef import IcebergBaseModel -ADDED_DATA_FILES = 'added-data-files' -ADDED_DELETE_FILES = 'added-delete-files' -ADDED_EQUALITY_DELETES = 'added-equality-deletes' -ADDED_FILE_SIZE = 'added-files-size' -ADDED_POSITION_DELETES = 'added-position-deletes' -ADDED_POSITION_DELETE_FILES = 'added-position-delete-files' -ADDED_RECORDS = 'added-records' -DELETED_DATA_FILES = 'deleted-data-files' -DELETED_RECORDS = 'deleted-records' -ADDED_EQUALITY_DELETE_FILES = 'added-equality-delete-files' -REMOVED_DELETE_FILES = 'removed-delete-files' -REMOVED_EQUALITY_DELETES = 'removed-equality-deletes' -REMOVED_EQUALITY_DELETE_FILES = 'removed-equality-delete-files' -REMOVED_FILE_SIZE = 'removed-files-size' -REMOVED_POSITION_DELETES = 'removed-position-deletes' -REMOVED_POSITION_DELETE_FILES = 'removed-position-delete-files' -TOTAL_EQUALITY_DELETES = 'total-equality-deletes' -TOTAL_POSITION_DELETES = 'total-position-deletes' -TOTAL_DATA_FILES = 'total-data-files' -TOTAL_DELETE_FILES = 'total-delete-files' -TOTAL_RECORDS = 'total-records' -TOTAL_FILE_SIZE = 'total-files-size' -CHANGED_PARTITION_COUNT_PROP = 'changed-partition-count' +ADDED_DATA_FILES = "added-data-files" +ADDED_DELETE_FILES = "added-delete-files" +ADDED_EQUALITY_DELETES = "added-equality-deletes" +ADDED_FILE_SIZE = "added-files-size" +ADDED_POSITION_DELETES = "added-position-deletes" +ADDED_POSITION_DELETE_FILES = "added-position-delete-files" +ADDED_RECORDS = "added-records" +DELETED_DATA_FILES = "deleted-data-files" +DELETED_RECORDS = "deleted-records" +ADDED_EQUALITY_DELETE_FILES = "added-equality-delete-files" +REMOVED_DELETE_FILES = "removed-delete-files" 
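For reference, the `SnapshotRef` model whose validators appear above is normally built from the hyphenated keys stored in table metadata. A small sketch with placeholder values:

```python
from pyiceberg.table.refs import SnapshotRef, SnapshotRefType

ref = SnapshotRef.model_validate({
    "snapshot-id": 3051729675574597004,  # placeholder snapshot id
    "type": "branch",
    "min-snapshots-to-keep": 10,
    "max-snapshot-age-ms": 604_800_000,  # 7 days
})
assert ref.snapshot_ref_type == SnapshotRefType.BRANCH
# The model validators above reject tag refs that set these branch-only retention fields.
```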
+REMOVED_EQUALITY_DELETES = "removed-equality-deletes" +REMOVED_EQUALITY_DELETE_FILES = "removed-equality-delete-files" +REMOVED_FILE_SIZE = "removed-files-size" +REMOVED_POSITION_DELETES = "removed-position-deletes" +REMOVED_POSITION_DELETE_FILES = "removed-position-delete-files" +TOTAL_EQUALITY_DELETES = "total-equality-deletes" +TOTAL_POSITION_DELETES = "total-position-deletes" +TOTAL_DATA_FILES = "total-data-files" +TOTAL_DELETE_FILES = "total-delete-files" +TOTAL_RECORDS = "total-records" +TOTAL_FILE_SIZE = "total-files-size" +CHANGED_PARTITION_COUNT_PROP = "changed-partition-count" CHANGED_PARTITION_PREFIX = "partitions." OPERATION = "operation" +INITIAL_SEQUENCE_NUMBER = 0 + class Operation(Enum): """Describes the operation. @@ -181,14 +188,14 @@ def __init__(self, operation: Operation, **data: Any) -> None: def __getitem__(self, __key: str) -> Optional[Any]: # type: ignore """Return a key as it is a map.""" - if __key.lower() == 'operation': + if __key.lower() == "operation": return self.operation else: return self._additional_properties.get(__key) def __setitem__(self, key: str, value: Any) -> None: """Set a key as it is a map.""" - if key.lower() == 'operation': + if key.lower() == "operation": self.operation = value else: self._additional_properties[key] = value @@ -226,7 +233,7 @@ def __eq__(self, other: Any) -> bool: class Snapshot(IcebergBaseModel): snapshot_id: int = Field(alias="snapshot-id") parent_snapshot_id: Optional[int] = Field(alias="parent-snapshot-id", default=None) - sequence_number: Optional[int] = Field(alias="sequence-number", default=None) + sequence_number: Optional[int] = Field(alias="sequence-number", default=INITIAL_SEQUENCE_NUMBER) timestamp_ms: int = Field(alias="timestamp-ms", default_factory=lambda: int(time.time() * 1000)) manifest_list: Optional[str] = Field( alias="manifest-list", description="Location of the snapshot's manifest list file", default=None @@ -274,14 +281,14 @@ def set_partition_summary_limit(self, limit: int) -> None: def add_file(self, data_file: DataFile, schema: Schema, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC) -> None: self.metrics.add_file(data_file) - if len(data_file.partition.record_fields()) != 0: + if len(data_file.partition) > 0: self.update_partition_metrics(partition_spec=partition_spec, file=data_file, is_add_file=True, schema=schema) def remove_file( self, data_file: DataFile, schema: Schema, partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC ) -> None: self.metrics.remove_file(data_file) - if len(data_file.partition.record_fields()) != 0: + if len(data_file.partition) > 0: self.update_partition_metrics(partition_spec=partition_spec, file=data_file, is_add_file=False, schema=schema) def update_partition_metrics(self, partition_spec: PartitionSpec, file: DataFile, is_add_file: bool, schema: Schema) -> None: @@ -317,10 +324,10 @@ def _truncate_table_summary(summary: Summary, previous_summary: Mapping[str, str TOTAL_POSITION_DELETES, TOTAL_EQUALITY_DELETES, }: - summary[prop] = '0' + summary[prop] = "0" def get_prop(prop: str) -> int: - value = previous_summary.get(prop) or '0' + value = previous_summary.get(prop) or "0" try: return int(value) except ValueError as e: @@ -353,12 +360,12 @@ def update_snapshot_summaries( if not previous_summary: previous_summary = { - TOTAL_DATA_FILES: '0', - TOTAL_DELETE_FILES: '0', - TOTAL_RECORDS: '0', - TOTAL_FILE_SIZE: '0', - TOTAL_POSITION_DELETES: '0', - TOTAL_EQUALITY_DELETES: '0', + TOTAL_DATA_FILES: "0", + TOTAL_DELETE_FILES: "0", + TOTAL_RECORDS: "0", 
+ TOTAL_FILE_SIZE: "0", + TOTAL_POSITION_DELETES: "0", + TOTAL_EQUALITY_DELETES: "0", } def _update_totals(total_property: str, added_property: str, removed_property: str) -> None: @@ -412,3 +419,13 @@ def _update_totals(total_property: str, added_property: str, removed_property: s def set_when_positive(properties: Dict[str, str], num: int, property_name: str) -> None: if num > 0: properties[property_name] = str(num) + + +def ancestors_of(current_snapshot: Optional[Snapshot], table_metadata: TableMetadata) -> Iterable[Snapshot]: + """Get the ancestors of and including the given snapshot.""" + snapshot = current_snapshot + while snapshot is not None: + yield snapshot + if snapshot.parent_snapshot_id is None: + break + snapshot = table_metadata.snapshot_by_id(snapshot.parent_snapshot_id) diff --git a/pyiceberg/transforms.py b/pyiceberg/transforms.py index 6dcae59e49..38cc6221a2 100644 --- a/pyiceberg/transforms.py +++ b/pyiceberg/transforms.py @@ -20,7 +20,7 @@ from abc import ABC, abstractmethod from enum import IntEnum from functools import singledispatch -from typing import Any, Callable, Generic, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar from typing import Literal as LiteralType from uuid import UUID @@ -82,6 +82,9 @@ from pyiceberg.utils.parsing import ParseNumberFromBrackets from pyiceberg.utils.singleton import Singleton +if TYPE_CHECKING: + import pyarrow as pa + S = TypeVar("S") T = TypeVar("T") @@ -175,6 +178,13 @@ def __eq__(self, other: Any) -> bool: return self.root == other.root return False + @property + def supports_pyarrow_transform(self) -> bool: + return False + + @abstractmethod + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": ... + class BucketTransform(Transform[S, int]): """Base Transform class to transform a value into a bucket partition value. @@ -290,6 +300,9 @@ def __repr__(self) -> str: """Return the string representation of the BucketTransform class.""" return f"BucketTransform(num_buckets={self._num_buckets})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class TimeResolution(IntEnum): YEAR = 6 @@ -349,6 +362,10 @@ def dedup_name(self) -> str: def preserves_order(self) -> bool: return True + @property + def supports_pyarrow_transform(self) -> bool: + return True + class YearTransform(TimeTransform[S]): """Transforms a datetime value into a year value. @@ -391,6 +408,21 @@ def __repr__(self) -> str: """Return the string representation of the YearTransform class.""" return "YearTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply year transform for type: {source}") + + return lambda v: pc.years_between(pa.scalar(epoch), v) if v is not None else None + class MonthTransform(TimeTransform[S]): """Transforms a datetime value into a month value. 
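A quick check of the year transform defined above, applied to a PyArrow date array; the dates are arbitrary examples and the expected output follows Iceberg's years-since-1970 convention:

```python
from datetime import date

import pyarrow as pa

from pyiceberg.transforms import YearTransform
from pyiceberg.types import DateType

arr = pa.array([date(1970, 3, 1), date(2024, 6, 18)])
year_fn = YearTransform().pyarrow_transform(DateType())
print(year_fn(arr))  # years since 1970-01-01: 0 and 54
```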
@@ -433,6 +465,27 @@ def __repr__(self) -> str: """Return the string representation of the MonthTransform class.""" return "MonthTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply month transform for type: {source}") + + def month_func(v: pa.Array) -> pa.Array: + return pc.add( + pc.multiply(pc.years_between(pa.scalar(epoch), v), pa.scalar(12)), + pc.add(pc.month(v), pa.scalar(-1)), + ) + + return lambda v: month_func(v) if v is not None else None + class DayTransform(TimeTransform[S]): """Transforms a datetime value into a day value. @@ -478,6 +531,21 @@ def __repr__(self) -> str: """Return the string representation of the DayTransform class.""" return "DayTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, DateType): + epoch = datetime.EPOCH_DATE + elif isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply day transform for type: {source}") + + return lambda v: pc.days_between(pa.scalar(epoch), v) if v is not None else None + class HourTransform(TimeTransform[S]): """Transforms a datetime value into a hour value. @@ -515,6 +583,19 @@ def __repr__(self) -> str: """Return the string representation of the HourTransform class.""" return "HourTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + import pyarrow as pa + import pyarrow.compute as pc + + if isinstance(source, TimestampType): + epoch = datetime.EPOCH_TIMESTAMP + elif isinstance(source, TimestamptzType): + epoch = datetime.EPOCH_TIMESTAMPTZ + else: + raise ValueError(f"Cannot apply hour transform for type: {source}") + + return lambda v: pc.hours_between(pa.scalar(epoch), v) if v is not None else None + def _base64encode(buffer: bytes) -> str: """Convert bytes to base64 string.""" @@ -585,6 +666,13 @@ def __repr__(self) -> str: """Return the string representation of the IdentityTransform class.""" return "IdentityTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + return lambda v: v + + @property + def supports_pyarrow_transform(self) -> bool: + return True + class TruncateTransform(Transform[S, S]): """A transform for truncating a value to a specified width. 
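The month transform above combines the year difference with the month component (years * 12 + month - 1). A small sketch against PyArrow timestamps, with arbitrary example values:

```python
from datetime import datetime

import pyarrow as pa

from pyiceberg.transforms import MonthTransform
from pyiceberg.types import TimestampType

arr = pa.array([datetime(1970, 2, 1), datetime(2024, 6, 18)], type=pa.timestamp("us"))
month_fn = MonthTransform().pyarrow_transform(TimestampType())
print(month_fn(arr))  # months since 1970-01: 1 and 653
```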
@@ -725,6 +813,9 @@ def __repr__(self) -> str: """Return the string representation of the TruncateTransform class.""" return f"TruncateTransform(width={self._width})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + @singledispatch def _human_string(value: Any, _type: IcebergType) -> str: @@ -807,6 +898,9 @@ def __repr__(self) -> str: """Return the string representation of the UnknownTransform class.""" return f"UnknownTransform(transform={repr(self._transform)})" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + class VoidTransform(Transform[S, None], Singleton): """A transform that always returns None.""" @@ -835,6 +929,9 @@ def __repr__(self) -> str: """Return the string representation of the VoidTransform class.""" return "VoidTransform()" + def pyarrow_transform(self, source: IcebergType) -> "Callable[[pa.Array], pa.Array]": + raise NotImplementedError() + def _truncate_number( name: str, pred: BoundLiteralPredicate[L], transform: Callable[[Optional[L]], Optional[L]] diff --git a/pyiceberg/typedef.py b/pyiceberg/typedef.py index 6ccf9526ba..2ff123148b 100644 --- a/pyiceberg/typedef.py +++ b/pyiceberg/typedef.py @@ -25,7 +25,6 @@ Callable, Dict, Generic, - List, Literal, Optional, Protocol, @@ -53,7 +52,7 @@ def update(self, *args: Any, **kwargs: Any) -> None: raise AttributeError("FrozenDict does not support .update()") -UTF8 = 'utf-8' +UTF8 = "utf-8" EMPTY_DICT = FrozenDict() @@ -198,9 +197,9 @@ def __repr__(self) -> str: """Return the string representation of the Record class.""" return f"{self.__class__.__name__}[{', '.join(f'{key}={repr(value)}' for key, value in self.__dict__.items() if not key.startswith('_'))}]" - def record_fields(self) -> List[str]: - """Return values of all the fields of the Record class except those specified in skip_fields.""" - return [self.__getattribute__(v) if hasattr(self, v) else None for v in self._position_to_field_name] + def __len__(self) -> int: + """Return the number of fields in the Record class.""" + return len(self._position_to_field_name) def __hash__(self) -> int: """Return hash value of the Record class.""" diff --git a/pyiceberg/utils/config.py b/pyiceberg/utils/config.py index 8b1b81d3a7..5eb9cfaa66 100644 --- a/pyiceberg/utils/config.py +++ b/pyiceberg/utils/config.py @@ -127,7 +127,7 @@ def set_property(_config: RecursiveDict, path: List[str], config_value: str) -> if env_var_lower.startswith(PYICEBERG.lower()): key = env_var_lower[len(PYICEBERG) :] parts = key.split("__", maxsplit=2) - parts_normalized = [part.replace('__', '.').replace("_", "-") for part in parts] + parts_normalized = [part.replace("__", ".").replace("_", "-") for part in parts] set_property(config, parts_normalized, config_value) return config diff --git a/pyproject.toml b/pyproject.toml index fafa5231a2..fe8fe4ed0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ zstandard = ">=0.13.0,<1.0.0" tenacity = ">=8.2.3,<9.0.0" pyarrow = { version = ">=9.0.0,<17.0.0", optional = true } pandas = { version = ">=1.0.0,<3.0.0", optional = true } -duckdb = { version = ">=0.5.0,<1.0.0", optional = true } +duckdb = { version = ">=0.5.0,<2.0.0", optional = true } ray = { version = ">=2.0.0,<2.10.0", optional = true } python-snappy = { version = ">=0.6.0,<1.0.0", optional = true } thrift = { version = ">=0.13.0,<1.0.0", optional = true } @@ -82,7 +82,7 @@ fastavro = "1.9.4" coverage = { version = "^7.4.2", extras = ["toml"] 
} requests-mock = "1.12.1" moto = { version = "^5.0.2", extras = ["server"] } -typing-extensions = "4.11.0" +typing-extensions = "4.12.2" pytest-mock = "3.14.0" pyspark = "3.5.1" cython = "3.0.10" diff --git a/ruff.toml b/ruff.toml index 92fb9a9c80..caaa108c84 100644 --- a/ruff.toml +++ b/ruff.toml @@ -80,4 +80,4 @@ known-first-party = ["pyiceberg", "tests"] section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"] [format] -quote-style = "preserve" +quote-style = "double" diff --git a/tests/avro/test_file.py b/tests/avro/test_file.py index 0809f56fea..4df132304c 100644 --- a/tests/avro/test_file.py +++ b/tests/avro/test_file.py @@ -173,13 +173,13 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None: v2_entry = todict(entry) # These are not written in V1 - del v2_entry['data_sequence_number'] - del v2_entry['file_sequence_number'] - del v2_entry['data_file']['content'] - del v2_entry['data_file']['equality_ids'] + del v2_entry["data_sequence_number"] + del v2_entry["file_sequence_number"] + del v2_entry["data_file"]["content"] + del v2_entry["data_file"]["equality_ids"] # Required in V1 - v2_entry['data_file']['block_size_in_bytes'] = DEFAULT_BLOCK_SIZE + v2_entry["data_file"]["block_size_in_bytes"] = DEFAULT_BLOCK_SIZE assert v2_entry == fa_entry diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 5b4aa58787..21c415212a 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -484,7 +484,7 @@ def test_commit_table_properties( updated_table_metadata = table.metadata assert MetastoreCatalog._parse_metadata_version(table.metadata_location) == 1 - assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"} + assert updated_table_metadata.properties == {"Description": "test_description", "test_a": "test_aa", "test_c": "test_c"} table_info = glue.get_table( DatabaseName=database_name, @@ -570,3 +570,19 @@ def test_table_exists(test_catalog: Catalog, table_schema_nested: Schema, table_ test_catalog.create_namespace(database_name) test_catalog.create_table((database_name, table_name), table_schema_nested) assert test_catalog.table_exists((database_name, table_name)) is True + + +def test_register_table_with_given_location( + test_catalog: Catalog, table_schema_nested: Schema, table_name: str, database_name: str +) -> None: + identifier = (database_name, table_name) + new_identifier = (database_name, f"new_{table_name}") + test_catalog.create_namespace(database_name) + tbl = test_catalog.create_table(identifier, table_schema_nested) + location = tbl.metadata_location + test_catalog.drop_table(identifier) # drops the table but keeps the metadata file + assert not test_catalog.table_exists(identifier) + table = test_catalog.register_table(new_identifier, location) + assert table.identifier == (CATALOG_NAME,) + new_identifier + assert table.metadata_location == location + assert test_catalog.table_exists(new_identifier) diff --git a/tests/catalog/test_dynamodb.py b/tests/catalog/test_dynamodb.py index f4b16d343b..7ad1301d9d 100644 --- a/tests/catalog/test_dynamodb.py +++ b/tests/catalog/test_dynamodb.py @@ -569,10 +569,10 @@ def test_passing_provided_profile() -> None: } props = {"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"} props.update(session_props) # type: ignore - with mock.patch('boto3.Session', return_value=mock.Mock()) as mock_session: + with mock.patch("boto3.Session", 
return_value=mock.Mock()) as mock_session: mock_client = mock.Mock() mock_session.return_value.client.return_value = mock_client - mock_client.describe_table.return_value = {'Table': {'TableStatus': 'ACTIVE'}} + mock_client.describe_table.return_value = {"Table": {"TableStatus": "ACTIVE"}} test_catalog = DynamoDbCatalog(catalog_name, **props) assert test_catalog.dynamodb is mock_client mock_session.assert_called_with(**session_props) @@ -590,4 +590,4 @@ def test_table_exists( # Act and Assert for an existing table assert test_catalog.table_exists(identifier) is True # Act and Assert for an non-existing table - assert test_catalog.table_exists(('non', 'exist')) is False + assert test_catalog.table_exists(("non", "exist")) is False diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 5b67b92c68..6b57f1dfe6 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -715,7 +715,7 @@ def test_commit_table_properties( updated_table_metadata = table.metadata assert test_catalog._parse_metadata_version(table.metadata_location) == 1 - assert updated_table_metadata.properties == {'Description': 'test_description', "test_a": "test_aa", "test_c": "test_c"} + assert updated_table_metadata.properties == {"Description": "test_description", "test_a": "test_aa", "test_c": "test_c"} table_info = _glue.get_table( DatabaseName=database_name, @@ -847,4 +847,18 @@ def test_table_exists( # Act and Assert for an existing table assert test_catalog.table_exists(identifier) is True # Act and Assert for a non-existing table - assert test_catalog.table_exists(('non', 'exist')) is False + assert test_catalog.table_exists(("non", "exist")) is False + + +@mock_aws +def test_register_table_with_given_location( + _bucket_initialize: None, moto_endpoint_url: str, metadata_location: str, database_name: str, table_name: str +) -> None: + catalog_name = "glue" + identifier = (database_name, table_name) + location = metadata_location + test_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) + test_catalog.create_namespace(namespace=database_name, properties={"location": f"s3://{BUCKET_NAME}/{database_name}.db"}) + table = test_catalog.register_table(identifier, location) + assert table.identifier == (catalog_name,) + identifier + assert test_catalog.table_exists(identifier) is True diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index ef662b3aff..96e95815be 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -234,27 +234,27 @@ def test_create_table( retention=None, sd=StorageDescriptor( cols=[ - FieldSchema(name='boolean', type='boolean', comment=None), - FieldSchema(name='integer', type='int', comment=None), - FieldSchema(name='long', type='bigint', comment=None), - FieldSchema(name='float', type='float', comment=None), - FieldSchema(name='double', type='double', comment=None), - FieldSchema(name='decimal', type='decimal(32,3)', comment=None), - FieldSchema(name='date', type='date', comment=None), - FieldSchema(name='time', type='string', comment=None), - FieldSchema(name='timestamp', type='timestamp', comment=None), + FieldSchema(name="boolean", type="boolean", comment=None), + FieldSchema(name="integer", type="int", comment=None), + FieldSchema(name="long", type="bigint", comment=None), + FieldSchema(name="float", type="float", comment=None), + FieldSchema(name="double", type="double", comment=None), + FieldSchema(name="decimal", type="decimal(32,3)", comment=None), + 
FieldSchema(name="date", type="date", comment=None), + FieldSchema(name="time", type="string", comment=None), + FieldSchema(name="timestamp", type="timestamp", comment=None), FieldSchema( - name='timestamptz', - type='timestamp' if hive2_compatible else 'timestamp with local time zone', + name="timestamptz", + type="timestamp" if hive2_compatible else "timestamp with local time zone", comment=None, ), - FieldSchema(name='string', type='string', comment=None), - FieldSchema(name='uuid', type='string', comment=None), - FieldSchema(name='fixed', type='binary', comment=None), - FieldSchema(name='binary', type='binary', comment=None), - FieldSchema(name='list', type='array', comment=None), - FieldSchema(name='map', type='map', comment=None), - FieldSchema(name='struct', type='struct', comment=None), + FieldSchema(name="string", type="string", comment=None), + FieldSchema(name="uuid", type="string", comment=None), + FieldSchema(name="fixed", type="binary", comment=None), + FieldSchema(name="binary", type="binary", comment=None), + FieldSchema(name="list", type="array", comment=None), + FieldSchema(name="map", type="map", comment=None), + FieldSchema(name="struct", type="struct", comment=None), ], location=f"{hive_database.locationUri}/table", inputFormat="org.apache.hadoop.mapred.FileInputFormat", @@ -314,40 +314,40 @@ def test_create_table( last_column_id=22, schemas=[ Schema( - NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True), - NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='long', field_type=LongType(), required=True), - NestedField(field_id=4, name='float', field_type=FloatType(), required=True), - NestedField(field_id=5, name='double', field_type=DoubleType(), required=True), - NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True), - NestedField(field_id=7, name='date', field_type=DateType(), required=True), - NestedField(field_id=8, name='time', field_type=TimeType(), required=True), - NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True), - NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True), - NestedField(field_id=11, name='string', field_type=StringType(), required=True), - NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True), - NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True), - NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True), + NestedField(field_id=1, name="boolean", field_type=BooleanType(), required=True), + NestedField(field_id=2, name="integer", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="long", field_type=LongType(), required=True), + NestedField(field_id=4, name="float", field_type=FloatType(), required=True), + NestedField(field_id=5, name="double", field_type=DoubleType(), required=True), + NestedField(field_id=6, name="decimal", field_type=DecimalType(precision=32, scale=3), required=True), + NestedField(field_id=7, name="date", field_type=DateType(), required=True), + NestedField(field_id=8, name="time", field_type=TimeType(), required=True), + NestedField(field_id=9, name="timestamp", field_type=TimestampType(), required=True), + NestedField(field_id=10, name="timestamptz", field_type=TimestamptzType(), required=True), + NestedField(field_id=11, name="string", field_type=StringType(), required=True), + NestedField(field_id=12, name="uuid", 
field_type=UUIDType(), required=True), + NestedField(field_id=13, name="fixed", field_type=FixedType(length=12), required=True), + NestedField(field_id=14, name="binary", field_type=BinaryType(), required=True), NestedField( field_id=15, - name='list', - field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True), + name="list", + field_type=ListType(type="list", element_id=18, element_type=StringType(), element_required=True), required=True, ), NestedField( field_id=16, - name='map', + name="map", field_type=MapType( - type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True + type="map", key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True ), required=True, ), NestedField( field_id=17, - name='struct', + name="struct", field_type=StructType( - NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False), - NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True), + NestedField(field_id=21, name="inner_string", field_type=StringType(), required=False), + NestedField(field_id=22, name="inner_int", field_type=IntegerType(), required=True), ), required=False, ), @@ -357,7 +357,7 @@ def test_create_table( ], current_schema_id=0, last_partition_id=999, - properties={"owner": "javaberg", 'write.parquet.compression-codec': 'zstd'}, + properties={"owner": "javaberg", "write.parquet.compression-codec": "zstd"}, partition_specs=[PartitionSpec()], default_spec_id=0, current_snapshot_id=None, @@ -409,27 +409,27 @@ def test_create_table_with_given_location_removes_trailing_slash( retention=None, sd=StorageDescriptor( cols=[ - FieldSchema(name='boolean', type='boolean', comment=None), - FieldSchema(name='integer', type='int', comment=None), - FieldSchema(name='long', type='bigint', comment=None), - FieldSchema(name='float', type='float', comment=None), - FieldSchema(name='double', type='double', comment=None), - FieldSchema(name='decimal', type='decimal(32,3)', comment=None), - FieldSchema(name='date', type='date', comment=None), - FieldSchema(name='time', type='string', comment=None), - FieldSchema(name='timestamp', type='timestamp', comment=None), + FieldSchema(name="boolean", type="boolean", comment=None), + FieldSchema(name="integer", type="int", comment=None), + FieldSchema(name="long", type="bigint", comment=None), + FieldSchema(name="float", type="float", comment=None), + FieldSchema(name="double", type="double", comment=None), + FieldSchema(name="decimal", type="decimal(32,3)", comment=None), + FieldSchema(name="date", type="date", comment=None), + FieldSchema(name="time", type="string", comment=None), + FieldSchema(name="timestamp", type="timestamp", comment=None), FieldSchema( - name='timestamptz', - type='timestamp' if hive2_compatible else 'timestamp with local time zone', + name="timestamptz", + type="timestamp" if hive2_compatible else "timestamp with local time zone", comment=None, ), - FieldSchema(name='string', type='string', comment=None), - FieldSchema(name='uuid', type='string', comment=None), - FieldSchema(name='fixed', type='binary', comment=None), - FieldSchema(name='binary', type='binary', comment=None), - FieldSchema(name='list', type='array', comment=None), - FieldSchema(name='map', type='map', comment=None), - FieldSchema(name='struct', type='struct', comment=None), + FieldSchema(name="string", type="string", comment=None), + FieldSchema(name="uuid", type="string", comment=None), + 
FieldSchema(name="fixed", type="binary", comment=None), + FieldSchema(name="binary", type="binary", comment=None), + FieldSchema(name="list", type="array", comment=None), + FieldSchema(name="map", type="map", comment=None), + FieldSchema(name="struct", type="struct", comment=None), ], location=f"{hive_database.locationUri}/table-given-location", inputFormat="org.apache.hadoop.mapred.FileInputFormat", @@ -489,40 +489,40 @@ def test_create_table_with_given_location_removes_trailing_slash( last_column_id=22, schemas=[ Schema( - NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True), - NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='long', field_type=LongType(), required=True), - NestedField(field_id=4, name='float', field_type=FloatType(), required=True), - NestedField(field_id=5, name='double', field_type=DoubleType(), required=True), - NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True), - NestedField(field_id=7, name='date', field_type=DateType(), required=True), - NestedField(field_id=8, name='time', field_type=TimeType(), required=True), - NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True), - NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True), - NestedField(field_id=11, name='string', field_type=StringType(), required=True), - NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True), - NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True), - NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True), + NestedField(field_id=1, name="boolean", field_type=BooleanType(), required=True), + NestedField(field_id=2, name="integer", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="long", field_type=LongType(), required=True), + NestedField(field_id=4, name="float", field_type=FloatType(), required=True), + NestedField(field_id=5, name="double", field_type=DoubleType(), required=True), + NestedField(field_id=6, name="decimal", field_type=DecimalType(precision=32, scale=3), required=True), + NestedField(field_id=7, name="date", field_type=DateType(), required=True), + NestedField(field_id=8, name="time", field_type=TimeType(), required=True), + NestedField(field_id=9, name="timestamp", field_type=TimestampType(), required=True), + NestedField(field_id=10, name="timestamptz", field_type=TimestamptzType(), required=True), + NestedField(field_id=11, name="string", field_type=StringType(), required=True), + NestedField(field_id=12, name="uuid", field_type=UUIDType(), required=True), + NestedField(field_id=13, name="fixed", field_type=FixedType(length=12), required=True), + NestedField(field_id=14, name="binary", field_type=BinaryType(), required=True), NestedField( field_id=15, - name='list', - field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True), + name="list", + field_type=ListType(type="list", element_id=18, element_type=StringType(), element_required=True), required=True, ), NestedField( field_id=16, - name='map', + name="map", field_type=MapType( - type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True + type="map", key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True ), required=True, ), NestedField( field_id=17, - name='struct', + name="struct", 
field_type=StructType( - NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False), - NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True), + NestedField(field_id=21, name="inner_string", field_type=StringType(), required=False), + NestedField(field_id=22, name="inner_int", field_type=IntegerType(), required=True), ), required=False, ), @@ -532,7 +532,7 @@ def test_create_table_with_given_location_removes_trailing_slash( ], current_schema_id=0, last_partition_id=999, - properties={"owner": "javaberg", 'write.parquet.compression-codec': 'zstd'}, + properties={"owner": "javaberg", "write.parquet.compression-codec": "zstd"}, partition_specs=[PartitionSpec()], default_spec_id=0, current_snapshot_id=None, diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index ec5a6a22a4..b5c626d6f0 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -691,6 +691,16 @@ def test_table_exist_200(rest_mock: Mocker) -> None: assert catalog.table_exists(("fokko", "table")) +def test_table_exist_204(rest_mock: Mocker) -> None: + rest_mock.head( + f"{TEST_URI}v1/namespaces/fokko/tables/table", + status_code=204, + request_headers=TEST_HEADERS, + ) + catalog = RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN) + assert catalog.table_exists(("fokko", "table")) + + def test_table_exist_500(rest_mock: Mocker) -> None: rest_mock.head( f"{TEST_URI}v1/namespaces/fokko/tables/table", diff --git a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index efa7b746a9..24adfb88ab 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -17,7 +17,7 @@ import os from pathlib import Path -from typing import Generator, List +from typing import Any, Generator, List import pyarrow as pa import pytest @@ -25,6 +25,9 @@ from pytest_lazyfixture import lazy_fixture from sqlalchemy.exc import ArgumentError, IntegrityError +from pyiceberg.catalog import ( + Catalog, +) from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import ( CommitFailedException, @@ -52,51 +55,90 @@ from pyiceberg.types import IntegerType -@pytest.fixture(name="random_identifier") -def fixture_random_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: +@pytest.fixture(scope="module") +def catalog_name() -> str: + return "test_sql_catalog" + + +@pytest.fixture(name="random_table_identifier") +def fixture_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) return database_name, table_name -@pytest.fixture(name="another_random_identifier") -def fixture_another_random_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: +@pytest.fixture(name="random_table_identifier_with_catalog") +def fixture_random_table_identifier_with_catalog( + warehouse: Path, catalog_name: str, database_name: str, table_name: str +) -> Identifier: + os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + return catalog_name, database_name, table_name + + +@pytest.fixture(name="another_random_table_identifier") +def fixture_another_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier: database_name = database_name + "_new" table_name = table_name + "_new" os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) return database_name, table_name 
+@pytest.fixture(name="another_random_table_identifier_with_catalog") +def fixture_another_random_table_identifier_with_catalog( + warehouse: Path, catalog_name: str, database_name: str, table_name: str +) -> Identifier: + database_name = database_name + "_new" + table_name = table_name + "_new" + os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True) + return catalog_name, database_name, table_name + + +@pytest.fixture(name="random_hierarchical_identifier") +def fixture_random_hierarchical_identifier(warehouse: Path, hierarchical_namespace_name: str, table_name: str) -> Identifier: + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) + + +@pytest.fixture(name="another_random_hierarchical_identifier") +def fixture_another_random_hierarchical_identifier( + warehouse: Path, hierarchical_namespace_name: str, table_name: str +) -> Identifier: + hierarchical_namespace_name = hierarchical_namespace_name + "_new" + table_name = table_name + "_new" + os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True) + return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name))) + + @pytest.fixture(scope="module") -def catalog_memory(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_memory(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": "sqlite:///:memory:", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() @pytest.fixture(scope="module") -def catalog_sqlite(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() @pytest.fixture(scope="module") -def catalog_sqlite_without_rowcount(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.engine.dialect.supports_sane_rowcount = False catalog.create_tables() yield catalog @@ -104,33 +146,33 @@ def catalog_sqlite_without_rowcount(warehouse: Path) -> Generator[SqlCatalog, No @pytest.fixture(scope="module") -def catalog_sqlite_fsspec(warehouse: Path) -> Generator[SqlCatalog, None, None]: +def catalog_sqlite_fsspec(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]: props = { "uri": f"sqlite:////{warehouse}/sql-catalog.db", "warehouse": f"file://{warehouse}", PY_IO_IMPL: FSSPEC_FILE_IO, } - catalog = SqlCatalog("test_sql_catalog", **props) + catalog = SqlCatalog(catalog_name, **props) catalog.create_tables() yield catalog catalog.destroy_tables() -def test_creation_with_no_uri() -> None: +def test_creation_with_no_uri(catalog_name: str) -> None: with pytest.raises(NoSuchPropertyException): - SqlCatalog("test_ddb_catalog", not_uri="unused") + 
SqlCatalog(catalog_name, not_uri="unused") -def test_creation_with_unsupported_uri() -> None: +def test_creation_with_unsupported_uri(catalog_name: str) -> None: with pytest.raises(ArgumentError): - SqlCatalog("test_ddb_catalog", uri="unsupported:xxx") + SqlCatalog(catalog_name, uri="unsupported:xxx") @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_tables_idempotency(catalog: SqlCatalog) -> None: @@ -140,67 +182,102 @@ def test_create_tables_idempotency(catalog: SqlCatalog) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_table_default_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) assert table.sort_order().order_id == 0, "Order ID must match" assert table.sort_order().is_unsorted is True, "Order must be unsorted" - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested, properties={"format-version": "1"}) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_v1_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested, properties={"format-version": "1"}) assert table.sort_order().order_id == 0, "Order ID must match" assert table.sort_order().is_unsorted is True, "Order must be unsorted" assert table.format_version == 1 assert table.spec() == UNPARTITIONED_PARTITION_SPEC - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( 
+ "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) def test_create_table_with_pyarrow_schema( catalog: SqlCatalog, pyarrow_schema_simple_without_ids: pa.Schema, iceberg_table_schema_simple: Schema, - random_identifier: Identifier, + table_identifier: Identifier, ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, pyarrow_schema_simple_without_ids) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, pyarrow_schema_simple_without_ids) assert table.schema() == iceberg_table_schema_simple - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( + "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) -def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier) -> None: +def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier) -> None: import pyarrow as pa pyarrow_table = pa.Table.from_arrays( @@ -211,118 +288,165 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier pa.array([None, "A", "B", "C"]), # 'large' column ], schema=pa.schema([ - pa.field('foo', pa.string(), nullable=True), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), - pa.field('large', pa.large_string(), nullable=True), + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("large", pa.large_string(), nullable=True), ]), ) - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, pyarrow_table.schema) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, pyarrow_table.schema) table.overwrite(pyarrow_table) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_table_custom_sort_order(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = 
catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) order = SortOrder(SortField(source_id=2, transform=IdentityTransform(), null_order=NullOrder.NULLS_FIRST)) - table = catalog.create_table(random_identifier, table_schema_nested, sort_order=order) + table = catalog.create_table(table_identifier, table_schema_nested, sort_order=order) given_sort_order = table.sort_order() assert given_sort_order.order_id == 1, "Order ID must match" assert len(given_sort_order.fields) == 1, "Order must have 1 field" assert given_sort_order.fields[0].direction == SortDirection.ASC, "Direction must match" assert given_sort_order.fields[0].null_order == NullOrder.NULLS_FIRST, "Null order must match" assert isinstance(given_sort_order.fields[0].transform, IdentityTransform), "Transform must match" - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( + "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) def test_create_table_with_default_warehouse_location( - warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier + warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested) - table = catalog.load_table(random_identifier) - assert table.identifier == (catalog.name,) + random_identifier + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested) + table = catalog.load_table(table_identifier) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location.startswith(f"file://{warehouse}") assert os.path.exists(table.metadata_location[len("file://") :]) - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( + "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) def test_create_table_with_given_location_removes_trailing_slash( - warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier + warehouse: Path, catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, table_name = random_identifier - location = f"file://{warehouse}/{database_name}.db/{table_name}-given" - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested, location=f"{location}/") - table = catalog.load_table(random_identifier) - assert table.identifier == (catalog.name,) + random_identifier + 
table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + table_name = Catalog.table_name_from(table_identifier_nocatalog) + location = f"file://{warehouse}/{catalog.name}.db/{table_name}-given" + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/") + table = catalog.load_table(table_identifier) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location.startswith(f"file://{warehouse}") assert os.path.exists(table.metadata_location[len("file://") :]) assert table.location() == location - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_create_duplicated_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_nested) with pytest.raises(TableAlreadyExistsError): - catalog.create_table(random_identifier, table_schema_nested) + catalog.create_table(table_identifier, table_schema_nested) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( + "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) def test_create_table_if_not_exists_duplicated_table( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table1 = catalog.create_table(random_identifier, table_schema_nested) - table2 = catalog.create_table_if_not_exists(random_identifier, table_schema_nested) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table1 = catalog.create_table(table_identifier, table_schema_nested) + table2 = catalog.create_table_if_not_exists(table_identifier, table_schema_nested) assert table1.identifier == table2.identifier @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_table_with_non_existing_namespace(catalog: 
SqlCatalog, table_schema_nested: Schema, table_name: str) -> None: @@ -332,54 +456,72 @@ def test_create_table_with_non_existing_namespace(catalog: SqlCatalog, table_sch @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_table_without_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_name: str) -> None: - with pytest.raises(ValueError): + with pytest.raises(NoSuchNamespaceError): catalog.create_table(table_name, table_schema_nested) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_register_table(catalog: SqlCatalog, random_identifier: Identifier, metadata_location: str) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.register_table(random_identifier, metadata_location) - assert table.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_register_table(catalog: SqlCatalog, table_identifier: Identifier, metadata_location: str) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.register_table(table_identifier, metadata_location) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog assert table.metadata_location == metadata_location assert os.path.exists(metadata_location) - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_register_existing_table(catalog: SqlCatalog, random_identifier: Identifier, metadata_location: str) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.register_table(random_identifier, metadata_location) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_register_existing_table(catalog: SqlCatalog, table_identifier: Identifier, metadata_location: str) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.register_table(table_identifier, metadata_location) with pytest.raises(TableAlreadyExistsError): - catalog.register_table(random_identifier, metadata_location) + catalog.register_table(table_identifier, metadata_location) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_register_table_with_non_existing_namespace(catalog: SqlCatalog, metadata_location: str, table_name: str) -> None: @@ -389,10 +531,10 @@ def 
test_register_table_with_non_existing_namespace(catalog: SqlCatalog, metadat @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_register_table_without_namespace(catalog: SqlCatalog, metadata_location: str, table_name: str) -> None: @@ -401,35 +543,53 @@ def test_register_table_without_namespace(catalog: SqlCatalog, metadata_location @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - loaded_table = catalog.load_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_load_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + loaded_table = catalog.load_table(table_identifier) assert table.identifier == loaded_table.identifier assert table.metadata_location == loaded_table.metadata_location assert table.metadata == loaded_table.metadata @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - intermediate = catalog.load_table(random_identifier) - assert intermediate.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + intermediate = catalog.load_table(table_identifier) + assert intermediate.identifier == (catalog.name,) + table_identifier_nocatalog loaded_table = catalog.load_table(intermediate.identifier) assert table.identifier == loaded_table.identifier assert table.metadata_location == loaded_table.metadata_location @@ -437,207 +597,340 @@ def test_load_table_from_self_identifier(catalog: SqlCatalog, table_schema_neste @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - 
lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) -def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.drop_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog + catalog.drop_table(table_identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) -def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_table_from_self_identifier(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + table_identifier_nocatalog catalog.drop_table(table.identifier) with pytest.raises(NoSuchTableError): catalog.load_table(table.identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) -def test_drop_table_that_does_not_exist(catalog: SqlCatalog, random_identifier: 
Identifier) -> None: +def test_drop_table_that_does_not_exist(catalog: SqlCatalog, table_identifier: Identifier) -> None: with pytest.raises(NoSuchTableError): - catalog.drop_table(random_identifier) + catalog.drop_table(table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), ], ) def test_rename_table( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.rename_table(random_identifier, another_random_identifier) - new_table = catalog.load_table(another_random_identifier) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + catalog.rename_table(from_table_identifier, to_table_identifier) + new_table = catalog.load_table(to_table_identifier) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog assert new_table.metadata_location == table.metadata_location with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(from_table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "from_table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), ], ) def 
test_rename_table_from_self_identifier( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - catalog.rename_table(table.identifier, another_random_identifier) - new_table = catalog.load_table(another_random_identifier) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + catalog.rename_table(table.identifier, to_table_identifier) + new_table = catalog.load_table(to_table_identifier) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog assert new_table.metadata_location == table.metadata_location with pytest.raises(NoSuchTableError): catalog.load_table(table.identifier) with pytest.raises(NoSuchTableError): - catalog.load_table(random_identifier) + catalog.load_table(from_table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), ], ) def test_rename_table_to_existing_one( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(from_database_name) - catalog.create_namespace(to_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert table.identifier == (catalog.name,) + random_identifier - new_table = catalog.create_table(another_random_identifier, table_schema_nested) - assert new_table.identifier == (catalog.name,) + another_random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + 
to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + catalog.create_namespace(to_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog + new_table = catalog.create_table(to_table_identifier, table_schema_nested) + assert new_table.identifier == (catalog.name,) + to_table_identifier_nocatalog with pytest.raises(TableAlreadyExistsError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "from_table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) -def test_rename_missing_table(catalog: SqlCatalog, random_identifier: Identifier, another_random_identifier: Identifier) -> None: - to_database_name, _to_table_name = another_random_identifier - catalog.create_namespace(to_database_name) +@pytest.mark.parametrize( + "to_table_identifier", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), + ], +) +def test_rename_missing_table(catalog: SqlCatalog, from_table_identifier: Identifier, to_table_identifier: Identifier) -> None: + to_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(to_table_identifier) + to_namespace = Catalog.namespace_from(to_table_identifier_nocatalog) + catalog.create_namespace(to_namespace) with pytest.raises(NoSuchTableError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "from_table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "to_table_identifier", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), ], ) def test_rename_table_to_missing_namespace( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, from_table_identifier: Identifier, to_table_identifier: Identifier ) -> None: - from_database_name, _from_table_name = random_identifier - catalog.create_namespace(from_database_name) - table = catalog.create_table(random_identifier, table_schema_nested) - assert 
table.identifier == (catalog.name,) + random_identifier + from_table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(from_table_identifier) + from_namespace = Catalog.namespace_from(from_table_identifier_nocatalog) + catalog.create_namespace(from_namespace) + table = catalog.create_table(from_table_identifier, table_schema_nested) + assert table.identifier == (catalog.name,) + from_table_identifier_nocatalog with pytest.raises(NoSuchNamespaceError): - catalog.rename_table(random_identifier, another_random_identifier) + catalog.rename_table(from_table_identifier, to_table_identifier) @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize( + "table_identifier_1", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +@pytest.mark.parametrize( + "table_identifier_2", + [ + lazy_fixture("another_random_table_identifier"), + lazy_fixture("another_random_hierarchical_identifier"), + lazy_fixture("another_random_table_identifier_with_catalog"), ], ) def test_list_tables( - catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier, another_random_identifier: Identifier + catalog: SqlCatalog, table_schema_nested: Schema, table_identifier_1: Identifier, table_identifier_2: Identifier ) -> None: - database_name_1, _table_name_1 = random_identifier - database_name_2, _table_name_2 = another_random_identifier - catalog.create_namespace(database_name_1) - catalog.create_namespace(database_name_2) - catalog.create_table(random_identifier, table_schema_nested) - catalog.create_table(another_random_identifier, table_schema_nested) - identifier_list = catalog.list_tables(database_name_1) + table_identifier_1_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier_1) + table_identifier_2_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier_2) + namespace_1 = Catalog.namespace_from(table_identifier_1_nocatalog) + namespace_2 = Catalog.namespace_from(table_identifier_2_nocatalog) + catalog.create_namespace(namespace_1) + catalog.create_namespace(namespace_2) + catalog.create_table(table_identifier_1, table_schema_nested) + catalog.create_table(table_identifier_2, table_schema_nested) + identifier_list = catalog.list_tables(namespace_1) assert len(identifier_list) == 1 - assert random_identifier in identifier_list + assert table_identifier_1_nocatalog in identifier_list - identifier_list = catalog.list_tables(database_name_2) + identifier_list = catalog.list_tables(namespace_2) assert len(identifier_list) == 1 - assert another_random_identifier in identifier_list + assert table_identifier_2_nocatalog in identifier_list @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_namespace(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name) - assert (database_name,) in catalog.list_namespaces() +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_list_tables_when_missing_namespace(catalog: SqlCatalog, namespace: str) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.list_tables(namespace) 
@pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) -> None: @@ -648,89 +941,123 @@ def test_create_namespace_if_not_exists(catalog: SqlCatalog, database_name: str) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_duplicate_namespace(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespace(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) + assert (Catalog.identifier_to_tuple(namespace)) in catalog.list_namespaces() + + +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_duplicate_namespace(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) with pytest.raises(NamespaceAlreadyExistsError): - catalog.create_namespace(database_name) + catalog.create_namespace(namespace) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, database_name: str) -> None: - catalog.create_namespace(database_name + "_1") +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespaces_sharing_same_prefix(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace + "_1") # Second namespace is a prefix of the first one, make sure it can be added. 
- catalog.create_namespace(database_name) + catalog.create_namespace(namespace) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_create_namespace_with_comment_and_location(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_create_namespace_with_comment_and_location(catalog: SqlCatalog, namespace: str) -> None: test_location = "/test/location" test_properties = { "comment": "this is a test description", "location": test_location, } - catalog.create_namespace(namespace=database_name, properties=test_properties) + catalog.create_namespace(namespace=namespace, properties=test_properties) loaded_database_list = catalog.list_namespaces() - assert (database_name,) in loaded_database_list - properties = catalog.load_namespace_properties(database_name) + assert Catalog.identifier_to_tuple(namespace) in loaded_database_list + properties = catalog.load_namespace_properties(namespace) assert properties["comment"] == "this is a test description" assert properties["location"] == test_location @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) @pytest.mark.filterwarnings("ignore") -def test_create_namespace_with_null_properties(catalog: SqlCatalog, database_name: str) -> None: +def test_create_namespace_with_null_properties(catalog: SqlCatalog, namespace: str) -> None: with pytest.raises(IntegrityError): - catalog.create_namespace(namespace=database_name, properties={None: "value"}) # type: ignore + catalog.create_namespace(namespace=namespace, properties={None: "value"}) # type: ignore with pytest.raises(IntegrityError): - catalog.create_namespace(namespace=database_name, properties={"key": None}) + catalog.create_namespace(namespace=namespace, properties={"key": None}) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_list_namespaces(catalog: SqlCatalog, database_list: List[str]) -> None: - for database_name in database_list: - catalog.create_namespace(database_name) - db_list = catalog.list_namespaces() - for database_name in database_list: - assert (database_name,) in db_list - assert len(catalog.list_namespaces(database_name)) == 1 +@pytest.mark.parametrize("empty_namespace", ["", (), (""), ("", ""), " ", (" ")]) +def test_create_namespace_with_empty_identifier(catalog: SqlCatalog, empty_namespace: Any) -> None: + with pytest.raises(NoSuchNamespaceError): + catalog.create_namespace(empty_namespace) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize("namespace_list", [lazy_fixture("database_list"), lazy_fixture("hierarchical_namespace_list")]) +def test_list_namespaces(catalog: SqlCatalog, namespace_list: List[str]) -> None: + for namespace in namespace_list: + catalog.create_namespace(namespace) + # Test global list + ns_list = 
catalog.list_namespaces() + for namespace in namespace_list: + assert Catalog.identifier_to_tuple(namespace) in ns_list + # Test individual namespace list + assert len(one_namespace := catalog.list_namespaces(namespace)) == 1 + assert Catalog.identifier_to_tuple(namespace) == one_namespace[0] + + +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_list_non_existing_namespaces(catalog: SqlCatalog) -> None: @@ -739,66 +1066,77 @@ def test_list_non_existing_namespaces(catalog: SqlCatalog) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, table_name = random_identifier - catalog.create_namespace(database_name) - assert (database_name,) in catalog.list_namespaces() - catalog.create_table((database_name, table_name), table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_drop_namespace(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + assert namespace in catalog.list_namespaces() + catalog.create_table(table_identifier, table_schema_nested) with pytest.raises(NamespaceNotEmptyError): - catalog.drop_namespace(database_name) - catalog.drop_table((database_name, table_name)) - catalog.drop_namespace(database_name) - assert (database_name,) not in catalog.list_namespaces() + catalog.drop_namespace(namespace) + catalog.drop_table(table_identifier) + catalog.drop_namespace(namespace) + assert namespace not in catalog.list_namespaces() @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_load_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{database_name}.db", + "location": f"{warehouse_location}/{namespace}.db", "test_property1": "1", "test_property2": "2", "test_property3": "3", } - catalog.create_namespace(database_name, test_properties) - listed_properties = catalog.load_namespace_properties(database_name) + catalog.create_namespace(namespace, test_properties) + listed_properties = catalog.load_namespace_properties(namespace) for k, v in listed_properties.items(): assert k in test_properties assert v == test_properties[k] @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_load_empty_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: - 
catalog.create_namespace(database_name) - listed_properties = catalog.load_namespace_properties(database_name) +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_load_empty_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: + catalog.create_namespace(namespace) + listed_properties = catalog.load_namespace_properties(namespace) assert listed_properties == {"exists": "true"} @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) def test_load_namespace_properties_non_existing_namespace(catalog: SqlCatalog) -> None: @@ -807,25 +1145,26 @@ def test_load_namespace_properties_non_existing_namespace(catalog: SqlCatalog) - @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_update_namespace_properties(catalog: SqlCatalog, database_name: str) -> None: +@pytest.mark.parametrize("namespace", [lazy_fixture("database_name"), lazy_fixture("hierarchical_namespace_name")]) +def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> None: warehouse_location = "/test/location" test_properties = { "comment": "this is a test description", - "location": f"{warehouse_location}/{database_name}.db", + "location": f"{warehouse_location}/{namespace}.db", "test_property1": "1", "test_property2": "2", "test_property3": "3", } removals = {"test_property1", "test_property2", "test_property3", "should_not_removed"} updates = {"test_property4": "4", "test_property5": "5", "comment": "updated test description"} - catalog.create_namespace(database_name, test_properties) - update_report = catalog.update_namespace_properties(database_name, removals, updates) + catalog.create_namespace(namespace, test_properties) + update_report = catalog.update_namespace_properties(namespace, removals, updates) for k in updates.keys(): assert k in update_report.updated for k in removals: @@ -833,21 +1172,30 @@ def test_update_namespace_properties(catalog: SqlCatalog, database_name: str) -> assert k in update_report.missing else: assert k in update_report.removed - assert "updated test description" == catalog.load_namespace_properties(database_name)["comment"] + assert "updated test description" == catalog.load_namespace_properties(namespace)["comment"] @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) -def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_nested) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = 
Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) assert catalog._parse_metadata_version(table.metadata_location) == 0 assert table.metadata.current_schema_id == 0 @@ -870,18 +1218,27 @@ def test_commit_table(catalog: SqlCatalog, table_schema_nested: Schema, random_i @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), - lazy_fixture('catalog_sqlite_fsspec'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + lazy_fixture("catalog_sqlite_fsspec"), ], ) -def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table = catalog.create_table(random_identifier, table_schema_simple) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_simple) df = pa.Table.from_pydict( { @@ -901,28 +1258,37 @@ def test_append_table(catalog: SqlCatalog, table_schema_simple: Schema, random_i assert table.metadata.snapshots[0].sequence_number == 1 assert table.metadata.snapshots[0].summary is not None assert table.metadata.snapshots[0].summary.operation == Operation.APPEND - assert table.metadata.snapshots[0].summary['added-data-files'] == '1' - assert table.metadata.snapshots[0].summary['added-records'] == '1' - assert table.metadata.snapshots[0].summary['total-data-files'] == '1' - assert table.metadata.snapshots[0].summary['total-records'] == '1' + assert table.metadata.snapshots[0].summary["added-data-files"] == "1" + assert table.metadata.snapshots[0].summary["added-records"] == "1" + assert table.metadata.snapshots[0].summary["total-data-files"] == "1" + assert table.metadata.snapshots[0].summary["total-records"] == "1" # read back the data assert df == table.scan().to_arrow() @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) -def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - table_a = catalog.create_table(random_identifier, table_schema_simple) - table_b = catalog.load_table(random_identifier) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog 
= catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + table_a = catalog.create_table(table_identifier, table_schema_simple) + table_b = catalog.load_table(table_identifier) with table_a.update_schema() as update: update.add_column(path="b", field_type=IntegerType()) @@ -934,11 +1300,11 @@ def test_concurrent_commit_table(catalog: SqlCatalog, table_schema_simple: Schem @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) @pytest.mark.parametrize("format_version", [1, 2]) @@ -957,20 +1323,20 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, - schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), + schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]), ) tbl = catalog.create_table(identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)}) pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ - pa.field("foo", pa.string(), nullable=True), + pa.field("foo", pa.large_string(), nullable=True), pa.field("bar", pa.int32(), nullable=True), ]), ) @@ -985,55 +1351,142 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None: @pytest.mark.parametrize( - 'catalog', + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> None: + identifier = f"default.arrow_create_table_transaction_{catalog.name}_{format_version}" + try: + catalog.create_namespace("default") + except NamespaceAlreadyExistsError: + pass + + try: + catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + pa_table = pa.Table.from_pydict( + { + "foo": ["a", None, "z"], + }, + schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]), + ) + + pa_table_with_column = pa.Table.from_pydict( + { + "foo": ["a", None, "z"], + "bar": [19, None, 25], + }, + schema=pa.schema([ + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + ]), + ) + + with catalog.create_table_transaction( + identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)} + ) as txn: + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=txn._table.io): + snapshot_update.append_data_file(data_file) + + with txn.update_schema() as schema_txn: + schema_txn.union_by_name(pa_table_with_column.schema) + + with txn.update_snapshot().fast_append() as snapshot_update: + for data_file in _dataframe_to_data_files( + table_metadata=txn.table_metadata, df=pa_table_with_column, io=txn._table.io + ): + snapshot_update.append_data_file(data_file) + + tbl = catalog.load_table(identifier=identifier) + assert tbl.format_version == format_version + assert len(tbl.scan().to_arrow()) == 6 + + 
+@pytest.mark.parametrize( + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), ], ) -def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) +def test_table_properties_int_value(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: # table properties can be set to int, but still serialized to string - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) property_with_int = {"property_name": 42} - table = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_int) + table = catalog.create_table(table_identifier, table_schema_simple, properties=property_with_int) assert isinstance(table.properties["property_name"], str) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), - lazy_fixture('catalog_sqlite_without_rowcount'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), ], ) def test_table_properties_raise_for_none_value( - catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier + catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier ) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) property_with_none = {"property_name": None} with pytest.raises(ValidationError) as exc_info: - _ = catalog.create_table(random_identifier, table_schema_simple, properties=property_with_none) + _ = catalog.create_table(table_identifier, table_schema_simple, properties=property_with_none) assert "None type is not a supported value in properties: property_name" in str(exc_info.value) @pytest.mark.parametrize( - 'catalog', + "catalog", [ - lazy_fixture('catalog_memory'), - lazy_fixture('catalog_sqlite'), + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), ], ) -def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, random_identifier: Identifier) -> None: - database_name, _table_name = random_identifier - catalog.create_namespace(database_name) - catalog.create_table(random_identifier, table_schema_simple, properties={"format-version": "2"}) - existing_table = random_identifier +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + lazy_fixture("random_table_identifier_with_catalog"), + ], +) 
+def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, table_identifier: Identifier) -> None: + table_identifier_nocatalog = catalog.identifier_to_tuple_without_catalog(table_identifier) + namespace = Catalog.namespace_from(table_identifier_nocatalog) + catalog.create_namespace(namespace) + catalog.create_table(table_identifier, table_schema_simple, properties={"format-version": "2"}) + existing_table = table_identifier # Act and Assert for an existing table assert catalog.table_exists(existing_table) is True # Act and Assert for a non-existing table - assert catalog.table_exists(('non', 'exist')) is False + assert catalog.table_exists(("non", "exist")) is False diff --git a/tests/conftest.py b/tests/conftest.py index 6679543694..2092d93d0e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,10 +29,11 @@ import re import socket import string +import time import uuid from datetime import date, datetime, timezone from pathlib import Path -from random import choice +from random import choice, randint from tempfile import TemporaryDirectory from typing import ( TYPE_CHECKING, @@ -324,9 +325,9 @@ def pyarrow_schema_simple_without_ids() -> "pa.Schema": import pyarrow as pa return pa.schema([ - pa.field('foo', pa.string(), nullable=True), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), ]) @@ -335,12 +336,12 @@ def pyarrow_schema_nested_without_ids() -> "pa.Schema": import pyarrow as pa return pa.schema([ - pa.field('foo', pa.string(), nullable=False), - pa.field('bar', pa.int32(), nullable=False), - pa.field('baz', pa.bool_(), nullable=True), - pa.field('qux', pa.list_(pa.string()), nullable=False), + pa.field("foo", pa.string(), nullable=False), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("qux", pa.list_(pa.string()), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -348,20 +349,20 @@ def pyarrow_schema_nested_without_ids() -> "pa.Schema": nullable=False, ), pa.field( - 'location', + "location", pa.list_( pa.struct([ - pa.field('latitude', pa.float32(), nullable=False), - pa.field('longitude', pa.float32(), nullable=False), + pa.field("latitude", pa.float32(), nullable=False), + pa.field("longitude", pa.float32(), nullable=False), ]), ), nullable=False, ), pa.field( - 'person', + "person", pa.struct([ - pa.field('name', pa.string(), nullable=True), - pa.field('age', pa.int32(), nullable=False), + pa.field("name", pa.string(), nullable=True), + pa.field("age", pa.int32(), nullable=False), ]), nullable=True, ), @@ -731,6 +732,77 @@ def example_table_metadata_no_snapshot_v1() -> Dict[str, Any]: return EXAMPLE_TABLE_METADATA_NO_SNAPSHOT_V1 +@pytest.fixture +def example_table_metadata_v2_with_extensive_snapshots() -> Dict[str, Any]: + def generate_snapshot( + snapshot_id: int, + parent_snapshot_id: Optional[int] = None, + timestamp_ms: Optional[int] = None, + sequence_number: int = 0, + ) -> Dict[str, Any]: + return { + "snapshot-id": snapshot_id, + "parent-snapshot-id": parent_snapshot_id, + "timestamp-ms": timestamp_ms or int(time.time() * 1000), + "sequence-number": sequence_number, + "summary": {"operation": "append"}, + "manifest-list": f"s3://a/b/{snapshot_id}.avro", + } + + snapshots = [] + snapshot_log = [] + initial_snapshot_id = 3051729675574597004 + + for i in 
range(2000): + snapshot_id = initial_snapshot_id + i + parent_snapshot_id = snapshot_id - 1 if i > 0 else None + timestamp_ms = int(time.time() * 1000) - randint(0, 1000000) + snapshots.append(generate_snapshot(snapshot_id, parent_snapshot_id, timestamp_ms, i)) + snapshot_log.append({"snapshot-id": snapshot_id, "timestamp-ms": timestamp_ms}) + + return { + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long", "doc": "comment"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + ], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + {"transform": "identity", "source-id": 2, "direction": "asc", "null-order": "nulls-first"}, + {"transform": "bucket[4]", "source-id": 3, "direction": "desc", "null-order": "nulls-last"}, + ], + } + ], + "properties": {"read.split.target.size": "134217728"}, + "current-snapshot-id": initial_snapshot_id + 1999, + "snapshots": snapshots, + "snapshot-log": snapshot_log, + "metadata-log": [{"metadata-file": "s3://bucket/.../v1.json", "timestamp-ms": 1515100}], + "refs": {"test": {"snapshot-id": initial_snapshot_id, "type": "tag", "max-ref-age-ms": 10000000}}, + } + + EXAMPLE_TABLE_METADATA_V2 = { "format-version": 2, "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", @@ -1878,6 +1950,19 @@ def database_list(database_name: str) -> List[str]: return [f"{database_name}_{idx}" for idx in range(NUM_TABLES)] +@pytest.fixture() +def hierarchical_namespace_name() -> str: + prefix = "my_iceberg_ns-" + random_tag1 = "".join(choice(string.ascii_letters) for _ in range(RANDOM_LENGTH)) + random_tag2 = "".join(choice(string.ascii_letters) for _ in range(RANDOM_LENGTH)) + return ".".join([prefix + random_tag1, prefix + random_tag2]).lower() + + +@pytest.fixture() +def hierarchical_namespace_list(hierarchical_namespace_name: str) -> List[str]: + return [f"{hierarchical_namespace_name}_{idx}" for idx in range(NUM_TABLES)] + + BUCKET_NAME = "test_bucket" TABLE_METADATA_LOCATION_REGEX = re.compile( r"""s3://test_bucket/my_iceberg_database-[a-z]{20}.db/ @@ -1979,6 +2064,18 @@ def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: ) +@pytest.fixture +def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: Dict[str, Any]) -> Table: + table_metadata = TableMetadataV2(**example_table_metadata_v2_with_extensive_snapshots) + return Table( + identifier=("database", "table"), + metadata=table_metadata, + metadata_location=f"{table_metadata.location}/uuid.metadata.json", + io=load_file_io(), + catalog=NoopCatalog("NoopCatalog"), + ) + + @pytest.fixture def bound_reference_str() -> BoundReference[str]: return BoundReference(field=NestedField(1, "field", StringType(), required=False), accessor=Accessor(position=0, inner=None)) @@ -2068,31 +2165,31 @@ def spark() -> "SparkSession": TEST_DATA_WITH_NULL = { - 'bool': 
[False, None, True], - 'string': ['a', None, 'z'], + "bool": [False, None, True], + "string": ["a", None, "z"], # Go over the 16 bytes to kick in truncation - 'string_long': ['a' * 22, None, 'z' * 22], - 'int': [1, None, 9], - 'long': [1, None, 9], - 'float': [0.0, None, 0.9], - 'double': [0.0, None, 0.9], + "string_long": ["a" * 22, None, "z" * 22], + "int": [1, None, 9], + "long": [1, None, 9], + "float": [0.0, None, 0.9], + "double": [0.0, None, 0.9], # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields - 'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], - 'timestamptz': [ + "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz": [ datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), None, datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), ], - 'date': [date(2023, 1, 1), None, date(2023, 3, 1)], + "date": [date(2023, 1, 1), None, date(2023, 3, 1)], # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], - 'binary': [b'\01', None, b'\22'], - 'fixed': [ - uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, + "binary": [b"\01", None, b"\22"], + "fixed": [ + uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, None, - uuid.UUID('11111111-1111-1111-1111-111111111111').bytes, + uuid.UUID("11111111-1111-1111-1111-111111111111").bytes, ], } @@ -2103,8 +2200,8 @@ def pa_schema() -> "pa.Schema": return pa.schema([ ("bool", pa.bool_()), - ("string", pa.string()), - ("string_long", pa.string()), + ("string", pa.large_string()), + ("string_long", pa.large_string()), ("int", pa.int32()), ("long", pa.int64()), ("float", pa.float32()), @@ -2145,3 +2242,46 @@ def arrow_table_with_only_nulls(pa_schema: "pa.Schema") -> "pa.Table": import pyarrow as pa return pa.Table.from_pylist([{}, {}], schema=pa_schema) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps() -> "pa.Table": + """Pyarrow table with only date, timestamp and timestamptz values.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "date": [date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), date(2024, 2, 1), None], + "timestamp": [ + datetime(2023, 12, 31, 0, 0, 0), + datetime(2024, 1, 1, 0, 0, 0), + datetime(2024, 1, 31, 0, 0, 0), + datetime(2024, 2, 1, 0, 0, 0), + datetime(2024, 2, 1, 6, 0, 0), + None, + ], + "timestamptz": [ + datetime(2023, 12, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 1, 31, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 0, 0, 0, tzinfo=timezone.utc), + datetime(2024, 2, 1, 6, 0, 0, tzinfo=timezone.utc), + None, + ], + }, + schema=pa.schema([ + ("date", pa.date32()), + ("timestamp", pa.timestamp(unit="us")), + ("timestamptz", pa.timestamp(unit="us", tz="UTC")), + ]), + ) + + +@pytest.fixture(scope="session") +def arrow_table_date_timestamps_schema() -> Schema: + """Pyarrow table Schema with only date, timestamp and timestamptz values.""" + return Schema( + NestedField(field_id=1, name="date", field_type=DateType(), required=False), + NestedField(field_id=2, name="timestamp", field_type=TimestampType(), required=False), + NestedField(field_id=3, name="timestamptz", field_type=TimestamptzType(), required=False), + ) diff --git 
a/tests/expressions/test_expressions.py b/tests/expressions/test_expressions.py index f277672d87..87856a04f6 100644 --- a/tests/expressions/test_expressions.py +++ b/tests/expressions/test_expressions.py @@ -1152,11 +1152,11 @@ def test_above_long_bounds_greater_than_or_equal( def test_eq_bound_expression(bound_reference_str: BoundReference[str]) -> None: - assert BoundEqualTo(term=bound_reference_str, literal=literal('a')) != BoundGreaterThanOrEqual( - term=bound_reference_str, literal=literal('a') + assert BoundEqualTo(term=bound_reference_str, literal=literal("a")) != BoundGreaterThanOrEqual( + term=bound_reference_str, literal=literal("a") ) - assert BoundEqualTo(term=bound_reference_str, literal=literal('a')) == BoundEqualTo( - term=bound_reference_str, literal=literal('a') + assert BoundEqualTo(term=bound_reference_str, literal=literal("a")) == BoundEqualTo( + term=bound_reference_str, literal=literal("a") ) diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 94c73918c8..84729fcca4 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -65,10 +65,10 @@ ) ARROW_SCHEMA_WITH_IDS = pa.schema([ - pa.field('foo', pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), - pa.field('bar', pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), - pa.field('baz', pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), - pa.field('qux', pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), + pa.field("foo", pa.bool_(), nullable=False, metadata={"PARQUET:field_id": "1"}), + pa.field("bar", pa.string(), nullable=False, metadata={"PARQUET:field_id": "2"}), + pa.field("baz", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "3"}), + pa.field("qux", pa.date32(), nullable=False, metadata={"PARQUET:field_id": "4"}), ]) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 2c936c2ca9..262d2f39df 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -88,45 +88,45 @@ def test_inspect_snapshots( df = tbl.inspect.snapshots() assert df.column_names == [ - 'committed_at', - 'snapshot_id', - 'parent_id', - 'operation', - 'manifest_list', - 'summary', + "committed_at", + "snapshot_id", + "parent_id", + "operation", + "manifest_list", + "summary", ] - for committed_at in df['committed_at']: + for committed_at in df["committed_at"]: assert isinstance(committed_at.as_py(), datetime) - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - assert df['parent_id'][0].as_py() is None - assert df['parent_id'][1:] == df['snapshot_id'][:2] + assert df["parent_id"][0].as_py() is None + assert df["parent_id"][1:] == df["snapshot_id"][:2] - assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append'] + assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] - for manifest_list in df['manifest_list']: + for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") - assert df['summary'][0].as_py() == [ - ('added-files-size', '5459'), - ('added-data-files', '1'), - ('added-records', '3'), - ('total-data-files', '1'), - ('total-delete-files', '0'), - ('total-records', '3'), - ('total-files-size', '5459'), - ('total-position-deletes', '0'), - ('total-equality-deletes', '0'), + assert df["summary"][0].as_py() == [ + ("added-files-size", 
"5459"), + ("added-data-files", "1"), + ("added-records", "3"), + ("total-data-files", "1"), + ("total-delete-files", "0"), + ("total-records", "3"), + ("total-files-size", "5459"), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() rhs = df.to_pandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'summary': + if column == "summary": # Arrow returns a list of tuples, instead of a dict right = dict(right) @@ -150,29 +150,29 @@ def test_inspect_entries( def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None: assert df.column_names == [ - 'status', - 'snapshot_id', - 'sequence_number', - 'file_sequence_number', - 'data_file', - 'readable_metrics', + "status", + "snapshot_id", + "sequence_number", + "file_sequence_number", + "data_file", + "readable_metrics", ] # Make sure that they are filled properly - for int_column in ['status', 'snapshot_id', 'sequence_number', 'file_sequence_number']: + for int_column in ["status", "snapshot_id", "sequence_number", "file_sequence_number"]: for value in df[int_column]: assert isinstance(value.as_py(), int) - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) lhs = df.to_pandas() rhs = spark_df.toPandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'data_file': + if column == "data_file": for df_column in left.keys(): - if df_column == 'partition': + if df_column == "partition": # Spark leaves out the partition if the table is unpartitioned continue @@ -183,20 +183,20 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non df_lhs = dict(df_lhs) assert df_lhs == df_rhs, f"Difference in data_file column {df_column}: {df_lhs} != {df_rhs}" - elif column == 'readable_metrics': + elif column == "readable_metrics": assert list(left.keys()) == [ - 'bool', - 'string', - 'string_long', - 'int', - 'long', - 'float', - 'double', - 'timestamp', - 'timestamptz', - 'date', - 'binary', - 'fixed', + "bool", + "string", + "string_long", + "int", + "long", + "float", + "double", + "timestamp", + "timestamptz", + "date", + "binary", + "fixed", ] assert left.keys() == right.keys() @@ -205,18 +205,18 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non rm_lhs = left[rm_column] rm_rhs = right[rm_column] - assert rm_lhs['column_size'] == rm_rhs['column_size'] - assert rm_lhs['value_count'] == rm_rhs['value_count'] - assert rm_lhs['null_value_count'] == rm_rhs['null_value_count'] - assert rm_lhs['nan_value_count'] == rm_rhs['nan_value_count'] + assert rm_lhs["column_size"] == rm_rhs["column_size"] + assert rm_lhs["value_count"] == rm_rhs["value_count"] + assert rm_lhs["null_value_count"] == rm_rhs["null_value_count"] + assert rm_lhs["nan_value_count"] == rm_rhs["nan_value_count"] - if rm_column == 'timestamptz': + if rm_column == "timestamptz": # PySpark does not correctly set the timstamptz - rm_rhs['lower_bound'] = rm_rhs['lower_bound'].replace(tzinfo=pytz.utc) - rm_rhs['upper_bound'] = rm_rhs['upper_bound'].replace(tzinfo=pytz.utc) + rm_rhs["lower_bound"] = rm_rhs["lower_bound"].replace(tzinfo=pytz.utc) + rm_rhs["upper_bound"] = rm_rhs["upper_bound"].replace(tzinfo=pytz.utc) - assert rm_lhs['lower_bound'] == rm_rhs['lower_bound'] - assert rm_lhs['upper_bound'] == rm_rhs['upper_bound'] + assert 
rm_lhs["lower_bound"] == rm_rhs["lower_bound"] + assert rm_lhs["upper_bound"] == rm_rhs["upper_bound"] else: assert left == right, f"Difference in column {column}: {left} != {right}" @@ -265,8 +265,8 @@ def test_inspect_entries_partitioned(spark: SparkSession, session_catalog: Catal df = session_catalog.load_table(identifier).inspect.entries() - assert df.to_pydict()['data_file'][0]['partition'] == {'dt_day': date(2021, 2, 1), 'dt_month': None} - assert df.to_pydict()['data_file'][1]['partition'] == {'dt_day': None, 'dt_month': 612} + assert df.to_pydict()["data_file"][0]["partition"] == {"dt_day": date(2021, 2, 1), "dt_month": None} + assert df.to_pydict()["data_file"][1]["partition"] == {"dt_day": None, "dt_month": 612} @pytest.mark.integration @@ -301,21 +301,21 @@ def test_inspect_refs( df = tbl.refresh().inspect.refs() assert df.column_names == [ - 'name', - 'type', - 'snapshot_id', - 'max_reference_age_in_ms', - 'min_snapshots_to_keep', - 'max_snapshot_age_in_ms', + "name", + "type", + "snapshot_id", + "max_reference_age_in_ms", + "min_snapshots_to_keep", + "max_snapshot_age_in_ms", ] - assert [name.as_py() for name in df['name']] == ['testBranch', 'main', 'testTag'] - assert [ref_type.as_py() for ref_type in df['type']] == ['BRANCH', 'BRANCH', 'TAG'] + assert [name.as_py() for name in df["name"]] == ["testBranch", "main", "testTag"] + assert [ref_type.as_py() for ref_type in df["type"]] == ["BRANCH", "BRANCH", "TAG"] - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - for int_column in ['max_reference_age_in_ms', 'min_snapshots_to_keep', 'max_snapshot_age_in_ms']: + for int_column in ["max_reference_age_in_ms", "min_snapshots_to_keep", "max_snapshot_age_in_ms"]: for value in df[int_column]: assert isinstance(value.as_py(), int) or not value.as_py() @@ -343,28 +343,28 @@ def test_inspect_partitions_unpartitioned( df = tbl.inspect.partitions() assert df.column_names == [ - 'record_count', - 'file_count', - 'total_data_file_size_in_bytes', - 'position_delete_record_count', - 'position_delete_file_count', - 'equality_delete_record_count', - 'equality_delete_file_count', - 'last_updated_at', - 'last_updated_snapshot_id', + "record_count", + "file_count", + "total_data_file_size_in_bytes", + "position_delete_record_count", + "position_delete_file_count", + "equality_delete_record_count", + "equality_delete_file_count", + "last_updated_at", + "last_updated_snapshot_id", ] - for last_updated_at in df['last_updated_at']: + for last_updated_at in df["last_updated_at"]: assert isinstance(last_updated_at.as_py(), datetime) int_cols = [ - 'record_count', - 'file_count', - 'total_data_file_size_in_bytes', - 'position_delete_record_count', - 'position_delete_file_count', - 'equality_delete_record_count', - 'equality_delete_file_count', - 'last_updated_snapshot_id', + "record_count", + "file_count", + "total_data_file_size_in_bytes", + "position_delete_record_count", + "position_delete_file_count", + "equality_delete_record_count", + "equality_delete_file_count", + "last_updated_snapshot_id", ] for column in int_cols: for value in df[column]: @@ -434,8 +434,8 @@ def test_inspect_partitions_partitioned(spark: SparkSession, session_catalog: Ca ) def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None: - lhs = df.to_pandas().sort_values('spec_id') - rhs = spark_df.toPandas().sort_values('spec_id') + lhs = df.to_pandas().sort_values("spec_id") + rhs = spark_df.toPandas().sort_values("spec_id") for column 
in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): assert left == right, f"Difference in column {column}: {left} != {right}" @@ -447,6 +447,197 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> Non check_pyiceberg_df_equals_spark_df(df, spark_df) +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_manifests(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = "default.table_metadata_manifests" + try: + session_catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + spark.sql( + f""" + CREATE TABLE {identifier} ( + id int, + data string + ) + PARTITIONED BY (data) + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (1, "a") + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (2, "b") + """ + ) + + df = session_catalog.load_table(identifier).inspect.manifests() + + assert df.column_names == [ + "content", + "path", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", + "partition_summaries", + ] + + int_cols = [ + "content", + "length", + "partition_spec_id", + "added_snapshot_id", + "added_data_files_count", + "existing_data_files_count", + "deleted_data_files_count", + "added_delete_files_count", + "existing_delete_files_count", + "deleted_delete_files_count", + ] + + for column in int_cols: + for value in df[column]: + assert isinstance(value.as_py(), int) + + for value in df["path"]: + assert isinstance(value.as_py(), str) + + for value in df["partition_summaries"]: + assert isinstance(value.as_py(), list) + for row in value: + assert isinstance(row["contains_null"].as_py(), bool) + assert isinstance(row["contains_nan"].as_py(), (bool, type(None))) + assert isinstance(row["lower_bound"].as_py(), (str, type(None))) + assert isinstance(row["upper_bound"].as_py(), (str, type(None))) + + lhs = spark.table(f"{identifier}.manifests").toPandas() + rhs = df.to_pandas() + for column in df.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + assert left == right, f"Difference in column {column}: {left} != {right}" + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_metadata_log_entries( + spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int +) -> None: + from pandas.testing import assert_frame_equal + + identifier = "default.table_metadata_log_entries" + try: + session_catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version}) + + # Write some data + tbl.append(arrow_table_with_null) + tbl.append(arrow_table_with_null) + tbl.append(arrow_table_with_null) + + df = tbl.inspect.metadata_log_entries() + spark_df = spark.sql(f"SELECT * FROM {identifier}.metadata_log_entries") + lhs = df.to_pandas() + rhs = spark_df.toPandas() + + # Timestamp in the last row of `metadata_log_entries` table is based on when the table was read + # Therefore, the timestamp of the last row for pyiceberg dataframe and spark dataframe will be different + left_before_last, left_last = lhs[:-1], lhs[-1:] + right_before_last, right_last = rhs[:-1], rhs[-1:] + + # compare all rows except for the last 
row + assert_frame_equal(left_before_last, right_before_last, check_dtype=False) + # compare the last row, except for the timestamp + for column in df.column_names: + for left, right in zip(left_last[column], right_last[column]): + if column == "timestamp": + continue + assert left == right, f"Difference in column {column}: {left} != {right}" + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_inspect_history(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = "default.table_history" + + try: + session_catalog.drop_table(identifier=identifier) + except NoSuchTableError: + pass + + spark.sql( + f""" + CREATE TABLE {identifier} ( + id int, + data string + ) + PARTITIONED BY (data) + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (1, "a") + """ + ) + + table = session_catalog.load_table(identifier) + first_snapshot = table.current_snapshot() + snapshot_id = None if not first_snapshot else first_snapshot.snapshot_id + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (2, "b") + """ + ) + + spark.sql( + f""" + CALL integration.system.rollback_to_snapshot('{identifier}', {snapshot_id}) + """ + ) + + spark.sql( + f""" + INSERT INTO {identifier} VALUES (3, "c") + """ + ) + + table.refresh() + + df = table.inspect.history() + + assert df.column_names == [ + "made_current_at", + "snapshot_id", + "parent_id", + "is_current_ancestor", + ] + + lhs = spark.table(f"{identifier}.history").toPandas() + rhs = df.to_pandas() + for column in df.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right): + # NaN != NaN in Python + continue + assert left == right, f"Difference in column {column}: {left} != {right}" + @pytest.mark.integration @pytest.mark.parametrize("format_version", [1, 2]) def test_inspect_files( @@ -550,4 +741,4 @@ def test_inspect_files( assert rm_lhs['lower_bound'] == rm_rhs['lower_bound'] assert rm_lhs['upper_bound'] == rm_rhs['upper_bound'] else: - assert left == right, f"Difference in column {column}: {left} != {right}" + assert left == right, f"Difference in column {column}: {left} != {right}" \ No newline at end of file diff --git a/tests/integration/test_partition_evolution.py b/tests/integration/test_partition_evolution.py index 785b34b82c..5cc7512f4a 100644 --- a/tests/integration/test_partition_evolution.py +++ b/tests/integration/test_partition_evolution.py @@ -73,7 +73,7 @@ def _create_table_with_schema(catalog: Catalog, schema: Schema, format_version: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_identity_partition(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _simple_table(catalog, table_schema_simple) simple_table.update_spec().add_identity("foo").commit() @@ -85,7 +85,7 @@ def test_add_identity_partition(catalog: Catalog, table_schema_simple: Schema) - @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_year(catalog: Catalog) -> None: table = _table(catalog) 
table.update_spec().add_field("event_ts", YearTransform(), "year_transform").commit() @@ -93,7 +93,7 @@ def test_add_year(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_month(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", MonthTransform(), "month_transform").commit() @@ -101,7 +101,7 @@ def test_add_month(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_day(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", DayTransform(), "day_transform").commit() @@ -109,7 +109,7 @@ def test_add_day(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_hour(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", HourTransform(), "hour_transform").commit() @@ -117,7 +117,7 @@ def test_add_hour(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_bucket(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _create_table_with_schema(catalog, table_schema_simple, "1") simple_table.update_spec().add_field("foo", BucketTransform(12), "bucket_transform").commit() @@ -125,7 +125,7 @@ def test_add_bucket(catalog: Catalog, table_schema_simple: Schema) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_truncate(catalog: Catalog, table_schema_simple: Schema) -> None: simple_table = _create_table_with_schema(catalog, table_schema_simple, "1") simple_table.update_spec().add_field("foo", TruncateTransform(1), "truncate_transform").commit() @@ -135,7 +135,7 @@ def test_add_truncate(catalog: Catalog, table_schema_simple: Schema) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").add_field("event_ts", HourTransform(), "hourly_partitioned").add_field( @@ -153,7 +153,7 @@ def test_multiple_adds(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) 
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_hour_to_day(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("event_ts", DayTransform(), "daily_partitioned").commit() @@ -169,7 +169,7 @@ def test_add_hour_to_day(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_add_multiple_buckets(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_field("id", BucketTransform(16)).add_field("id", BucketTransform(4)).commit() @@ -184,7 +184,7 @@ def test_add_multiple_buckets(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_identity(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").commit() @@ -192,12 +192,12 @@ def test_remove_identity(catalog: Catalog) -> None: assert len(table.specs()) == 3 assert table.spec().spec_id == 2 assert table.spec() == PartitionSpec( - PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name='id'), spec_id=2 + PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name="id"), spec_id=2 ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_identity_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) table_v2.update_spec().add_identity("id").commit() @@ -208,7 +208,7 @@ def test_remove_identity_v2(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_bucket(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -223,13 +223,13 @@ def test_remove_bucket(catalog: Catalog) -> None: 1001, 2, 1001, - PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name='bucketed_id'), - PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name='day_ts'), + PartitionField(source_id=1, field_id=1000, transform=VoidTransform(), name="bucketed_id"), + PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name="day_ts"), ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_bucket_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: @@ -239,12 +239,12 @@ def test_remove_bucket_v2(catalog: Catalog) -> None: remove.remove_field("bucketed_id") assert len(table_v2.specs()) == 3 _validate_new_partition_fields( - table_v2, 
1001, 2, 1001, PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name='day_ts') + table_v2, 1001, 2, 1001, PartitionField(source_id=2, field_id=1001, transform=DayTransform(), name="day_ts") ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_day(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -259,13 +259,13 @@ def test_remove_day(catalog: Catalog) -> None: 1001, 2, 1001, - PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name='bucketed_id'), - PartitionField(source_id=2, field_id=1001, transform=VoidTransform(), name='day_ts'), + PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name="bucketed_id"), + PartitionField(source_id=2, field_id=1001, transform=VoidTransform(), name="day_ts"), ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_remove_day_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: @@ -275,12 +275,12 @@ def test_remove_day_v2(catalog: Catalog) -> None: remove.remove_field("day_ts") assert len(table_v2.specs()) == 3 _validate_new_partition_fields( - table_v2, 1000, 2, 1001, PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name='bucketed_id') + table_v2, 1000, 2, 1001, PartitionField(source_id=1, field_id=1000, transform=BucketTransform(16), name="bucketed_id") ) @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_rename(catalog: Catalog) -> None: table = _table(catalog) table.update_spec().add_identity("id").commit() @@ -291,7 +291,7 @@ def test_rename(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_and_remove(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -300,7 +300,7 @@ def test_cannot_add_and_remove(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_redundant_time_partition(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -311,7 +311,7 @@ def test_cannot_add_redundant_time_partition(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), 
pytest.lazy_fixture("session_catalog")]) def test_cannot_delete_and_rename(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -321,7 +321,7 @@ def test_cannot_delete_and_rename(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_rename_and_delete(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -331,7 +331,7 @@ def test_cannot_rename_and_delete(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_same_tranform_for_same_field(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -342,7 +342,7 @@ def test_cannot_add_same_tranform_for_same_field(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_same_field_multiple_times(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -353,7 +353,7 @@ def test_cannot_add_same_field_multiple_times(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_cannot_add_multiple_specs_same_name(catalog: Catalog) -> None: table = _table(catalog) with pytest.raises(ValueError) as exc_info: @@ -364,7 +364,7 @@ def test_cannot_add_multiple_specs_same_name(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_change_specs_and_schema_transaction(catalog: Catalog) -> None: table = _table(catalog) with table.transaction() as transaction: @@ -387,17 +387,17 @@ def test_change_specs_and_schema_transaction(catalog: Catalog) -> None: ) assert table.schema() == Schema( - NestedField(field_id=1, name='id', field_type=LongType(), required=False), - NestedField(field_id=2, name='event_ts', field_type=TimestampType(), required=False), - NestedField(field_id=3, name='str', field_type=StringType(), required=False), - NestedField(field_id=4, name='col_string', field_type=StringType(), required=False), + NestedField(field_id=1, name="id", field_type=LongType(), required=False), + NestedField(field_id=2, name="event_ts", field_type=TimestampType(), required=False), + NestedField(field_id=3, name="str", field_type=StringType(), required=False), + NestedField(field_id=4, name="col_string", field_type=StringType(), required=False), identifier_field_ids=[], ) assert table.schema().schema_id == 1 @pytest.mark.integration 
-@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds_and_remove_v1(catalog: Catalog) -> None: table = _table(catalog) with table.update_spec() as update: @@ -419,7 +419,7 @@ def test_multiple_adds_and_remove_v1(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_adds_and_remove_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: @@ -433,7 +433,7 @@ def test_multiple_adds_and_remove_v2(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_multiple_remove_and_add_reuses_v2(catalog: Catalog) -> None: table_v2 = _table_v2(catalog) with table_v2.update_spec() as update: diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index d89ecaf202..29f664909c 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -328,8 +328,8 @@ ), ( [PartitionField(source_id=11, field_id=1001, transform=IdentityTransform(), name="binary_field")], - [b'example'], - Record(binary_field=b'example'), + [b"example"], + Record(binary_field=b"example"), "binary_field=ZXhhbXBsZQ%3D%3D", f"""CREATE TABLE {identifier} ( binary_field binary, @@ -347,8 +347,8 @@ ), ( [PartitionField(source_id=13, field_id=1001, transform=IdentityTransform(), name="decimal_field")], - [Decimal('123.45')], - Record(decimal_field=Decimal('123.45')), + [Decimal("123.45")], + Record(decimal_field=Decimal("123.45")), "decimal_field=123.45", f"""CREATE TABLE {identifier} ( decimal_field decimal(5,2), @@ -638,8 +638,8 @@ ), ( [PartitionField(source_id=13, field_id=1001, transform=TruncateTransform(width=5), name="decimal_field_trunc")], - [Decimal('678.93')], - Record(decimal_field_trunc=Decimal('678.90')), + [Decimal("678.93")], + Record(decimal_field_trunc=Decimal("678.90")), "decimal_field_trunc=678.90", # Assuming truncation width of 1 leads to truncating to 670 f"""CREATE TABLE {identifier} ( decimal_field decimal(5,2), @@ -657,8 +657,8 @@ ), ( [PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(10), name="binary_field_trunc")], - [b'HELLOICEBERG'], - Record(binary_field_trunc=b'HELLOICEBE'), + [b"HELLOICEBERG"], + Record(binary_field_trunc=b"HELLOICEBE"), "binary_field_trunc=SEVMTE9JQ0VCRQ%3D%3D", f"""CREATE TABLE {identifier} ( binary_field binary, diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 2a10e37ba9..078abf406a 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -21,6 +21,7 @@ import uuid from urllib.parse import urlparse +import pyarrow as pa import pyarrow.parquet as pq import pytest from hive_metastore.ttypes import LockRequest, LockResponse, LockState, UnlockRequest @@ -51,7 +52,7 @@ ) from pyiceberg.utils.concurrent import ExecutorFactory -DEFAULT_PROPERTIES = 
{'write.parquet.compression-codec': 'zstd'} +DEFAULT_PROPERTIES = {"write.parquet.compression-codec": "zstd"} TABLE_NAME = ("default", "t1") @@ -74,7 +75,7 @@ def create_table(catalog: Catalog) -> Table: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties(catalog: Catalog) -> None: table = create_table(catalog) @@ -104,7 +105,7 @@ def test_table_properties(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_dict(catalog: Catalog) -> None: table = create_table(catalog) @@ -134,7 +135,7 @@ def test_table_properties_dict(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_error(catalog: Catalog) -> None: table = create_table(catalog) properties = {"abc": "def"} @@ -144,7 +145,7 @@ def test_table_properties_error(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_nan(catalog: Catalog) -> None: table_test_null_nan = catalog.load_table("default.test_null_nan") arrow_table = table_test_null_nan.scan(row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric")).to_arrow() @@ -154,7 +155,7 @@ def test_pyarrow_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") arrow_table = table_test_null_nan_rewritten.scan( @@ -166,7 +167,7 @@ def test_pyarrow_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) @pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162") def test_pyarrow_not_nan_count(catalog: Catalog) -> None: table_test_null_nan = catalog.load_table("default.test_null_nan") @@ -175,7 +176,48 @@ def test_pyarrow_not_nan_count(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_pyarrow_batches_nan(catalog: 
Catalog) -> None: + table_test_null_nan = catalog.load_table("default.test_null_nan") + arrow_batch_reader = table_test_null_nan.scan( + row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric") + ).to_arrow_batch_reader() + assert isinstance(arrow_batch_reader, pa.RecordBatchReader) + arrow_table = arrow_batch_reader.read_all() + assert len(arrow_table) == 1 + assert arrow_table["idx"][0].as_py() == 1 + assert math.isnan(arrow_table["col_numeric"][0].as_py()) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_pyarrow_batches_nan_rewritten(catalog: Catalog) -> None: + table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") + arrow_batch_reader = table_test_null_nan_rewritten.scan( + row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric") + ).to_arrow_batch_reader() + assert isinstance(arrow_batch_reader, pa.RecordBatchReader) + arrow_table = arrow_batch_reader.read_all() + assert len(arrow_table) == 1 + assert arrow_table["idx"][0].as_py() == 1 + assert math.isnan(arrow_table["col_numeric"][0].as_py()) + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +@pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162") +def test_pyarrow_batches_not_nan_count(catalog: Catalog) -> None: + table_test_null_nan = catalog.load_table("default.test_null_nan") + arrow_batch_reader = table_test_null_nan.scan( + row_filter=NotNaN("col_numeric"), selected_fields=("idx",) + ).to_arrow_batch_reader() + assert isinstance(arrow_batch_reader, pa.RecordBatchReader) + arrow_table = arrow_batch_reader.read_all() + assert len(arrow_table) == 2 + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_duckdb_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") con = table_test_null_nan_rewritten.scan().to_duckdb("table_test_null_nan") @@ -185,7 +227,7 @@ def test_duckdb_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_limit(catalog: Catalog) -> None: table_test_limit = catalog.load_table("default.test_limit") limited_result = table_test_limit.scan(selected_fields=("idx",), limit=1).to_arrow() @@ -200,7 +242,7 @@ def test_pyarrow_limit(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_daft_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") df = table_test_null_nan_rewritten.to_daft() @@ -209,7 +251,7 @@ def test_daft_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) 
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_daft_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") df = table_test_null_nan_rewritten.to_daft() @@ -222,7 +264,7 @@ def test_daft_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration @pytest.mark.filterwarnings("ignore") -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_nan(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") ray_dataset = table_test_null_nan_rewritten.scan().to_ray() @@ -231,7 +273,7 @@ def test_ray_nan(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_nan_rewritten(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") ray_dataset = table_test_null_nan_rewritten.scan( @@ -243,7 +285,7 @@ def test_ray_nan_rewritten(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) @pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162") def test_ray_not_nan_count(catalog: Catalog) -> None: table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten") @@ -252,7 +294,7 @@ def test_ray_not_nan_count(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_ray_all_types(catalog: Catalog) -> None: table_test_all_types = catalog.load_table("default.test_all_types") ray_dataset = table_test_all_types.scan().to_ray() @@ -262,7 +304,7 @@ def test_ray_all_types(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_to_iceberg_all_types(catalog: Catalog) -> None: table_test_all_types = catalog.load_table("default.test_all_types") fs = S3FileSystem( @@ -281,7 +323,7 @@ def test_pyarrow_to_iceberg_all_types(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_deletes(catalog: Catalog) -> None: # number, letter # (1, 'a'), @@ -318,7 +360,7 @@ def test_pyarrow_deletes(catalog: Catalog) -> None: @pytest.mark.integration 
-@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_pyarrow_deletes_double(catalog: Catalog) -> None: # number, letter # (1, 'a'), @@ -355,7 +397,91 @@ def test_pyarrow_deletes_double(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_pyarrow_batches_deletes(catalog: Catalog) -> None: + # number, letter + # (1, 'a'), + # (2, 'b'), + # (3, 'c'), + # (4, 'd'), + # (5, 'e'), + # (6, 'f'), + # (7, 'g'), + # (8, 'h'), + # (9, 'i'), <- deleted + # (10, 'j'), + # (11, 'k'), + # (12, 'l') + test_positional_mor_deletes = catalog.load_table("default.test_positional_mor_deletes") + arrow_table = test_positional_mor_deletes.scan().to_arrow_batch_reader().read_all() + assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12] + + # Checking the filter + arrow_table = ( + test_positional_mor_deletes.scan(row_filter=And(GreaterThanOrEqual("letter", "e"), LessThan("letter", "k"))) + .to_arrow_batch_reader() + .read_all() + ) + assert arrow_table["number"].to_pylist() == [5, 6, 7, 8, 10] + + # Testing the combination of a filter and a limit + arrow_table = ( + test_positional_mor_deletes.scan(row_filter=And(GreaterThanOrEqual("letter", "e"), LessThan("letter", "k")), limit=1) + .to_arrow_batch_reader() + .read_all() + ) + assert arrow_table["number"].to_pylist() == [5] + + # Testing the slicing of indices + arrow_table = test_positional_mor_deletes.scan(limit=3).to_arrow_batch_reader().read_all() + assert arrow_table["number"].to_pylist() == [1, 2, 3] + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_pyarrow_batches_deletes_double(catalog: Catalog) -> None: + # number, letter + # (1, 'a'), + # (2, 'b'), + # (3, 'c'), + # (4, 'd'), + # (5, 'e'), + # (6, 'f'), <- second delete + # (7, 'g'), + # (8, 'h'), + # (9, 'i'), <- first delete + # (10, 'j'), + # (11, 'k'), + # (12, 'l') + test_positional_mor_double_deletes = catalog.load_table("default.test_positional_mor_double_deletes") + arrow_table = test_positional_mor_double_deletes.scan().to_arrow_batch_reader().read_all() + assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10, 11, 12] + + # Checking the filter + arrow_table = ( + test_positional_mor_double_deletes.scan(row_filter=And(GreaterThanOrEqual("letter", "e"), LessThan("letter", "k"))) + .to_arrow_batch_reader() + .read_all() + ) + assert arrow_table["number"].to_pylist() == [5, 7, 8, 10] + + # Testing the combination of a filter and a limit + arrow_table = ( + test_positional_mor_double_deletes.scan( + row_filter=And(GreaterThanOrEqual("letter", "e"), LessThan("letter", "k")), limit=1 + ) + .to_arrow_batch_reader() + .read_all() + ) + assert arrow_table["number"].to_pylist() == [5] + + # Testing the slicing of indices + arrow_table = test_positional_mor_double_deletes.scan(limit=8).to_arrow_batch_reader().read_all() + assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 5, 7, 8, 10] + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), 
pytest.lazy_fixture("session_catalog")]) def test_partitioned_tables(catalog: Catalog) -> None: for table_name, predicate in [ ("test_partitioned_by_identity", "ts >= '2023-03-05T00:00:00+00:00'"), @@ -372,7 +498,7 @@ def test_partitioned_tables(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_unpartitioned_uuid_table(catalog: Catalog) -> None: unpartitioned_uuid = catalog.load_table("default.test_uuid_and_fixed_unpartitioned") arrow_table_eq = unpartitioned_uuid.scan(row_filter="uuid_col == '102cb62f-e6f8-4eb0-9973-d9b012ff0967'").to_arrow() @@ -389,7 +515,7 @@ def test_unpartitioned_uuid_table(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_unpartitioned_fixed_table(catalog: Catalog) -> None: fixed_table = catalog.load_table("default.test_uuid_and_fixed_unpartitioned") arrow_table_eq = fixed_table.scan(row_filter=EqualTo("fixed_col", b"1234567890123456789012345")).to_arrow() @@ -408,7 +534,7 @@ def test_unpartitioned_fixed_table(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_scan_tag(catalog: Catalog) -> None: test_positional_mor_deletes = catalog.load_table("default.test_positional_mor_deletes") arrow_table = test_positional_mor_deletes.scan().use_ref("tag_12").to_arrow() @@ -416,7 +542,7 @@ def test_scan_tag(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_scan_branch(catalog: Catalog) -> None: test_positional_mor_deletes = catalog.load_table("default.test_positional_mor_deletes") arrow_table = test_positional_mor_deletes.scan().use_ref("without_5").to_arrow() @@ -424,21 +550,21 @@ def test_scan_branch(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_filter_on_new_column(catalog: Catalog) -> None: test_table_add_column = catalog.load_table("default.test_table_add_column") arrow_table = test_table_add_column.scan(row_filter="b == '2'").to_arrow() - assert arrow_table["b"].to_pylist() == ['2'] + assert arrow_table["b"].to_pylist() == ["2"] arrow_table = test_table_add_column.scan(row_filter="b is not null").to_arrow() - assert arrow_table["b"].to_pylist() == ['2'] + assert arrow_table["b"].to_pylist() == ["2"] arrow_table = test_table_add_column.scan(row_filter="b is null").to_arrow() assert arrow_table["b"].to_pylist() == [None] @pytest.mark.integration -@pytest.mark.parametrize('catalog', 
[pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_upgrade_table_version(catalog: Catalog) -> None: table_test_table_version = catalog.load_table("default.test_table_version") @@ -466,7 +592,7 @@ def test_upgrade_table_version(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character(catalog: Catalog) -> None: table_test_table_sanitized_character = catalog.load_table("default.test_table_sanitized_character") arrow_table = table_test_table_sanitized_character.scan().to_arrow() @@ -476,7 +602,7 @@ def test_sanitize_character(catalog: Catalog) -> None: @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_null_list_and_map(catalog: Catalog) -> None: table_test_empty_list_and_map = catalog.load_table("default.test_table_empty_list_and_map") arrow_table = table_test_empty_list_and_map.scan().to_arrow() @@ -485,7 +611,7 @@ def test_null_list_and_map(catalog: Catalog) -> None: # This should be: # assert arrow_table["col_list_with_struct"].to_pylist() == [None, [{'test': 1}]] # Once https://github.com/apache/arrow/issues/38809 has been fixed - assert arrow_table["col_list_with_struct"].to_pylist() == [[], [{'test': 1}]] + assert arrow_table["col_list_with_struct"].to_pylist() == [[], [{"test": 1}]] @pytest.mark.integration diff --git a/tests/integration/test_rest_manifest.py b/tests/integration/test_rest_manifest.py index 8191209ae6..82c41cfd93 100644 --- a/tests/integration/test_rest_manifest.py +++ b/tests/integration/test_rest_manifest.py @@ -17,6 +17,7 @@ # pylint:disable=redefined-outer-name import inspect +from copy import copy from enum import Enum from tempfile import TemporaryDirectory from typing import Any @@ -26,7 +27,7 @@ from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.io.pyarrow import PyArrowFileIO -from pyiceberg.manifest import DataFile, ManifestEntry, write_manifest +from pyiceberg.manifest import DataFile, write_manifest from pyiceberg.table import Table from pyiceberg.utils.lazydict import LazyDict @@ -99,11 +100,11 @@ def test_write_sample_manifest(table_test_all_types: Table) -> None: sort_order_id=entry.data_file.sort_order_id, spec_id=entry.data_file.spec_id, ) - wrapped_entry_v2 = ManifestEntry(*entry.record_fields()) + wrapped_entry_v2 = copy(entry) wrapped_entry_v2.data_file = wrapped_data_file_v2_debug wrapped_entry_v2_dict = todict(wrapped_entry_v2) # This one should not be written - del wrapped_entry_v2_dict['data_file']['spec_id'] + del wrapped_entry_v2_dict["data_file"]["spec_id"] with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/test_write_manifest.avro" diff --git a/tests/integration/test_rest_schema.py b/tests/integration/test_rest_schema.py index ac5d1ce050..f4ab98a883 100644 --- a/tests/integration/test_rest_schema.py +++ b/tests/integration/test_rest_schema.py @@ -358,16 +358,16 @@ def test_revert_changes(simple_table: Table, table_schema_simple: Schema) -> Non assert 
simple_table.schemas() == { 0: Schema( - NestedField(field_id=1, name='foo', field_type=StringType(), required=False), - NestedField(field_id=2, name='bar', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='baz', field_type=BooleanType(), required=False), + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), identifier_field_ids=[2], ), 1: Schema( - NestedField(field_id=1, name='foo', field_type=StringType(), required=False), - NestedField(field_id=2, name='bar', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='baz', field_type=BooleanType(), required=False), - NestedField(field_id=4, name='data', field_type=IntegerType(), required=False), + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + NestedField(field_id=4, name="data", field_type=IntegerType(), required=False), identifier_field_ids=[2], ), } @@ -685,9 +685,9 @@ def test_rename_simple(simple_table: Table) -> None: # Check that the name mapping gets updated assert simple_table.name_mapping() == NameMapping([ - MappedField(field_id=1, names=['foo', 'vo']), - MappedField(field_id=2, names=['bar', 'var']), - MappedField(field_id=3, names=['baz']), + MappedField(field_id=1, names=["foo", "vo"]), + MappedField(field_id=2, names=["bar", "var"]), + MappedField(field_id=3, names=["baz"]), ]) @@ -719,7 +719,7 @@ def test_rename_simple_nested(catalog: Catalog) -> None: # Check that the name mapping gets updated assert tbl.name_mapping() == NameMapping([ - MappedField(field_id=1, names=['foo'], fields=[MappedField(field_id=2, names=['bar', 'vo'])]), + MappedField(field_id=1, names=["foo"], fields=[MappedField(field_id=2, names=["bar", "vo"])]), ]) diff --git a/tests/integration/test_snapshot_operations.py b/tests/integration/test_snapshot_operations.py new file mode 100644 index 0000000000..639193383e --- /dev/null +++ b/tests/integration/test_snapshot_operations.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
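+#
+# Descriptive note (added comments, not part of the original patch): these
+# integration tests exercise the snapshot management API. Each test loads the
+# existing table `default.test_table_snapshot_operations`, picks a snapshot id
+# from its history, creates a tag or branch for it through
+# `Table.manage_snapshots()`, and asserts that the new reference appears in
+# `tbl.metadata.refs`.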
+import pytest + +from pyiceberg.catalog import Catalog +from pyiceberg.table.refs import SnapshotRef + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_create_tag(catalog: Catalog) -> None: + identifier = "default.test_table_snapshot_operations" + tbl = catalog.load_table(identifier) + assert len(tbl.history()) > 3 + tag_snapshot_id = tbl.history()[-3].snapshot_id + tbl.manage_snapshots().create_tag(snapshot_id=tag_snapshot_id, tag_name="tag123").commit() + assert tbl.metadata.refs["tag123"] == SnapshotRef(snapshot_id=tag_snapshot_id, snapshot_ref_type="tag") + + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_create_branch(catalog: Catalog) -> None: + identifier = "default.test_table_snapshot_operations" + tbl = catalog.load_table(identifier) + assert len(tbl.history()) > 2 + branch_snapshot_id = tbl.history()[-2].snapshot_id + tbl.manage_snapshots().create_branch(snapshot_id=branch_snapshot_id, branch_name="branch123").commit() + assert tbl.metadata.refs["branch123"] == SnapshotRef(snapshot_id=branch_snapshot_id, snapshot_ref_type="branch") diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index d84b9745a7..76d559ca57 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -16,6 +16,10 @@ # under the License. # pylint:disable=redefined-outer-name + +from datetime import date +from typing import Any, Set + import pyarrow as pa import pytest from pyspark.sql import SparkSession @@ -23,12 +27,14 @@ from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec +from pyiceberg.schema import Schema from pyiceberg.transforms import ( BucketTransform, DayTransform, HourTransform, IdentityTransform, MonthTransform, + Transform, TruncateTransform, YearTransform, ) @@ -38,7 +44,7 @@ @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_null_partitioned( @@ -71,7 +77,7 @@ def test_query_filter_null_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_without_data_partitioned( @@ -103,7 +109,7 @@ def test_query_filter_without_data_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", 'timestamp', 'timestamptz', 'binary'] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamp", "timestamptz", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_only_nulls_partitioned( @@ -135,7 +141,7 @@ def 
test_query_filter_only_nulls_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_appended_null_partitioned( @@ -174,7 +180,7 @@ def test_query_filter_appended_null_partitioned( @pytest.mark.integration @pytest.mark.parametrize( - "part_col", ['int', 'bool', 'string', "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] + "part_col", ["int", "bool", "string", "string_long", "long", "float", "double", "date", "timestamptz", "timestamp", "binary"] ) def test_query_filter_v1_v2_append_null( session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table, part_col: str @@ -225,7 +231,7 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '2'}, + properties={"format-version": "2"}, ) tbl.append(arrow_table_with_null) @@ -240,33 +246,33 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append'] + assert operations == ["append", "append"] summaries = [row.summary for row in rows] assert summaries[0] == { - 'changed-partition-count': '3', - 'added-data-files': '3', - 'added-files-size': '15029', - 'added-records': '3', - 'total-data-files': '3', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '15029', - 'total-position-deletes': '0', - 'total-records': '3', + "changed-partition-count": "3", + "added-data-files": "3", + "added-files-size": "15029", + "added-records": "3", + "total-data-files": "3", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "15029", + "total-position-deletes": "0", + "total-records": "3", } assert summaries[1] == { - 'changed-partition-count': '3', - 'added-data-files': '3', - 'added-files-size': '15029', - 'added-records': '3', - 'total-data-files': '6', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '30058', - 'total-position-deletes': '0', - 'total-records': '6', + "changed-partition-count": "3", + "added-data-files": "3", + "added-files-size": "15029", + "added-records": "3", + "total-data-files": "6", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "30058", + "total-position-deletes": "0", + "total-records": "6", } @@ -284,7 +290,7 @@ def test_data_files_with_table_partitioned_with_null( identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) tbl.append(arrow_table_with_null) @@ -320,7 +326,7 @@ def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog) -> Non identifier=identifier, schema=TABLE_SCHEMA, partition_spec=PartitionSpec(PartitionField(source_id=4, field_id=1001, transform=IdentityTransform(), name="int")), - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) with 
pytest.raises(ValueError, match="Expected PyArrow table, got: not a df"): @@ -351,18 +357,6 @@ def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog) -> Non (PartitionSpec(PartitionField(source_id=5, field_id=1001, transform=TruncateTransform(2), name="long_trunc"))), (PartitionSpec(PartitionField(source_id=2, field_id=1001, transform=TruncateTransform(2), name="string_trunc"))), (PartitionSpec(PartitionField(source_id=11, field_id=1001, transform=TruncateTransform(2), name="binary_trunc"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=YearTransform(), name="timestamp_year"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=YearTransform(), name="timestamptz_year"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=YearTransform(), name="date_year"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=MonthTransform(), name="timestamp_month"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=MonthTransform(), name="timestamptz_month"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=MonthTransform(), name="date_month"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=DayTransform(), name="timestamp_day"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=DayTransform(), name="timestamptz_day"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=DayTransform(), name="date_day"))), - (PartitionSpec(PartitionField(source_id=8, field_id=1001, transform=HourTransform(), name="timestamp_hour"))), - (PartitionSpec(PartitionField(source_id=9, field_id=1001, transform=HourTransform(), name="timestamptz_hour"))), - (PartitionSpec(PartitionField(source_id=10, field_id=1001, transform=HourTransform(), name="date_hour"))), ], ) def test_unsupported_transform( @@ -379,8 +373,189 @@ def test_unsupported_transform( identifier=identifier, schema=TABLE_SCHEMA, partition_spec=spec, - properties={'format-version': '1'}, + properties={"format-version": "1"}, ) - with pytest.raises(ValueError, match="All transforms are not supported.*"): + with pytest.raises( + ValueError, + match="Not all partition types are supported for writes. 
Following partitions cannot be written using pyarrow: *", + ): tbl.append(arrow_table_with_null) + + +@pytest.mark.integration +@pytest.mark.parametrize( + "transform,expected_rows", + [ + pytest.param(YearTransform(), 2, id="year_transform"), + pytest.param(MonthTransform(), 3, id="month_transform"), + pytest.param(DayTransform(), 3, id="day_transform"), + ], +) +@pytest.mark.parametrize("part_col", ["date", "timestamp", "timestamptz"]) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_ymd_transform_partitioned( + session_catalog: Catalog, + spark: SparkSession, + arrow_table_with_null: pa.Table, + transform: Transform[Any, Any], + expected_rows: int, + part_col: str, + format_version: int, +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_partition_on_col_{part_col}" + nested_field = TABLE_SCHEMA.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col) + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_with_null], + partition_spec=partition_spec, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 3, f"Expected 3 total rows for {identifier}" + for col in TEST_DATA_WITH_NULL.keys(): + assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + assert tbl.inspect.partitions().num_rows == expected_rows + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == expected_rows + + +@pytest.mark.integration +@pytest.mark.parametrize( + "transform,expected_partitions", + [ + pytest.param(YearTransform(), {53, 54, None}, id="year_transform"), + pytest.param(MonthTransform(), {647, 648, 649, None}, id="month_transform"), + pytest.param( + DayTransform(), {date(2023, 12, 31), date(2024, 1, 1), date(2024, 1, 31), date(2024, 2, 1), None}, id="day_transform" + ), + pytest.param(HourTransform(), {473328, 473352, 474072, 474096, 474102, None}, id="hour_transform"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_transform_partition_verify_partitions_count( + session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + transform: Transform[Any, Any], + expected_partitions: Set[Any], + format_version: int, +) -> None: + # Given + part_col = "timestamptz" + identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" + nested_field = arrow_table_date_timestamps_schema.find_field(part_col) + partition_spec = PartitionSpec( + PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col), + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total 
rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == len(expected_partitions) + assert {part[part_col] for part in partitions_table["partition"].to_pylist()} == expected_partitions + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == len(expected_partitions) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_append_multiple_partitions( + session_catalog: Catalog, + spark: SparkSession, + arrow_table_date_timestamps: pa.Table, + arrow_table_date_timestamps_schema: Schema, + format_version: int, +) -> None: + # Given + identifier = f"default.arrow_table_v{format_version}_with_multiple_partitions" + partition_spec = PartitionSpec( + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("date").field_id, + field_id=1001, + transform=YearTransform(), + name="date_year", + ), + PartitionField( + source_id=arrow_table_date_timestamps_schema.find_field("timestamptz").field_id, + field_id=1000, + transform=HourTransform(), + name="timestamptz_hour", + ), + ) + + # When + tbl = _create_table( + session_catalog=session_catalog, + identifier=identifier, + properties={"format-version": str(format_version)}, + data=[arrow_table_date_timestamps], + partition_spec=partition_spec, + schema=arrow_table_date_timestamps_schema, + ) + + # Then + assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" + df = spark.table(identifier) + assert df.count() == 6, f"Expected 6 total rows for {identifier}" + for col in arrow_table_date_timestamps.column_names: + assert df.where(f"{col} is not null").count() == 5, f"Expected 2 non-null rows for {col}" + assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" + + partitions_table = tbl.inspect.partitions() + assert partitions_table.num_rows == 6 + partitions = partitions_table["partition"].to_pylist() + assert {(part["date_year"], part["timestamptz_hour"]) for part in partitions} == { + (53, 473328), + (54, 473352), + (54, 474072), + (54, 474096), + (54, 474102), + (None, None), + } + files_df = spark.sql( + f""" + SELECT * + FROM {identifier}.files + """ + ) + assert files_df.count() == 6 diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 74b6857dce..4585406cbb 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -34,6 +34,7 @@ from pyiceberg.catalog import Catalog from pyiceberg.catalog.hive import HiveCatalog +from pyiceberg.catalog.rest import RestCatalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import NoSuchTableError from pyiceberg.partitioning import PartitionField, PartitionSpec @@ -186,47 +187,47 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append', 'overwrite'] + assert operations == ["append", "append", "overwrite"] summaries = [row.summary for row in rows] assert summaries[0] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'total-data-files': '1', - 
'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '5459', - 'total-position-deletes': '0', - 'total-records': '3', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "5459", + "total-position-deletes": "0", + "total-records": "3", } assert summaries[1] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'total-data-files': '2', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '10918', - 'total-position-deletes': '0', - 'total-records': '6', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "total-data-files": "2", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "10918", + "total-position-deletes": "0", + "total-records": "6", } assert summaries[2] == { - 'added-data-files': '1', - 'added-files-size': '5459', - 'added-records': '3', - 'deleted-data-files': '2', - 'deleted-records': '6', - 'removed-files-size': '10918', - 'total-data-files': '1', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '5459', - 'total-position-deletes': '0', - 'total-records': '3', + "added-data-files": "1", + "added-files-size": "5459", + "added-records": "3", + "deleted-data-files": "2", + "deleted-records": "6", + "removed-files-size": "10918", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "5459", + "total-position-deletes": "0", + "total-records": "3", } @@ -283,25 +284,25 @@ def test_python_writes_special_character_column_with_spark_reads( identifier = "default.python_writes_special_character_column_with_spark_reads" column_name_with_special_character = "letter/abc" TEST_DATA_WITH_SPECIAL_CHARACTER_COLUMN = { - column_name_with_special_character: ['a', None, 'z'], - 'id': [1, 2, 3], - 'name': ['AB', 'CD', 'EF'], - 'address': [ - {'street': '123', 'city': 'SFO', 'zip': 12345, column_name_with_special_character: 'a'}, - {'street': '456', 'city': 'SW', 'zip': 67890, column_name_with_special_character: 'b'}, - {'street': '789', 'city': 'Random', 'zip': 10112, column_name_with_special_character: 'c'}, + column_name_with_special_character: ["a", None, "z"], + "id": [1, 2, 3], + "name": ["AB", "CD", "EF"], + "address": [ + {"street": "123", "city": "SFO", "zip": 12345, column_name_with_special_character: "a"}, + {"street": "456", "city": "SW", "zip": 67890, column_name_with_special_character: "b"}, + {"street": "789", "city": "Random", "zip": 10112, column_name_with_special_character: "c"}, ], } pa_schema = pa.schema([ pa.field(column_name_with_special_character, pa.string()), - pa.field('id', pa.int32()), - pa.field('name', pa.string()), + pa.field("id", pa.int32()), + pa.field("name", pa.string()), pa.field( - 'address', + "address", pa.struct([ - pa.field('street', pa.string()), - pa.field('city', pa.string()), - pa.field('zip', pa.int32()), + pa.field("street", pa.string()), + pa.field("city", pa.string()), + pa.field("zip", pa.int32()), pa.field(column_name_with_special_character, pa.string()), ]), ), @@ -322,12 +323,12 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads( ) -> None: identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads" TEST_DATA = { - 'id': [1, 2, 3, 1, 1], - 'name': ['AB', 'CD', 'EF', 'CD', 'EF'], + "id": [1, 2, 3, 1, 1], + "name": ["AB", "CD", 
"EF", "CD", "EF"], } pa_schema = pa.schema([ - pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)), - pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)), + pa.field("id", pa.dictionary(pa.int32(), pa.int32(), False)), + pa.field("name", pa.dictionary(pa.int32(), pa.string(), False)), ]) arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) @@ -339,6 +340,60 @@ def test_python_writes_dictionary_encoded_column_with_spark_reads( assert spark_df.equals(pyiceberg_df) +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_python_writes_with_small_and_large_types_spark_reads( + spark: SparkSession, session_catalog: Catalog, format_version: int +) -> None: + identifier = "default.python_writes_with_small_and_large_types_spark_reads" + TEST_DATA = { + "foo": ["a", None, "z"], + "id": [1, 2, 3], + "name": ["AB", "CD", "EF"], + "address": [ + {"street": "123", "city": "SFO", "zip": 12345, "bar": "a"}, + {"street": "456", "city": "SW", "zip": 67890, "bar": "b"}, + {"street": "789", "city": "Random", "zip": 10112, "bar": "c"}, + ], + } + pa_schema = pa.schema([ + pa.field("foo", pa.large_string()), + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field( + "address", + pa.struct([ + pa.field("street", pa.string()), + pa.field("city", pa.string()), + pa.field("zip", pa.int32()), + pa.field("bar", pa.large_string()), + ]), + ), + ]) + arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema) + tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=pa_schema) + + tbl.overwrite(arrow_table) + spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas() + pyiceberg_df = tbl.scan().to_pandas() + assert spark_df.equals(pyiceberg_df) + arrow_table_on_read = tbl.scan().to_arrow() + assert arrow_table_on_read.schema == pa.schema([ + pa.field("foo", pa.large_string()), + pa.field("id", pa.int32()), + pa.field("name", pa.large_string()), + pa.field( + "address", + pa.struct([ + pa.field("street", pa.large_string()), + pa.field("city", pa.large_string()), + pa.field("zip", pa.int32()), + pa.field("bar", pa.large_string()), + ]), + ), + ]) + + @pytest.mark.integration def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None: identifier = "default.write_bin_pack_data_files" @@ -473,7 +528,7 @@ def test_write_parquet_unsupported_properties( @pytest.mark.integration def test_invalid_arguments(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None: identifier = "default.arrow_data_files" - tbl = _create_table(session_catalog, identifier, {'format-version': '1'}, []) + tbl = _create_table(session_catalog, identifier, {"format-version": "1"}, []) with pytest.raises(ValueError, match="Expected PyArrow table, got: not a df"): tbl.overwrite("not a df") @@ -488,7 +543,7 @@ def test_summaries_with_only_nulls( ) -> None: identifier = "default.arrow_table_summaries_with_only_nulls" tbl = _create_table( - session_catalog, identifier, {'format-version': '1'}, [arrow_table_without_data, arrow_table_with_only_nulls] + session_catalog, identifier, {"format-version": "1"}, [arrow_table_without_data, arrow_table_with_only_nulls] ) tbl.overwrite(arrow_table_without_data) @@ -501,49 +556,49 @@ def test_summaries_with_only_nulls( ).collect() operations = [row.operation for row in rows] - assert operations == ['append', 'append', 'overwrite'] + assert operations == ["append", "append", "overwrite"] summaries = 
[row.summary for row in rows] assert summaries[0] == { - 'total-data-files': '0', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '0', - 'total-position-deletes': '0', - 'total-records': '0', + "total-data-files": "0", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "0", + "total-position-deletes": "0", + "total-records": "0", } assert summaries[1] == { - 'added-data-files': '1', - 'added-files-size': '4239', - 'added-records': '2', - 'total-data-files': '1', - 'total-delete-files': '0', - 'total-equality-deletes': '0', - 'total-files-size': '4239', - 'total-position-deletes': '0', - 'total-records': '2', + "added-data-files": "1", + "added-files-size": "4239", + "added-records": "2", + "total-data-files": "1", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "4239", + "total-position-deletes": "0", + "total-records": "2", } assert summaries[2] == { - 'removed-files-size': '4239', - 'total-equality-deletes': '0', - 'total-position-deletes': '0', - 'deleted-data-files': '1', - 'total-delete-files': '0', - 'total-files-size': '0', - 'deleted-records': '2', - 'total-data-files': '0', - 'total-records': '0', + "removed-files-size": "4239", + "total-equality-deletes": "0", + "total-position-deletes": "0", + "deleted-data-files": "1", + "total-delete-files": "0", + "total-files-size": "0", + "deleted-records": "2", + "total-data-files": "0", + "total-records": "0", } @pytest.mark.integration def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> None: - os.environ['TZ'] = 'Etc/UTC' + os.environ["TZ"] = "Etc/UTC" time.tzset() - tz = pytz.timezone(os.environ['TZ']) + tz = pytz.timezone(os.environ["TZ"]) catalog = SqlCatalog("test_sql_catalog", uri="sqlite:///:memory:", warehouse=f"/{warehouse}") catalog.create_namespace("default") @@ -554,7 +609,7 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> import duckdb - duckdb.sql('INSTALL iceberg; LOAD iceberg;') + duckdb.sql("INSTALL iceberg; LOAD iceberg;") result = duckdb.sql( f""" SELECT * @@ -565,8 +620,8 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> assert result == [ ( False, - 'a', - 'aaaaaaaaaaaaaaaaaaaaaa', + "a", + "aaaaaaaaaaaaaaaaaaaaaa", 1, 1, 0.0, @@ -574,14 +629,14 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> datetime(2023, 1, 1, 19, 25), datetime(2023, 1, 1, 19, 25, tzinfo=tz), date(2023, 1, 1), - b'\x01', - b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + b"\x01", + b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", ), (None, None, None, None, None, None, None, None, None, None, None, None), ( True, - 'z', - 'zzzzzzzzzzzzzzzzzzzzzz', + "z", + "zzzzzzzzzzzzzzzzzzzzzz", 9, 9, 0.8999999761581421, @@ -589,8 +644,8 @@ def test_duckdb_url_import(warehouse: Path, arrow_table_with_null: pa.Table) -> datetime(2023, 3, 1, 19, 25), datetime(2023, 3, 1, 19, 25, tzinfo=tz), date(2023, 3, 1), - b'\x12', - b'\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11', + b"\x12", + b"\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11", ), ] @@ -607,7 +662,7 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), ) @@ -618,8 +673,8 @@ def test_write_and_evolve(session_catalog: 
Catalog, format_version: int) -> None pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ pa.field("foo", pa.string(), nullable=True), @@ -637,31 +692,32 @@ def test_write_and_evolve(session_catalog: Catalog, format_version: int) -> None @pytest.mark.integration -@pytest.mark.parametrize("format_version", [2]) -def test_create_table_transaction(session_catalog: Catalog, format_version: int) -> None: - if format_version == 1: +@pytest.mark.parametrize("format_version", [1, 2]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_create_table_transaction(catalog: Catalog, format_version: int) -> None: + if format_version == 1 and isinstance(catalog, RestCatalog): pytest.skip( "There is a bug in the REST catalog (maybe server side) that prevents create and commit a staged version 1 table" ) - identifier = f"default.arrow_create_table_transaction{format_version}" + identifier = f"default.arrow_create_table_transaction_{catalog.name}_{format_version}" try: - session_catalog.drop_table(identifier=identifier) + catalog.drop_table(identifier=identifier) except NoSuchTableError: pass pa_table = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], + "foo": ["a", None, "z"], }, schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]), ) pa_table_with_column = pa.Table.from_pydict( { - 'foo': ['a', None, 'z'], - 'bar': [19, None, 25], + "foo": ["a", None, "z"], + "bar": [19, None, 25], }, schema=pa.schema([ pa.field("foo", pa.string(), nullable=True), @@ -669,7 +725,7 @@ def test_create_table_transaction(session_catalog: Catalog, format_version: int) ]), ) - with session_catalog.create_table_transaction( + with catalog.create_table_transaction( identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)} ) as txn: with txn.update_snapshot().fast_append() as snapshot_update: @@ -685,7 +741,7 @@ def test_create_table_transaction(session_catalog: Catalog, format_version: int) ): snapshot_update.append_data_file(data_file) - tbl = session_catalog.load_table(identifier=identifier) + tbl = catalog.load_table(identifier=identifier) assert tbl.format_version == format_version assert len(tbl.scan().to_arrow()) == 6 @@ -741,45 +797,45 @@ def test_inspect_snapshots( df = tbl.inspect.snapshots() assert df.column_names == [ - 'committed_at', - 'snapshot_id', - 'parent_id', - 'operation', - 'manifest_list', - 'summary', + "committed_at", + "snapshot_id", + "parent_id", + "operation", + "manifest_list", + "summary", ] - for committed_at in df['committed_at']: + for committed_at in df["committed_at"]: assert isinstance(committed_at.as_py(), datetime) - for snapshot_id in df['snapshot_id']: + for snapshot_id in df["snapshot_id"]: assert isinstance(snapshot_id.as_py(), int) - assert df['parent_id'][0].as_py() is None - assert df['parent_id'][1:] == df['snapshot_id'][:2] + assert df["parent_id"][0].as_py() is None + assert df["parent_id"][1:] == df["snapshot_id"][:2] - assert [operation.as_py() for operation in df['operation']] == ['append', 'overwrite', 'append'] + assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] - for manifest_list in df['manifest_list']: + for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") - assert df['summary'][0].as_py() == [ - ('added-files-size', '5459'), - 
('added-data-files', '1'), - ('added-records', '3'), - ('total-data-files', '1'), - ('total-delete-files', '0'), - ('total-records', '3'), - ('total-files-size', '5459'), - ('total-position-deletes', '0'), - ('total-equality-deletes', '0'), + assert df["summary"][0].as_py() == [ + ("added-files-size", "5459"), + ("added-data-files", "1"), + ("added-records", "3"), + ("total-data-files", "1"), + ("total-delete-files", "0"), + ("total-records", "3"), + ("total-files-size", "5459"), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), ] lhs = spark.table(f"{identifier}.snapshots").toPandas() rhs = df.to_pandas() for column in df.column_names: for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): - if column == 'summary': + if column == "summary": # Arrow returns a list of tuples, instead of a dict right = dict(right) @@ -838,7 +894,7 @@ def test_hive_catalog_storage_descriptor( @pytest.mark.integration -@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('session_catalog_hive'), pytest.lazy_fixture('session_catalog')]) +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_sanitize_character_partitioned(catalog: Catalog) -> None: table_name = "default.test_table_partitioned_sanitized_character" try: diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 90f5b08bf0..ecb946a98b 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -344,9 +344,9 @@ def test_deleting_hdfs_file_not_found() -> None: assert "Cannot delete file, does not exist:" in str(exc_info.value) -def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None: +def test_schema_to_pyarrow_schema_include_field_ids(table_schema_nested: Schema) -> None: actual = schema_to_pyarrow(table_schema_nested) - expected = """foo: string + expected = """foo: large_string -- field metadata -- PARQUET:field_id: '1' bar: int32 not null @@ -355,20 +355,20 @@ def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None: baz: bool -- field metadata -- PARQUET:field_id: '3' -qux: list not null - child 0, element: string not null +qux: large_list not null + child 0, element: large_string not null -- field metadata -- PARQUET:field_id: '5' -- field metadata -- PARQUET:field_id: '4' -quux: map> not null - child 0, entries: struct not null> not null - child 0, key: string not null +quux: map> not null + child 0, entries: struct not null> not null + child 0, key: large_string not null -- field metadata -- PARQUET:field_id: '7' - child 1, value: map not null - child 0, entries: struct not null - child 0, key: string not null + child 1, value: map not null + child 0, entries: struct not null + child 0, key: large_string not null -- field metadata -- PARQUET:field_id: '9' child 1, value: int32 not null @@ -378,7 +378,7 @@ def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None: PARQUET:field_id: '8' -- field metadata -- PARQUET:field_id: '6' -location: list not null> not null +location: large_list not null> not null child 0, element: struct not null child 0, latitude: float -- field metadata -- @@ -390,8 +390,8 @@ def test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None: PARQUET:field_id: '12' -- field metadata -- PARQUET:field_id: '11' -person: struct - child 0, name: string +person: struct + child 0, name: large_string -- field metadata -- PARQUET:field_id: '16' child 1, age: int32 not null @@ -402,6 +402,30 @@ def 
test_schema_to_pyarrow_schema(table_schema_nested: Schema) -> None: assert repr(actual) == expected +def test_schema_to_pyarrow_schema_exclude_field_ids(table_schema_nested: Schema) -> None: + actual = schema_to_pyarrow(table_schema_nested, include_field_ids=False) + expected = """foo: large_string +bar: int32 not null +baz: bool +qux: large_list not null + child 0, element: large_string not null +quux: map> not null + child 0, entries: struct not null> not null + child 0, key: large_string not null + child 1, value: map not null + child 0, entries: struct not null + child 0, key: large_string not null + child 1, value: int32 not null +location: large_list not null> not null + child 0, element: struct not null + child 0, latitude: float + child 1, longitude: float +person: struct + child 0, name: large_string + child 1, age: int32 not null""" + assert repr(actual) == expected + + def test_fixed_type_to_pyarrow() -> None: length = 22 iceberg_type = FixedType(length) @@ -462,7 +486,7 @@ def test_timestamptz_type_to_pyarrow() -> None: def test_string_type_to_pyarrow() -> None: iceberg_type = StringType() - assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.string() + assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.large_string() def test_binary_type_to_pyarrow() -> None: @@ -472,7 +496,7 @@ def test_binary_type_to_pyarrow() -> None: def test_struct_type_to_pyarrow(table_schema_simple: Schema) -> None: expected = pa.struct([ - pa.field("foo", pa.string(), nullable=True, metadata={"field_id": "1"}), + pa.field("foo", pa.large_string(), nullable=True, metadata={"field_id": "1"}), pa.field("bar", pa.int32(), nullable=False, metadata={"field_id": "2"}), pa.field("baz", pa.bool_(), nullable=True, metadata={"field_id": "3"}), ]) @@ -489,7 +513,7 @@ def test_map_type_to_pyarrow() -> None: ) assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.map_( pa.field("key", pa.int32(), nullable=False, metadata={"field_id": "1"}), - pa.field("value", pa.string(), nullable=False, metadata={"field_id": "2"}), + pa.field("value", pa.large_string(), nullable=False, metadata={"field_id": "2"}), ) @@ -499,7 +523,7 @@ def test_list_type_to_pyarrow() -> None: element_type=IntegerType(), element_required=True, ) - assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.list_( + assert visit(iceberg_map, _ConvertToArrowSchema()) == pa.large_list( pa.field("element", pa.int32(), nullable=False, metadata={"field_id": "1"}) ) @@ -582,11 +606,11 @@ def test_expr_less_than_or_equal_to_pyarrow(bound_reference: BoundReference[str] def test_expr_in_to_pyarrow(bound_reference: BoundReference[str]) -> None: assert repr(expression_to_pyarrow(BoundIn(bound_reference, {literal("hello"), literal("world")}))) in ( - """""", - """""", @@ -595,11 +619,11 @@ def test_expr_in_to_pyarrow(bound_reference: BoundReference[str]) -> None: def test_expr_not_in_to_pyarrow(bound_reference: BoundReference[str]) -> None: assert repr(expression_to_pyarrow(BoundNotIn(bound_reference, {literal("hello"), literal("world")}))) in ( - """""", - """""", @@ -943,25 +967,15 @@ def test_projection_add_column(file_int: str) -> None: assert ( repr(result_table.schema) == """id: int32 -list: list +list: large_list child 0, element: int32 - -- field metadata -- - PARQUET:field_id: '21' -map: map - child 0, entries: struct not null +map: map + child 0, entries: struct not null child 0, key: int32 not null - -- field metadata -- - PARQUET:field_id: '31' - child 1, value: string - -- field metadata -- - PARQUET:field_id: '32' + child 1, value: 
large_string location: struct child 0, lat: double - -- field metadata -- - PARQUET:field_id: '41' - child 1, lon: double - -- field metadata -- - PARQUET:field_id: '42'""" + child 1, lon: double""" ) @@ -974,7 +988,7 @@ def test_read_list(schema_list: Schema, file_list: str) -> None: assert ( repr(result_table.schema) - == """ids: list + == """ids: large_list child 0, element: int32""" ) @@ -988,10 +1002,10 @@ def test_read_map(schema_map: Schema, file_map: str) -> None: assert ( repr(result_table.schema) - == """properties: map - child 0, entries: struct not null - child 0, key: string not null - child 1, value: string not null""" + == """properties: map + child 0, entries: struct not null + child 0, key: large_string not null + child 1, value: large_string not null""" ) @@ -1011,14 +1025,10 @@ def test_projection_add_column_struct(schema_int: Schema, file_int: str) -> None assert r.as_py() is None assert ( repr(result_table.schema) - == """id: map - child 0, entries: struct not null + == """id: map + child 0, entries: struct not null child 0, key: int32 not null - -- field metadata -- - PARQUET:field_id: '3' - child 1, value: string - -- field metadata -- - PARQUET:field_id: '4'""" + child 1, value: large_string""" ) @@ -1062,12 +1072,7 @@ def test_projection_concat_files(schema_int: Schema, file_int: str) -> None: def test_projection_filter(schema_int: Schema, file_int: str) -> None: result_table = project(schema_int, [file_int], GreaterThan("id", 4)) assert len(result_table.columns[0]) == 0 - assert ( - repr(result_table.schema) - == """id: int32 - -- field metadata -- - PARQUET:field_id: '1'""" - ) + assert repr(result_table.schema) == """id: int32""" def test_projection_filter_renamed_column(file_int: str) -> None: @@ -1215,18 +1220,18 @@ def test_projection_list_of_structs(schema_list_of_structs: Schema, file_list_of results = [row.as_py() for row in result_table.columns[0]] assert results == [ [ - {'latitude': 52.371807, 'longitude': 4.896029, 'altitude': None}, - {'latitude': 52.387386, 'longitude': 4.646219, 'altitude': None}, + {"latitude": 52.371807, "longitude": 4.896029, "altitude": None}, + {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}, ], [], [ - {'latitude': 52.078663, 'longitude': 4.288788, 'altitude': None}, - {'latitude': 52.387386, 'longitude': 4.646219, 'altitude': None}, + {"latitude": 52.078663, "longitude": 4.288788, "altitude": None}, + {"latitude": 52.387386, "longitude": 4.646219, "altitude": None}, ], ] assert ( repr(result_table.schema) - == """locations: list> + == """locations: large_list> child 0, element: struct child 0, latitude: double not null child 1, longitude: double not null @@ -1274,9 +1279,9 @@ def test_projection_maps_of_structs(schema_map_of_structs: Schema, file_map_of_s assert actual.as_py() == expected assert ( repr(result_table.schema) - == """locations: map> - child 0, entries: struct not null> not null - child 0, key: string not null + == """locations: map> + child 0, entries: struct not null> not null + child 0, key: large_string not null child 1, value: struct not null child 0, latitude: double not null child 1, longitude: double not null @@ -1304,11 +1309,7 @@ def test_projection_nested_struct_different_parent_id(file_struct: str) -> None: repr(result_table.schema) == """location: struct child 0, lat: double - -- field metadata -- - PARQUET:field_id: '41' - child 1, long: double - -- field metadata -- - PARQUET:field_id: '42'""" + child 1, long: double""" ) @@ -1377,7 +1378,7 @@ def test_delete(deletes_file: str, 
example_task: FileScanTask, table_schema_simp assert ( str(with_deletes) == """pyarrow.Table -foo: string +foo: large_string bar: int32 not null baz: bool ---- @@ -1415,7 +1416,7 @@ def test_delete_duplicates(deletes_file: str, example_task: FileScanTask, table_ assert ( str(with_deletes) == """pyarrow.Table -foo: string +foo: large_string bar: int32 not null baz: bool ---- @@ -1446,7 +1447,7 @@ def test_pyarrow_wrap_fsspec(example_task: FileScanTask, table_schema_simple: Sc assert ( str(projection) == """pyarrow.Table -foo: string +foo: large_string bar: int32 not null baz: bool ---- diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 46ad331aa0..d3b6217c7b 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -25,6 +25,7 @@ _ConvertToIceberg, _ConvertToIcebergWithoutIDs, _HasIds, + _pyarrow_schema_ensure_large_types, pyarrow_to_schema, schema_to_pyarrow, visit_pyarrow, @@ -209,7 +210,7 @@ def test_pyarrow_timestamp_tz_invalid_tz() -> None: def test_pyarrow_string_to_iceberg() -> None: - pyarrow_type = pa.string() + pyarrow_type = pa.large_string() converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg()) assert converted_iceberg_type == StringType() assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pyarrow_type @@ -329,7 +330,7 @@ def test_round_schema_large_string() -> None: def test_simple_schema_has_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), ]) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) @@ -338,8 +339,8 @@ def test_simple_schema_has_missing_ids() -> None: def test_simple_schema_has_missing_ids_partial() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), - pa.field('bar', pa.int32(), nullable=False), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("bar", pa.int32(), nullable=False), ]) visitor = _HasIds() has_ids = visit_pyarrow(schema, visitor) @@ -348,9 +349,9 @@ def test_simple_schema_has_missing_ids_partial() -> None: def test_nested_schema_has_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -365,16 +366,16 @@ def test_nested_schema_has_missing_ids() -> None: def test_nested_schema_has_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), pa.field( - 'quux', + "quux", pa.map_( pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), pa.field( "value", pa.map_( - pa.field('key', pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), - pa.field('value', pa.int32(), metadata={"PARQUET:field_id": "10"}), + pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "9"}), + pa.field("value", pa.int32(), metadata={"PARQUET:field_id": "10"}), ), nullable=False, metadata={"PARQUET:field_id": "8"}, @@ -391,14 +392,14 @@ def test_nested_schema_has_ids() -> None: def test_nested_schema_has_partial_missing_ids() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": 
"foo doc"}), + pa.field("foo", pa.string(), nullable=False, metadata={"PARQUET:field_id": "1", "doc": "foo doc"}), pa.field( - 'quux', + "quux", pa.map_( pa.field("key", pa.string(), nullable=False, metadata={"PARQUET:field_id": "7"}), pa.field( "value", - pa.map_(pa.field('key', pa.string(), nullable=False), pa.field('value', pa.int32())), + pa.map_(pa.field("key", pa.string(), nullable=False), pa.field("value", pa.int32())), nullable=False, ), ), @@ -426,9 +427,9 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping( ) -> None: schema = pyarrow_schema_simple_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), ]) assert pyarrow_to_schema(schema, name_mapping) == iceberg_schema_simple @@ -439,7 +440,7 @@ def test_simple_pyarrow_schema_to_schema_missing_ids_using_name_mapping_partial_ ) -> None: schema = pyarrow_schema_simple_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), + MappedField(field_id=1, names=["foo"]), ]) with pytest.raises(ValueError) as exc_info: _ = pyarrow_to_schema(schema, name_mapping) @@ -452,45 +453,45 @@ def test_nested_pyarrow_schema_to_schema_missing_ids_using_name_mapping( schema = pyarrow_schema_nested_without_ids name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=16, names=['name']), - MappedField(field_id=17, names=['age']), + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), ], ), ]) @@ -500,9 +501,9 @@ def test_nested_pyarrow_schema_to_schema_missing_ids_using_name_mapping( def test_pyarrow_schema_to_schema_missing_ids_using_name_mapping_nested_missing_id() -> None: schema = pa.schema([ - pa.field('foo', pa.string(), nullable=False), + pa.field("foo", pa.string(), nullable=False), pa.field( - 'quux', + "quux", pa.map_( pa.string(), pa.map_(pa.string(), pa.int32()), @@ -512,17 +513,17 @@ def test_pyarrow_schema_to_schema_missing_ids_using_name_mapping_nested_missing_ ]) name_mapping = NameMapping([ - MappedField(field_id=1, names=['foo']), + MappedField(field_id=1, 
names=["foo"]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=10, names=['value']), + MappedField(field_id=10, names=["value"]), ], ), ], @@ -543,3 +544,39 @@ def test_pyarrow_schema_to_schema_fresh_ids_nested_schema( pyarrow_schema_nested_without_ids: pa.Schema, iceberg_schema_nested_no_ids: Schema ) -> None: assert visit_pyarrow(pyarrow_schema_nested_without_ids, _ConvertToIcebergWithoutIDs()) == iceberg_schema_nested_no_ids + + +def test_pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids: pa.Schema) -> None: + expected_schema = pa.schema([ + pa.field("foo", pa.large_string(), nullable=False), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("qux", pa.large_list(pa.large_string()), nullable=False), + pa.field( + "quux", + pa.map_( + pa.large_string(), + pa.map_(pa.large_string(), pa.int32()), + ), + nullable=False, + ), + pa.field( + "location", + pa.large_list( + pa.struct([ + pa.field("latitude", pa.float32(), nullable=False), + pa.field("longitude", pa.float32(), nullable=False), + ]), + ), + nullable=False, + ), + pa.field( + "person", + pa.struct([ + pa.field("name", pa.large_string(), nullable=True), + pa.field("age", pa.int32(), nullable=False), + ]), + nullable=True, + ), + ]) + assert _pyarrow_schema_ensure_large_types(pyarrow_schema_nested_without_ids) == expected_schema diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 2bc78f3197..6f8260fa56 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -76,6 +76,7 @@ Snapshot, SnapshotLogEntry, Summary, + ancestors_of, ) from pyiceberg.table.sorting import ( NullOrder, @@ -204,6 +205,57 @@ def test_snapshot_by_id(table_v2: Table) -> None: ) +def test_snapshot_by_timestamp(table_v2: Table) -> None: + assert table_v2.snapshot_as_of_timestamp(1515100955770) == Snapshot( + snapshot_id=3051729675574597004, + parent_snapshot_id=None, + sequence_number=0, + timestamp_ms=1515100955770, + manifest_list="s3://a/b/1.avro", + summary=Summary(Operation.APPEND), + schema_id=None, + ) + assert table_v2.snapshot_as_of_timestamp(1515100955770, inclusive=False) is None + + +def test_ancestors_of(table_v2: Table) -> None: + assert list(ancestors_of(table_v2.current_snapshot(), table_v2.metadata)) == [ + Snapshot( + snapshot_id=3055729675574597004, + parent_snapshot_id=3051729675574597004, + sequence_number=1, + timestamp_ms=1555100955770, + manifest_list="s3://a/b/2.avro", + summary=Summary(Operation.APPEND), + schema_id=1, + ), + Snapshot( + snapshot_id=3051729675574597004, + parent_snapshot_id=None, + sequence_number=0, + timestamp_ms=1515100955770, + manifest_list="s3://a/b/1.avro", + summary=Summary(Operation.APPEND), + schema_id=None, + ), + ] + + +def test_ancestors_of_recursive_error(table_v2_with_extensive_snapshots: Table) -> None: + # Test RecursionError: maximum recursion depth exceeded + assert ( + len( + list( + ancestors_of( + table_v2_with_extensive_snapshots.current_snapshot(), + table_v2_with_extensive_snapshots.metadata, + ) + ) + ) + == 2000 + ) + + def test_snapshot_by_id_does_not_exist(table_v2: Table) -> None: assert table_v2.snapshot_by_id(-1) is None @@ -652,6 +704,30 @@ def test_update_metadata_add_snapshot(table_v2: Table) -> None: assert new_metadata.last_updated_ms == new_snapshot.timestamp_ms +def 
test_update_metadata_set_ref_snapshot(table_v2: Table) -> None: + update, _ = table_v2.transaction()._set_ref_snapshot( + snapshot_id=3051729675574597004, + ref_name="main", + type="branch", + max_ref_age_ms=123123123, + max_snapshot_age_ms=12312312312, + min_snapshots_to_keep=1, + ) + + new_metadata = update_table_metadata(table_v2.metadata, update) + assert len(new_metadata.snapshot_log) == 3 + assert new_metadata.snapshot_log[2].snapshot_id == 3051729675574597004 + assert new_metadata.current_snapshot_id == 3051729675574597004 + assert new_metadata.last_updated_ms > table_v2.metadata.last_updated_ms + assert new_metadata.refs["main"] == SnapshotRef( + snapshot_id=3051729675574597004, + snapshot_ref_type="branch", + min_snapshots_to_keep=1, + max_snapshot_age_ms=12312312312, + max_ref_age_ms=123123123, + ) + + def test_update_metadata_set_snapshot_ref(table_v2: Table) -> None: update = SetSnapshotRefUpdate( ref_name="main", @@ -995,9 +1071,9 @@ def test_correct_schema() -> None: # Should use the current schema, instead the one from the snapshot projection_schema = t.scan().projection() assert projection_schema == Schema( - NestedField(field_id=1, name='x', field_type=LongType(), required=True), - NestedField(field_id=2, name='y', field_type=LongType(), required=True), - NestedField(field_id=3, name='z', field_type=LongType(), required=True), + NestedField(field_id=1, name="x", field_type=LongType(), required=True), + NestedField(field_id=2, name="y", field_type=LongType(), required=True), + NestedField(field_id=3, name="z", field_type=LongType(), required=True), identifier_field_ids=[1, 2], ) assert projection_schema.schema_id == 1 @@ -1005,7 +1081,7 @@ def test_correct_schema() -> None: # When we explicitly filter on the commit, we want to have the schema that's linked to the snapshot projection_schema = t.scan(snapshot_id=123).projection() assert projection_schema == Schema( - NestedField(field_id=1, name='x', field_type=LongType(), required=True), + NestedField(field_id=1, name="x", field_type=LongType(), required=True), identifier_field_ids=[], ) assert projection_schema.schema_id == 0 @@ -1138,8 +1214,8 @@ def test_table_properties_raise_for_none_value(example_table_metadata_v2: Dict[s def test_serialize_commit_table_request() -> None: request = CommitTableRequest( - requirements=(AssertTableUUID(uuid='4bfd18a3-74c6-478e-98b1-71c4c32f4163'),), - identifier=TableIdentifier(namespace=['a'], name='b'), + requirements=(AssertTableUUID(uuid="4bfd18a3-74c6-478e-98b1-71c4c32f4163"),), + identifier=TableIdentifier(namespace=["a"], name="b"), ) deserialized_request = CommitTableRequest.model_validate_json(request.model_dump_json()) @@ -1149,17 +1225,17 @@ def test_serialize_commit_table_request() -> None: def test_partition_for_demo() -> None: import pyarrow as pa - test_pa_schema = pa.schema([('year', pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_pa_schema = pa.schema([("year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) test_schema = Schema( - NestedField(field_id=1, name='year', field_type=StringType(), required=False), - NestedField(field_id=2, name='n_legs', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='animal', field_type=StringType(), required=False), + NestedField(field_id=1, name="year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), schema_id=1, ) 
test_data = { - 'year': [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], - 'n_legs': [2, 2, 2, 4, 4, 4, 4, 5, 100], - 'animal': ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], + "year": [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], + "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100], + "animal": ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], } arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) partition_spec = PartitionSpec( @@ -1183,11 +1259,11 @@ def test_partition_for_demo() -> None: def test_identity_partition_on_multi_columns() -> None: import pyarrow as pa - test_pa_schema = pa.schema([('born_year', pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) test_schema = Schema( - NestedField(field_id=1, name='born_year', field_type=StringType(), required=False), - NestedField(field_id=2, name='n_legs', field_type=IntegerType(), required=True), - NestedField(field_id=3, name='animal', field_type=StringType(), required=False), + NestedField(field_id=1, name="born_year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), schema_id=1, ) # 5 partitions, 6 unique row values, 12 rows @@ -1210,9 +1286,9 @@ def test_identity_partition_on_multi_columns() -> None: for _ in range(1000): random.shuffle(test_rows) test_data = { - 'born_year': [row[0] for row in test_rows], - 'n_legs': [row[1] for row in test_rows], - 'animal': [row[2] for row in test_rows], + "born_year": [row[0] for row in test_rows], + "n_legs": [row[1] for row in test_rows], + "animal": [row[2] for row in test_rows], } arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) @@ -1222,7 +1298,7 @@ def test_identity_partition_on_multi_columns() -> None: concatenated_arrow_table = pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]) assert concatenated_arrow_table.num_rows == arrow_table.num_rows assert concatenated_arrow_table.sort_by([ - ('born_year', 'ascending'), - ('n_legs', 'ascending'), - ('animal', 'ascending'), - ]) == arrow_table.sort_by([('born_year', 'ascending'), ('n_legs', 'ascending'), ('animal', 'ascending')]) + ("born_year", "ascending"), + ("n_legs", "ascending"), + ("animal", "ascending"), + ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) diff --git a/tests/table/test_metadata.py b/tests/table/test_metadata.py index b4e30a6b84..0e2b91f24b 100644 --- a/tests/table/test_metadata.py +++ b/tests/table/test_metadata.py @@ -220,7 +220,7 @@ def test_new_table_metadata_with_explicit_v1_format() -> None: partition_spec=partition_spec, sort_order=sort_order, location="s3://some_v1_location/", - properties={'format-version': "1"}, + properties={"format-version": "1"}, ) expected_schema = Schema( diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index e039415ce3..d4a2bf6c41 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ -30,45 +30,45 @@ @pytest.fixture(scope="session") def table_name_mapping_nested() -> NameMapping: return NameMapping([ - MappedField(field_id=1, names=['foo']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - 
MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=16, names=['name']), - MappedField(field_id=17, names=['age']), + MappedField(field_id=16, names=["name"]), + MappedField(field_id=17, names=["age"]), ], ), ]) @@ -80,7 +80,7 @@ def test_json_mapped_field_deserialization() -> None: "names": ["id", "record_id"] } """ - assert MappedField(field_id=1, names=['id', 'record_id']) == MappedField.model_validate_json(mapped_field) + assert MappedField(field_id=1, names=["id", "record_id"]) == MappedField.model_validate_json(mapped_field) mapped_field_with_null_fields = """{ "field-id": 1, @@ -88,7 +88,7 @@ def test_json_mapped_field_deserialization() -> None: "fields": null } """ - assert MappedField(field_id=1, names=['id', 'record_id']) == MappedField.model_validate_json(mapped_field_with_null_fields) + assert MappedField(field_id=1, names=["id", "record_id"]) == MappedField.model_validate_json(mapped_field_with_null_fields) def test_json_name_mapping_deserialization() -> None: @@ -133,14 +133,14 @@ def test_json_name_mapping_deserialization() -> None: """ assert parse_mapping_from_json(name_mapping) == NameMapping([ - MappedField(field_id=1, names=['id', 'record_id']), - MappedField(field_id=2, names=['data']), + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), MappedField( - names=['location'], + names=["location"], field_id=3, fields=[ - MappedField(field_id=4, names=['latitude', 'lat']), - MappedField(field_id=5, names=['longitude', 'long']), + MappedField(field_id=4, names=["latitude", "lat"]), + MappedField(field_id=5, names=["longitude", "long"]), ], ), ]) @@ -155,14 +155,14 @@ def test_json_serialization(table_name_mapping_nested: NameMapping) -> None: def test_name_mapping_to_string() -> None: nm = NameMapping([ - MappedField(field_id=1, names=['id', 'record_id']), - MappedField(field_id=2, names=['data']), + MappedField(field_id=1, names=["id", "record_id"]), + MappedField(field_id=2, names=["data"]), MappedField( - names=['location'], + names=["location"], field_id=3, fields=[ - MappedField(field_id=4, names=['lat', 'latitude']), - MappedField(field_id=5, names=['long', 'longitude']), + MappedField(field_id=4, names=["lat", "latitude"]), + MappedField(field_id=5, names=["long", "longitude"]), ], ), ]) @@ -184,64 +184,64 @@ def test_mapping_from_schema(table_schema_nested: Schema, table_name_mapping_nes def 
test_mapping_by_name(table_name_mapping_nested: NameMapping) -> None: assert table_name_mapping_nested._field_by_name == { - 'person.age': MappedField(field_id=17, names=['age']), - 'person.name': MappedField(field_id=16, names=['name']), - 'person': MappedField( + "person.age": MappedField(field_id=17, names=["age"]), + "person.name": MappedField(field_id=16, names=["name"]), + "person": MappedField( field_id=15, - names=['person'], - fields=[MappedField(field_id=16, names=['name']), MappedField(field_id=17, names=['age'])], + names=["person"], + fields=[MappedField(field_id=16, names=["name"]), MappedField(field_id=17, names=["age"])], ), - 'location.element.longitude': MappedField(field_id=14, names=['longitude']), - 'location.element.latitude': MappedField(field_id=13, names=['latitude']), - 'location.element': MappedField( + "location.element.longitude": MappedField(field_id=14, names=["longitude"]), + "location.element.latitude": MappedField(field_id=13, names=["latitude"]), + "location.element": MappedField( field_id=12, - names=['element'], - fields=[MappedField(field_id=13, names=['latitude']), MappedField(field_id=14, names=['longitude'])], + names=["element"], + fields=[MappedField(field_id=13, names=["latitude"]), MappedField(field_id=14, names=["longitude"])], ), - 'location': MappedField( + "location": MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], - fields=[MappedField(field_id=13, names=['latitude']), MappedField(field_id=14, names=['longitude'])], + names=["element"], + fields=[MappedField(field_id=13, names=["latitude"]), MappedField(field_id=14, names=["longitude"])], ) ], ), - 'quux.value.value': MappedField(field_id=10, names=['value']), - 'quux.value.key': MappedField(field_id=9, names=['key']), - 'quux.value': MappedField( + "quux.value.value": MappedField(field_id=10, names=["value"]), + "quux.value.key": MappedField(field_id=9, names=["key"]), + "quux.value": MappedField( field_id=8, - names=['value'], - fields=[MappedField(field_id=9, names=['key']), MappedField(field_id=10, names=['value'])], + names=["value"], + fields=[MappedField(field_id=9, names=["key"]), MappedField(field_id=10, names=["value"])], ), - 'quux.key': MappedField(field_id=7, names=['key']), - 'quux': MappedField( + "quux.key": MappedField(field_id=7, names=["key"]), + "quux": MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], - fields=[MappedField(field_id=9, names=['key']), MappedField(field_id=10, names=['value'])], + names=["value"], + fields=[MappedField(field_id=9, names=["key"]), MappedField(field_id=10, names=["value"])], ), ], ), - 'qux.element': MappedField(field_id=5, names=['element']), - 'qux': MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), - 'baz': MappedField(field_id=3, names=['baz']), - 'bar': MappedField(field_id=2, names=['bar']), - 'foo': MappedField(field_id=1, names=['foo']), + "qux.element": MappedField(field_id=5, names=["element"]), + "qux": MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), + "baz": MappedField(field_id=3, names=["baz"]), + "bar": MappedField(field_id=2, names=["bar"]), + "foo": MappedField(field_id=1, names=["foo"]), } def test_mapping_lookup_by_name(table_name_mapping_nested: NameMapping) -> None: - assert table_name_mapping_nested.find("foo") 
== MappedField(field_id=1, names=['foo']) - assert table_name_mapping_nested.find("location.element.latitude") == MappedField(field_id=13, names=['latitude']) - assert table_name_mapping_nested.find("location", "element", "latitude") == MappedField(field_id=13, names=['latitude']) - assert table_name_mapping_nested.find(*["location", "element", "latitude"]) == MappedField(field_id=13, names=['latitude']) + assert table_name_mapping_nested.find("foo") == MappedField(field_id=1, names=["foo"]) + assert table_name_mapping_nested.find("location.element.latitude") == MappedField(field_id=13, names=["latitude"]) + assert table_name_mapping_nested.find("location", "element", "latitude") == MappedField(field_id=13, names=["latitude"]) + assert table_name_mapping_nested.find(*["location", "element", "latitude"]) == MappedField(field_id=13, names=["latitude"]) with pytest.raises(ValueError, match="Could not find field with name: boom"): table_name_mapping_nested.find("boom") @@ -264,48 +264,48 @@ def test_update_mapping(table_name_mapping_nested: NameMapping) -> None: } expected = NameMapping([ - MappedField(field_id=1, names=['foo', 'foo_update']), - MappedField(field_id=2, names=['bar']), - MappedField(field_id=3, names=['baz']), - MappedField(field_id=4, names=['qux'], fields=[MappedField(field_id=5, names=['element'])]), + MappedField(field_id=1, names=["foo", "foo_update"]), + MappedField(field_id=2, names=["bar"]), + MappedField(field_id=3, names=["baz"]), + MappedField(field_id=4, names=["qux"], fields=[MappedField(field_id=5, names=["element"])]), MappedField( field_id=6, - names=['quux'], + names=["quux"], fields=[ - MappedField(field_id=7, names=['key']), + MappedField(field_id=7, names=["key"]), MappedField( field_id=8, - names=['value'], + names=["value"], fields=[ - MappedField(field_id=9, names=['key']), - MappedField(field_id=10, names=['value']), + MappedField(field_id=9, names=["key"]), + MappedField(field_id=10, names=["value"]), ], ), ], ), MappedField( field_id=11, - names=['location'], + names=["location"], fields=[ MappedField( field_id=12, - names=['element'], + names=["element"], fields=[ - MappedField(field_id=13, names=['latitude']), - MappedField(field_id=14, names=['longitude']), + MappedField(field_id=13, names=["latitude"]), + MappedField(field_id=14, names=["longitude"]), ], ) ], ), MappedField( field_id=15, - names=['person'], + names=["person"], fields=[ - MappedField(field_id=17, names=['age']), - MappedField(field_id=19, names=['name']), - MappedField(field_id=20, names=['add_20']), + MappedField(field_id=17, names=["age"]), + MappedField(field_id=19, names=["name"]), + MappedField(field_id=20, names=["add_20"]), ], ), - MappedField(field_id=18, names=['add_18']), + MappedField(field_id=18, names=["add_18"]), ]) assert update_mapping(table_name_mapping_nested, updates, adds) == expected diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index e85ecce506..fa3464052a 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -77,6 +77,7 @@ def test_serialize_snapshot_without_sequence_number() -> None: snapshot = Snapshot( snapshot_id=25, parent_snapshot_id=19, + sequence_number=None, timestamp_ms=1602638573590, manifest_list="s3:/a/b/c.avro", summary=Summary(Operation.APPEND), @@ -156,9 +157,9 @@ def test_snapshot_summary_collector(table_schema_simple: Schema) -> None: ssc.add_file(data_file, schema=table_schema_simple) assert ssc.build() == { - 'added-data-files': '1', - 'added-files-size': '1234', - 'added-records': 
'100', + "added-data-files": "1", + "added-files-size": "1234", + "added-records": "100", } @@ -174,7 +175,7 @@ def test_snapshot_summary_collector_with_partition() -> None: NestedField(field_id=2, name="string_field", field_type=StringType(), required=False), NestedField(field_id=3, name="int_field", field_type=IntegerType(), required=False), ) - spec = PartitionSpec(PartitionField(source_id=3, field_id=1001, transform=IdentityTransform(), name='int_field')) + spec = PartitionSpec(PartitionField(source_id=3, field_id=1001, transform=IdentityTransform(), name="int_field")) data_file_1 = DataFile(content=DataFileContent.DATA, record_count=100, file_size_in_bytes=1234, partition=Record(int_field=1)) data_file_2 = DataFile(content=DataFileContent.DATA, record_count=200, file_size_in_bytes=4321, partition=Record(int_field=2)) # When @@ -184,13 +185,13 @@ def test_snapshot_summary_collector_with_partition() -> None: # Then assert ssc.build() == { - 'added-files-size': '1234', - 'removed-files-size': '5555', - 'added-data-files': '1', - 'deleted-data-files': '2', - 'added-records': '100', - 'deleted-records': '300', - 'changed-partition-count': '2', + "added-files-size": "1234", + "removed-files-size": "5555", + "added-data-files": "1", + "deleted-data-files": "2", + "added-records": "100", + "deleted-records": "300", + "changed-partition-count": "2", } # When @@ -198,15 +199,15 @@ def test_snapshot_summary_collector_with_partition() -> None: # Then assert ssc.build() == { - 'added-files-size': '1234', - 'removed-files-size': '5555', - 'added-data-files': '1', - 'deleted-data-files': '2', - 'added-records': '100', - 'deleted-records': '300', - 'changed-partition-count': '2', - 'partitions.int_field=1': 'added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100', - 'partitions.int_field=2': 'removed-files-size=4321,deleted-data-files=1,deleted-records=200', + "added-files-size": "1234", + "removed-files-size": "5555", + "added-data-files": "1", + "deleted-data-files": "2", + "added-records": "100", + "deleted-records": "300", + "changed-partition-count": "2", + "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", + "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } @@ -214,12 +215,12 @@ def test_merge_snapshot_summaries_empty() -> None: assert update_snapshot_summaries(Summary(Operation.APPEND)) == Summary( operation=Operation.APPEND, **{ - 'total-data-files': '0', - 'total-delete-files': '0', - 'total-records': '0', - 'total-files-size': '0', - 'total-position-deletes': '0', - 'total-equality-deletes': '0', + "total-data-files": "0", + "total-delete-files": "0", + "total-records": "0", + "total-files-size": "0", + "total-position-deletes": "0", + "total-equality-deletes": "0", }, ) @@ -229,12 +230,12 @@ def test_merge_snapshot_summaries_new_summary() -> None: summary=Summary( operation=Operation.APPEND, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ) ) @@ -242,18 +243,18 @@ def test_merge_snapshot_summaries_new_summary() -> None: expected = Summary( operation=Operation.APPEND, **{ - 
'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', - 'total-data-files': '1', - 'total-delete-files': '2', - 'total-records': '6', - 'total-files-size': '4', - 'total-position-deletes': '5', - 'total-equality-deletes': '3', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", + "total-data-files": "1", + "total-delete-files": "2", + "total-records": "6", + "total-files-size": "4", + "total-position-deletes": "5", + "total-equality-deletes": "3", }, ) @@ -265,44 +266,44 @@ def test_merge_snapshot_summaries_overwrite_summary() -> None: summary=Summary( operation=Operation.OVERWRITE, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ), previous_summary={ - 'total-data-files': '1', - 'total-delete-files': '1', - 'total-equality-deletes': '1', - 'total-files-size': '1', - 'total-position-deletes': '1', - 'total-records': '1', + "total-data-files": "1", + "total-delete-files": "1", + "total-equality-deletes": "1", + "total-files-size": "1", + "total-position-deletes": "1", + "total-records": "1", }, truncate_full_table=True, ) expected = { - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', - 'total-data-files': '1', - 'total-records': '6', - 'total-delete-files': '2', - 'total-equality-deletes': '3', - 'total-files-size': '4', - 'total-position-deletes': '5', - 'deleted-data-files': '1', - 'removed-delete-files': '1', - 'deleted-records': '1', - 'removed-files-size': '1', - 'removed-position-deletes': '1', - 'removed-equality-deletes': '1', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", + "total-data-files": "1", + "total-records": "6", + "total-delete-files": "2", + "total-equality-deletes": "3", + "total-files-size": "4", + "total-position-deletes": "5", + "deleted-data-files": "1", + "removed-delete-files": "1", + "deleted-records": "1", + "removed-files-size": "1", + "removed-position-deletes": "1", + "removed-equality-deletes": "1", } assert actual.additional_properties == expected @@ -324,15 +325,15 @@ def test_invalid_type() -> None: summary=Summary( operation=Operation.OVERWRITE, **{ - 'added-data-files': '1', - 'added-delete-files': '2', - 'added-equality-deletes': '3', - 'added-files-size': '4', - 'added-position-deletes': '5', - 'added-records': '6', + "added-data-files": "1", + "added-delete-files": "2", + "added-equality-deletes": "3", + "added-files-size": "4", + "added-position-deletes": "5", + "added-records": "6", }, ), - previous_summary={'total-data-files': 'abc'}, # should be a number + previous_summary={"total-data-files": "abc"}, # should be a number truncate_full_table=True, ) diff --git a/tests/test_schema.py b/tests/test_schema.py index 96109ce9c2..23b42ef49e 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1610,7 +1610,7 @@ def test_arrow_schema() -> None: ) expected_schema = 
pa.schema([ - pa.field("foo", pa.string(), nullable=False), + pa.field("foo", pa.large_string(), nullable=False), pa.field("bar", pa.int32(), nullable=True), pa.field("baz", pa.bool_(), nullable=True), ]) diff --git a/tests/test_serializers.py b/tests/test_serializers.py index 140db02700..ad40ea08e0 100644 --- a/tests/test_serializers.py +++ b/tests/test_serializers.py @@ -44,7 +44,7 @@ def test_legacy_current_snapshot_id( ToOutputFile.table_metadata(metadata, PyArrowFileIO().new_output(location=metadata_location), overwrite=True) with PyArrowFileIO().new_input(location=metadata_location).open() as input_stream: metadata_json_bytes = input_stream.read() - assert json.loads(metadata_json_bytes)['current-snapshot-id'] == -1 + assert json.loads(metadata_json_bytes)["current-snapshot-id"] == -1 backwards_compatible_static_table = StaticTable.from_metadata(metadata_location) assert backwards_compatible_static_table.metadata.current_snapshot_id is None assert backwards_compatible_static_table.metadata == static_table.metadata diff --git a/tests/test_transforms.py b/tests/test_transforms.py index 4dc3d9819f..3a9ffd6009 100644 --- a/tests/test_transforms.py +++ b/tests/test_transforms.py @@ -17,7 +17,7 @@ # pylint: disable=eval-used,protected-access,redefined-outer-name from datetime import date from decimal import Decimal -from typing import Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable, Optional from uuid import UUID import mmh3 as mmh3 @@ -69,6 +69,7 @@ TimestampLiteral, literal, ) +from pyiceberg.partitioning import _to_partition_representation from pyiceberg.schema import Accessor from pyiceberg.transforms import ( BucketTransform, @@ -111,6 +112,9 @@ timestamptz_to_micros, ) +if TYPE_CHECKING: + import pyarrow as pa + @pytest.mark.parametrize( "test_input,test_type,expected", @@ -1550,7 +1554,7 @@ def test_strict_bucket_bytes(bound_reference_binary: BoundReference[int]) -> Non def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: - value = literal(UUID('12345678123456781234567812345678')) + value = literal(UUID("12345678123456781234567812345678")) transform: Transform[Any, int] = BucketTransform(num_buckets=10) _test_projection( lhs=transform.strict_project(name="name", pred=BoundNotEqualTo(term=bound_reference_uuid, literal=value)), @@ -1575,14 +1579,14 @@ def test_strict_bucket_uuid(bound_reference_uuid: BoundReference[int]) -> None: _test_projection( lhs=transform.strict_project( name="name", - pred=BoundNotIn(term=bound_reference_uuid, literals={value, literal(UUID('12345678123456781234567812345679'))}), + pred=BoundNotIn(term=bound_reference_uuid, literals={value, literal(UUID("12345678123456781234567812345679"))}), ), rhs=NotIn(term=Reference("name"), literals={1, 4}), ) _test_projection( lhs=transform.strict_project( name="name", - pred=BoundIn(term=bound_reference_uuid, literals={value, literal(UUID('12345678123456781234567812345679'))}), + pred=BoundIn(term=bound_reference_uuid, literals={value, literal(UUID("12345678123456781234567812345679"))}), ), rhs=None, ) @@ -1808,3 +1812,31 @@ def test_strict_binary(bound_reference_binary: BoundReference[str]) -> None: _test_projection( lhs=transform.strict_project(name="name", pred=BoundIn(term=bound_reference_binary, literals=set_of_literals)), rhs=None ) + + +@pytest.mark.parametrize( + "transform", + [ + pytest.param(YearTransform(), id="year_transform"), + pytest.param(MonthTransform(), id="month_transform"), + pytest.param(DayTransform(), id="day_transform"), + 
pytest.param(HourTransform(), id="hour_transform"), + ], +) +@pytest.mark.parametrize( + "source_col, source_type", [("date", DateType()), ("timestamp", TimestampType()), ("timestamptz", TimestamptzType())] +) +def test_ymd_pyarrow_transforms( + arrow_table_date_timestamps: "pa.Table", + source_col: str, + source_type: PrimitiveType, + transform: Transform[Any, Any], +) -> None: + if transform.can_transform(source_type): + assert transform.pyarrow_transform(source_type)(arrow_table_date_timestamps[source_col]).to_pylist() == [ + transform.transform(source_type)(_to_partition_representation(source_type, v)) + for v in arrow_table_date_timestamps[source_col].to_pylist() + ] + else: + with pytest.raises(ValueError): + transform.pyarrow_transform(DateType())(arrow_table_date_timestamps[source_col]) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 2f15bb56d8..066e7d7cc0 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -50,8 +50,8 @@ def test_from_environment_variables_uppercase() -> None: ) def test_fix_nested_objects_from_environment_variables() -> None: assert Config().get_catalog_config("PRODUCTION") == { - 's3.region': 'eu-north-1', - 's3.access-key-id': 'username', + "s3.region": "eu-north-1", + "s3.access-key-id": "username", } diff --git a/tests/utils/test_decimal.py b/tests/utils/test_decimal.py index 419cf05916..3e67bf691a 100644 --- a/tests/utils/test_decimal.py +++ b/tests/utils/test_decimal.py @@ -45,5 +45,5 @@ def test_decimal_required_bytes() -> None: def test_decimal_to_bytes() -> None: # Check the boundary between 2 and 3 bytes. # 2 bytes has a minimum of -32,768 and a maximum value of 32,767 (inclusive). - assert decimal_to_bytes(Decimal('32767.')) == b'\x7f\xff' - assert decimal_to_bytes(Decimal('32768.')) == b'\x00\x80\x00' + assert decimal_to_bytes(Decimal("32767.")) == b"\x7f\xff" + assert decimal_to_bytes(Decimal("32768.")) == b"\x00\x80\x00" diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 8bb03cd80e..a812b384fc 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -348,8 +348,8 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": test_spec.model_dump_json(), - "partition-spec-id": str(test_spec.spec_id), + "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""", + "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } _verify_metadata_with_fastavro(