diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 3413847303e0..8d0d79f2ee13 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -4178,7 +4178,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-regression-tester - commit: c07440a1ad14ffc5fc49ce90dff2f40c2e5f364d + commit: 00a50b5b8f12c9c603b9a3fa17dd2c5ea2012cac arch: release build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 @@ -4190,7 +4190,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-regression-tester-aarch64 - commit: c07440a1ad14ffc5fc49ce90dff2f40c2e5f364d + commit: 00a50b5b8f12c9c603b9a3fa17dd2c5ea2012cac arch: aarch64 build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 2add6cd67783..ff2904b1d497 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -4134,7 +4134,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-regression-tester - commit: c07440a1ad14ffc5fc49ce90dff2f40c2e5f364d + commit: 00a50b5b8f12c9c603b9a3fa17dd2c5ea2012cac arch: release build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 @@ -4146,7 +4146,7 @@ jobs: secrets: inherit with: runner_type: altinity-on-demand, altinity-regression-tester-aarch64 - commit: c07440a1ad14ffc5fc49ce90dff2f40c2e5f364d + commit: 00a50b5b8f12c9c603b9a3fa17dd2c5ea2012cac arch: aarch64 build_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout_minutes: 300 diff --git a/.gitmodules b/.gitmodules index 1c7304defc3c..124fb169e411 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,7 +6,7 @@ url = https://github.com/Thalhammer/jwt-cpp [submodule "contrib/zstd"] path = contrib/zstd - url = https://github.com/facebook/zstd + url = https://github.com/ClickHouse/zstd.git [submodule "contrib/lz4"] path = contrib/lz4 url = https://github.com/lz4/lz4 @@ -45,7 +45,7 @@ url = https://github.com/ClickHouse/arrow [submodule "contrib/thrift"] path = contrib/thrift - url = https://github.com/apache/thrift + url = https://github.com/ClickHouse/thrift.git [submodule "contrib/libhdfs3"] path = contrib/libhdfs3 url = https://github.com/ClickHouse/libhdfs3 diff --git a/03631_hive_columns_not_in_format_header.reference b/03631_hive_columns_not_in_format_header.reference new file mode 100644 index 000000000000..231eebbbb627 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.reference @@ -0,0 +1,2 @@ +1 +raw_blob String diff --git a/03631_hive_columns_not_in_format_header.sql b/03631_hive_columns_not_in_format_header.sql new file mode 100644 index 000000000000..895f7aa4dfc0 --- /dev/null +++ b/03631_hive_columns_not_in_format_header.sql @@ -0,0 +1,13 @@ +-- Tags: no-parallel, no-fasttest, no-random-settings + +INSERT INTO FUNCTION s3( + s3_conn, + filename='03631', + format=Parquet, + partition_strategy='hive', + partition_columns_in_data_file=1) PARTITION BY (year, country) SELECT 'Brazil' as country, 2025 as year, 1 as id; + +-- distinct because minio isn't cleaned up +SELECT count(distinct year) FROM s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS use_hive_partitioning=1; + +DESCRIBE s3(s3_conn, filename='03631/**.parquet', format=RawBLOB) SETTINGS 
use_hive_partitioning=1; diff --git a/ci/praktika/yaml_additional_templates.py b/ci/praktika/yaml_additional_templates.py index 5aac6f1ca8ed..a31ba2b7ea75 100644 --- a/ci/praktika/yaml_additional_templates.py +++ b/ci/praktika/yaml_additional_templates.py @@ -35,7 +35,7 @@ class AltinityWorkflowTemplates: echo "Workflow Run Report: [View Report]($REPORT_LINK)" >> $GITHUB_STEP_SUMMARY """ # Additional jobs - REGRESSION_HASH = "c07440a1ad14ffc5fc49ce90dff2f40c2e5f364d" + REGRESSION_HASH = "00a50b5b8f12c9c603b9a3fa17dd2c5ea2012cac" ALTINITY_JOBS = { "GrypeScan": r""" GrypeScanServer: diff --git a/docs/en/engines/table-engines/mergetree-family/part_export.md b/docs/en/engines/table-engines/mergetree-family/part_export.md new file mode 100644 index 000000000000..0a580726b5dd --- /dev/null +++ b/docs/en/engines/table-engines/mergetree-family/part_export.md @@ -0,0 +1,160 @@ +# ALTER TABLE EXPORT PART + +## Overview + +The `ALTER TABLE EXPORT PART` command exports individual MergeTree data parts to object storage (S3, Azure Blob Storage, etc.), typically in Parquet format. + +**Key Characteristics:** +- **Experimental feature** - must be enabled via `allow_experimental_export_merge_tree_part` setting +- **Asynchronous** - executes in the background, returns immediately +- **Ephemeral** - no automatic retry mechanism; manual retry required on failure +- **Idempotent** - safe to re-export the same part (skips by default if file exists) +- **Preserves sort order** from the source table + +## Syntax + +```sql +ALTER TABLE [database.]table_name +EXPORT PART 'part_name' +TO TABLE [destination_database.]destination_table +SETTINGS allow_experimental_export_merge_tree_part = 1 + [, setting_name = value, ...] +``` + +### Parameters + +- **`table_name`**: The source MergeTree table containing the part to export +- **`part_name`**: The exact name of the data part to export (e.g., `'2020_1_1_0'`, `'all_1_1_0'`) +- **`destination_table`**: The target table for the export (typically an S3, Azure, or other object storage table) + +## Requirements + +Source and destination tables must be 100% compatible: + +1. **Identical schemas** - same columns, types, and order +2. **Matching partition keys** - partition expressions must be identical + +## Settings + +### `allow_experimental_export_merge_tree_part` (Required) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: Must be set to `true` to enable the experimental feature. + +### `export_merge_tree_part_overwrite_file_if_exists` (Optional) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: If set to `true`, it will overwrite the file. Otherwise, fails with exception. + +## Examples + +### Basic Export to S3 + +```sql +-- Create source and destination tables +CREATE TABLE mt_table (id UInt64, year UInt16) +ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); + +CREATE TABLE s3_table (id UInt64, year UInt16) +ENGINE = S3(s3_conn, filename='data', format=Parquet, partition_strategy='hive') +PARTITION BY year; + +-- Insert and export +INSERT INTO mt_table VALUES (1, 2020), (2, 2020), (3, 2021); + +ALTER TABLE mt_table EXPORT PART '2020_1_1_0' TO TABLE s3_table +SETTINGS allow_experimental_export_merge_tree_part = 1; + +ALTER TABLE mt_table EXPORT PART '2021_2_2_0' TO TABLE s3_table +SETTINGS allow_experimental_export_merge_tree_part = 1; +``` + +## Monitoring + +### Active Exports + +Active exports can be found in the `system.exports` table. As of now, it only shows currently executing exports. 
It will not show pending or finished exports. + +```sql +arthur :) select * from system.exports; + +SELECT * +FROM system.exports + +Query id: 2026718c-d249-4208-891b-a271f1f93407 + +Row 1: +────── +source_database: default +source_table: source_mt_table +destination_database: default +destination_table: destination_table +create_time: 2025-11-19 09:09:11 +part_name: 20251016-365_1_1_0 +destination_file_path: table_root/eventDate=2025-10-16/retention=365/20251016-365_1_1_0_17B2F6CD5D3C18E787C07AE3DAF16EB1.parquet +elapsed: 2.04845441 +rows_read: 1138688 -- 1.14 million +total_rows_to_read: 550961374 -- 550.96 million +total_size_bytes_compressed: 37619147120 -- 37.62 billion +total_size_bytes_uncompressed: 138166213721 -- 138.17 billion +bytes_read_uncompressed: 316892925 -- 316.89 million +memory_usage: 596006095 -- 596.01 million +peak_memory_usage: 601239033 -- 601.24 million +``` + +### Export History + +You can query succeeded or failed exports in `system.part_log`. For now, it only keeps track of completion events (either success or fails). + +```sql +arthur :) select * from system.part_log where event_type='ExportPart' and table = 'replicated_source' order by event_time desc limit 1; + +SELECT * +FROM system.part_log +WHERE (event_type = 'ExportPart') AND (`table` = 'replicated_source') +ORDER BY event_time DESC +LIMIT 1 + +Query id: ae1c1cd3-c20e-4f20-8b82-ed1f6af0237f + +Row 1: +────── +hostname: arthur +query_id: +event_type: ExportPart +merge_reason: NotAMerge +merge_algorithm: Undecided +event_date: 2025-11-19 +event_time: 2025-11-19 09:08:31 +event_time_microseconds: 2025-11-19 09:08:31.974701 +duration_ms: 4 +database: default +table: replicated_source +table_uuid: 78471c67-24f4-4398-9df5-ad0a6c3daf41 +part_name: 2021_0_0_0 +partition_id: 2021 +partition: 2021 +part_type: Compact +disk_name: default +path_on_disk: year=2021/2021_0_0_0_78C704B133D41CB0EF64DD2A9ED3B6BA.parquet +rows: 1 +size_in_bytes: 272 +merged_from: ['2021_0_0_0'] +bytes_uncompressed: 86 +read_rows: 1 +read_bytes: 6 +peak_memory_usage: 22 +error: 0 +exception: +ProfileEvents: {} +``` + +### Profile Events + +- `PartsExports` - Successful exports +- `PartsExportFailures` - Failed exports +- `PartsExportDuplicated` - Number of part exports that failed because target already exists. +- `PartsExportTotalMilliseconds` - Total time + diff --git a/docs/en/engines/table-engines/mergetree-family/partition_export.md b/docs/en/engines/table-engines/mergetree-family/partition_export.md new file mode 100644 index 000000000000..1b91cf9bdeb9 --- /dev/null +++ b/docs/en/engines/table-engines/mergetree-family/partition_export.md @@ -0,0 +1,170 @@ +# ALTER TABLE EXPORT PARTITION + +## Overview + +The `ALTER TABLE EXPORT PARTITION` command exports entire partitions from Replicated*MergeTree tables to object storage (S3, Azure Blob Storage, etc.), typically in Parquet format. This feature coordinates export part operations across all replicas using ZooKeeper. + +Each MergeTree part will become a separate file with the following name convention: `//_.`. To ensure atomicity, a commit file containing the relative paths of all exported parts is also shipped. A data file should only be considered part of the dataset if a commit file references it. The commit file will be named using the following convention: `/commit__`. + +The set of parts that are exported is based on the list of parts the replica that received the export command sees. The other replicas will assist in the export process if they have those parts locally. 
Otherwise they will ignore it. + +The partition export tasks can be observed through `system.replicated_partition_exports`. Querying this table results in a query to ZooKeeper, so it must be used with care. Individual part export progress can be observed as usual through `system.exports`. + +The same partition cannot be exported to the same destination more than once. There are two ways to override this behavior: either by enabling the `export_merge_tree_partition_force_export` setting or by waiting for the task to expire. + +The export task can be killed by issuing the `KILL EXPORT PARTITION` command described below. + +The task is persistent: it should be resumed after crashes, failures, and other interruptions. + +## Syntax + +```sql +ALTER TABLE [database.]table_name +EXPORT PARTITION ID 'partition_id' +TO TABLE [destination_database.]destination_table +[SETTINGS setting_name = value, ...] +``` + +### Parameters + +- **`table_name`**: The source Replicated*MergeTree table containing the partition to export +- **`partition_id`**: The partition identifier to export (e.g., `'2020'`, `'2021'`) +- **`destination_table`**: The target table for the export (typically an S3, Azure, or other object storage table) + +## Settings + +### Server Settings + +#### `enable_experimental_export_merge_tree_partition_feature` (Required) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: Enables the replicated MergeTree partition export feature. It is experimental and not yet ready for production use. + +### Query Settings + +#### `export_merge_tree_partition_force_export` (Optional) + +- **Type**: `Bool` +- **Default**: `false` +- **Description**: Ignore an existing partition export and overwrite the ZooKeeper entry. Allows re-exporting a partition to the same destination before the manifest expires. + +#### `export_merge_tree_partition_max_retries` (Optional) + +- **Type**: `UInt64` +- **Default**: `3` +- **Description**: Maximum number of retries for exporting a merge tree part in an export partition task. If the limit is exceeded, the entire task fails. + +#### `export_merge_tree_partition_manifest_ttl` (Optional) + +- **Type**: `UInt64` +- **Default**: `180` (seconds) +- **Description**: Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. This setting does not affect or delete in-progress tasks; it only cleans up completed ones. + +#### `export_merge_tree_part_file_already_exists_policy` (Optional) + +- **Type**: `MergeTreePartExportFileAlreadyExistsPolicy` +- **Default**: `skip` +- **Description**: Policy for handling files that already exist during export.
Possible values: + - `skip` - Skip the file if it already exists + - `error` - Throw an error if the file already exists + - `overwrite` - Overwrite the file + +## Examples + +### Basic Export to S3 + +```sql +CREATE TABLE rmt_table (id UInt64, year UInt16) +ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/rmt_table', 'replica1') +PARTITION BY year ORDER BY tuple(); + +CREATE TABLE s3_table (id UInt64, year UInt16) +ENGINE = S3(s3_conn, filename='data', format=Parquet, partition_strategy='hive') +PARTITION BY year; + +INSERT INTO rmt_table VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021); + +ALTER TABLE rmt_table EXPORT PARTITION ID '2020' TO TABLE s3_table; + +## Killing Exports + +You can cancel in-progress partition exports using the `KILL EXPORT PARTITION` command: + +```sql +KILL EXPORT PARTITION +WHERE partition_id = '2020' + AND source_table = 'rmt_table' + AND destination_table = 's3_table' +``` + +The `WHERE` clause filters exports from the `system.replicated_partition_exports` table. You can use any columns from that table in the filter. + +## Monitoring + +### Active and Completed Exports + +Monitor partition exports using the `system.replicated_partition_exports` table: + +```sql +arthur :) select * from system.replicated_partition_exports Format Vertical; + +SELECT * +FROM system.replicated_partition_exports +FORMAT Vertical + +Query id: 9efc271a-a501-44d1-834f-bc4d20156164 + +Row 1: +────── +source_database: default +source_table: replicated_source +destination_database: default +destination_table: replicated_destination +create_time: 2025-11-21 18:21:51 +partition_id: 2022 +transaction_id: 7397746091717128192 +source_replica: r1 +parts: ['2022_0_0_0','2022_1_1_0','2022_2_2_0'] +parts_count: 3 +parts_to_do: 0 +status: COMPLETED +exception_replica: +last_exception: +exception_part: +exception_count: 0 + +Row 2: +────── +source_database: default +source_table: replicated_source +destination_database: default +destination_table: replicated_destination +create_time: 2025-11-21 18:20:35 +partition_id: 2021 +transaction_id: 7397745772618674176 +source_replica: r1 +parts: ['2021_0_0_0'] +parts_count: 1 +parts_to_do: 0 +status: COMPLETED +exception_replica: +last_exception: +exception_part: +exception_count: 0 + +2 rows in set. Elapsed: 0.019 sec. + +arthur :) +``` + +Status values include: +- `PENDING` - Export is queued / in progress +- `COMPLETED` - Export finished successfully +- `FAILED` - Export failed +- `KILLED` - Export was cancelled + +## Related Features + +- [ALTER TABLE EXPORT PART](/docs/en/engines/table-engines/mergetree-family/part_export.md) - Export individual parts (non-replicated) + diff --git a/docs/en/engines/table-engines/special/hybrid.md b/docs/en/engines/table-engines/special/hybrid.md new file mode 100644 index 000000000000..12df6cd859b8 --- /dev/null +++ b/docs/en/engines/table-engines/special/hybrid.md @@ -0,0 +1,120 @@ +--- +description: 'Hybrid unions multiple data sources behind per-segment predicates so queries behave like a single table while data is migrated or tiered.' +slug: /engines/table-engines/special/hybrid +title: 'Hybrid Table Engine' +sidebar_label: 'Hybrid' +sidebar_position: 11 +--- + +# Hybrid table engine + +`Hybrid` builds on top of the [Distributed](./distributed.md) table engine. It lets you expose several data sources as one logical table and assign every source its own predicate. +The engine rewrites incoming queries so that each segment receives the original query plus its predicate. 
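Conceptually, for a two-segment table the per-segment rewrite looks roughly like the sketch below (illustrative only; the table and segment names are hypothetical, and the engine performs this rewrite internally rather than emitting user-visible SQL):

```sql
-- Query issued against the Hybrid table
SELECT count() FROM events WHERE user_id = 42;

-- What each segment effectively receives: the original query with that
-- segment's predicate appended via an additional AND
SELECT count() FROM events_new WHERE user_id = 42 AND date >= '2025-09-01';
SELECT count() FROM events_old WHERE user_id = 42 AND date <  '2025-09-01';
```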
This keeps all of the Distributed optimisations (remote aggregation, `skip_unused_shards`, +global JOIN pushdown, and so on) while you duplicate or migrate data across clusters, storage types, or formats. + +It keeps the same execution pipeline as `engine=Distributed` but can read from multiple underlying sources simultaneously—similar to `engine=Merge`—while still pushing logic down to each source. + +Typical use cases include: + +- Zero-downtime migrations where "old" and "new" replicas temporarily overlap. +- Tiered storage, for example fresh data on a local cluster and historical data in S3. +- Gradual roll-outs where only a subset of rows should be served from a new backend. + +By giving mutually exclusive predicates to the segments (for example, `date < watermark` and `date >= watermark`), you ensure that each row is read from exactly one source. + +## Enable the engine + +The Hybrid engine is experimental. Enable it per session (or in the user profile) before creating tables: + +```sql +SET allow_experimental_hybrid_table = 1; +``` + +### Automatic Type Alignment + +Hybrid segments can evolve independently, so the same logical column may use different physical types. With the experimental `hybrid_table_auto_cast_columns = 1` **(enabled by default and requires `allow_experimental_analyzer = 1`)**, the engine inserts the necessary `CAST` operations into each rewritten query so every shard receives the schema defined by the Hybrid table. You can opt out by setting the flag to `0` if it causes issues. + +Segment schemas are cached when you create or attach a Hybrid table. If you alter a segment later (for example change a column type), refresh the Hybrid table (detach/attach or recreate it) so the cached headers stay in sync with the new schema; otherwise the auto-cast feature may miss the change and queries can still fail with header/type errors. + +## Engine definition + +```sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name +( + column1 type1, + column2 type2, + ... +) +ENGINE = Hybrid(table_function_1, predicate_1 [, table_function_2, predicate_2 ...]) +``` + +You must pass at least two arguments – the first table function and its predicate. Additional sources are appended as `table_function, predicate` pairs. The first table function is also used for `INSERT` statements. + +### Arguments and behaviour + +- `table_function_n` must be a valid table function (for example `remote`, `remoteSecure`, `cluster`, `clusterAllReplicas`, `s3Cluster`) or a fully qualified table name (`database.table`). The first argument must be a table function—such as `remote` or `cluster`—because it instantiates the underlying `Distributed` storage. +- `predicate_n` must be an expression that can be evaluated on the table columns. The engine adds it to the segment's query with an additional `AND`, so expressions like `event_date >= '2025-09-01'` or `id BETWEEN 10 AND 15` are typical. +- The query planner picks the same processing stage for every segment as it does for the base `Distributed` plan, so remote aggregation, ORDER BY pushdown, `skip_unused_shards`, and the legacy/analyzer execution modes behave the same way. +- `INSERT` statements are forwarded to the first table function only. If you need multi-destination writes, use explicit `INSERT` statements into the respective sources. +- Align schemas across the segments. ClickHouse builds a common header and rejects creation if any segment misses a column defined in the Hybrid schema. 
If the physical types differ you may need to add casts on one side or in the query, just as you would when reading from heterogeneous replicas. + +## Example: local cluster plus S3 historical tier + +The following commands illustrate a two-segment layout. Hot data stays on a local ClickHouse cluster, while historical rows come from public S3 Parquet files. + +```sql +-- Local MergeTree table that keeps current data +CREATE OR REPLACE TABLE btc_blocks_local +( + `hash` FixedString(64), + `version` Int64, + `mediantime` DateTime64(9), + `nonce` Int64, + `bits` FixedString(8), + `difficulty` Float64, + `chainwork` FixedString(64), + `size` Int64, + `weight` Int64, + `coinbase_param` String, + `number` Int64, + `transaction_count` Int64, + `merkle_root` FixedString(64), + `stripped_size` Int64, + `timestamp` DateTime64(9), + `date` Date +) +ENGINE = MergeTree +ORDER BY (timestamp) +PARTITION BY toYYYYMM(date); + +-- Hybrid table that unions the local shard with historical data in S3 +CREATE OR REPLACE TABLE btc_blocks ENGINE = Hybrid( + remote('localhost:9000', currentDatabase(), 'btc_blocks_local'), date >= '2025-09-01', + s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN), date < '2025-09-01' +) AS btc_blocks_local; + +-- Writes target the first (remote) segment +INSERT INTO btc_blocks +SELECT * +FROM s3('s3://aws-public-blockchain/v1.0/btc/blocks/**.parquet', NOSIGN) +WHERE date BETWEEN '2025-09-01' AND '2025-09-30'; + +-- Reads seamlessly combine both predicates +SELECT * FROM btc_blocks WHERE date = '2025-08-01'; -- data from s3 +SELECT * FROM btc_blocks WHERE date = '2025-09-05'; -- data from MergeTree (TODO: still analyzes s3) +SELECT * FROM btc_blocks WHERE date IN ('2025-08-31','2025-09-01') -- data from both sources, single copy always + + +-- Run analytic queries as usual +SELECT + date, + count(), + uniqExact(CAST(hash, 'Nullable(String)')) AS hashes, + sum(CAST(number, 'Nullable(Int64)')) AS blocks_seen +FROM btc_blocks +WHERE date BETWEEN '2025-08-01' AND '2025-09-30' +GROUP BY date +ORDER BY date; +``` + +Because the predicates are applied inside every segment, queries such as `ORDER BY`, `GROUP BY`, `LIMIT`, `JOIN`, and `EXPLAIN` behave as if you were reading from a single `Distributed` table. When sources expose different physical types (for example `FixedString(64)` versus `String` in Parquet), add explicit casts during ingestion or in the query, as shown above. diff --git a/docs/en/operations/system-tables/exports.md b/docs/en/operations/system-tables/exports.md new file mode 100644 index 000000000000..e26514364008 --- /dev/null +++ b/docs/en/operations/system-tables/exports.md @@ -0,0 +1,56 @@ +--- +description: 'System table containing information about in progress merge tree part exports' +keywords: ['system table', 'exports', 'merge tree', 'part'] +slug: /operations/system-tables/exports +title: 'system.exports' +--- + +Contains information about in progress merge tree part exports + +Columns: + +- `source_database` ([String](/docs/en/sql-reference/data-types/string.md)) — Name of the source database. +- `source_table` ([String](/docs/en/sql-reference/data-types/string.md)) — Name of the source table. +- `destination_database` ([String](/docs/en/sql-reference/data-types/string.md)) — Name of the destination database. +- `destination_table` ([String](/docs/en/sql-reference/data-types/string.md)) — Name of the destination table. 
+- `create_time` ([DateTime](/docs/en/sql-reference/data-types/datetime.md)) — Date and time when the export command was received in the server. +- `part_name` ([String](/docs/en/sql-reference/data-types/string.md)) — Name of the part. +- `destination_file_path` ([String](/docs/en/sql-reference/data-types/string.md)) — File path relative to where the part is being exported to. +- `elapsed` ([Float64](/docs/en/sql-reference/data-types/float.md)) — The time elapsed (in seconds) since the export started. +- `rows_read` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — The number of rows read from the exported part. +- `total_rows_to_read` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — The total number of rows to read from the exported part. +- `total_size_bytes_compressed` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — The total size of the compressed data in the exported part. +- `total_size_bytes_uncompressed` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — The total size of the uncompressed data in the exported part. +- `bytes_read_uncompressed` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — The number of uncompressed bytes read from the exported part. +- `memory_usage` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — Current memory usage in bytes for the export operation. +- `peak_memory_usage` ([UInt64](/docs/en/sql-reference/data-types/int-uint.md)) — Peak memory usage in bytes during the export operation. + +**Example** + +```sql +arthur :) select * from system.exports; + +SELECT * +FROM system.exports + +Query id: 2026718c-d249-4208-891b-a271f1f93407 + +Row 1: +────── +source_database: default +source_table: source_mt_table +destination_database: default +destination_table: destination_table +create_time: 2025-11-19 09:09:11 +part_name: 20251016-365_1_1_0 +destination_file_path: table_root/eventDate=2025-10-16/retention=365/20251016-365_1_1_0_17B2F6CD5D3C18E787C07AE3DAF16EB1.parquet +elapsed: 2.04845441 +rows_read: 1138688 -- 1.14 million +total_rows_to_read: 550961374 -- 550.96 million +total_size_bytes_compressed: 37619147120 -- 37.62 billion +total_size_bytes_uncompressed: 138166213721 -- 138.17 billion +bytes_read_uncompressed: 316892925 -- 316.89 million +memory_usage: 596006095 -- 596.01 million +peak_memory_usage: 601239033 -- 601.24 million +``` + diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 3168254596c5..4524a84a98d2 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -206,6 +206,12 @@ SYSTEM RELOAD USERS [ON CLUSTER cluster_name] Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`) +## PRESHUTDOWN {#preshutdown} + + + +Prepare node for graceful shutdown. Unregister in autodiscovered clusters, stop accepting distributed requests to object storages (s3Cluster, icebergCluster, etc.). 
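A minimal usage sketch for the section above (assuming the statement name matches the section heading, as with the other `SYSTEM` commands on this page):

```sql
-- Take this node out of rotation before stopping the service
SYSTEM PRESHUTDOWN;
```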
+ ## KILL {#kill} Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index d279e7560a99..161ac35134f8 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -83,6 +83,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,10 @@ # include #endif +#if USE_PARQUET +# include +#endif + #include /// A minimal file used when the server is run without installation @@ -338,6 +343,10 @@ namespace ServerSetting extern const ServerSettingsBool abort_on_logical_error; extern const ServerSettingsUInt64 jemalloc_flush_profile_interval_bytes; extern const ServerSettingsBool jemalloc_flush_profile_on_memory_exceeded; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_ttl; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_size; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_max_entries; + extern const ServerSettingsUInt64 input_format_parquet_metadata_cache_max_size; } namespace ErrorCodes @@ -348,6 +357,9 @@ namespace ErrorCodes namespace FileCacheSetting { extern const FileCacheSettingsBool load_metadata_asynchronously; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_size; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_max_entries; + extern const ServerSettingsUInt64 object_storage_list_objects_cache_ttl; } } @@ -2329,6 +2341,8 @@ try } + global_context->startSwarmMode(); + { std::lock_guard lock(servers_lock); /// We should start interserver communications before (and more important shutdown after) tables. @@ -2520,8 +2534,16 @@ try if (dns_cache_updater) dns_cache_updater->start(); + ObjectStorageListObjectsCache::instance().setMaxSizeInBytes(server_settings[ServerSetting::object_storage_list_objects_cache_size]); + ObjectStorageListObjectsCache::instance().setMaxCount(server_settings[ServerSetting::object_storage_list_objects_cache_max_entries]); + ObjectStorageListObjectsCache::instance().setTTL(server_settings[ServerSetting::object_storage_list_objects_cache_ttl]); + auto replicas_reconnector = ReplicasReconnector::init(global_context); +#if USE_PARQUET + ParquetFileMetaDataCache::instance()->setMaxSizeInBytes(server_settings[ServerSetting::input_format_parquet_metadata_cache_max_size]); +#endif + /// Set current database name before loading tables and databases because /// system logs may copy global context. 
std::string default_database = server_settings[ServerSetting::default_database]; @@ -2777,6 +2799,8 @@ try is_cancelled = true; + global_context->stopSwarmMode(); + LOG_DEBUG(log, "Waiting for current connections to close."); size_t current_connections = 0; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 13a9911c702e..3ebf43b0e6ae 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -210,6 +210,8 @@ enum class AccessType : uint8_t enabled implicitly by the grant ALTER_TABLE */\ M(ALTER_SETTINGS, "ALTER SETTING, ALTER MODIFY SETTING, MODIFY SETTING, RESET SETTING", TABLE, ALTER_TABLE) /* allows to execute ALTER MODIFY SETTING */\ M(ALTER_MOVE_PARTITION, "ALTER MOVE PART, MOVE PARTITION, MOVE PART", TABLE, ALTER_TABLE) \ + M(ALTER_EXPORT_PART, "ALTER EXPORT PART, EXPORT PART", TABLE, ALTER_TABLE) \ + M(ALTER_EXPORT_PARTITION, "ALTER EXPORT PARTITION, EXPORT PARTITION", TABLE, ALTER_TABLE) \ M(ALTER_FETCH_PARTITION, "ALTER FETCH PART, FETCH PARTITION", TABLE, ALTER_TABLE) \ M(ALTER_FREEZE_PARTITION, "FREEZE PARTITION, UNFREEZE", TABLE, ALTER_TABLE) \ M(ALTER_UNLOCK_SNAPSHOT, "UNLOCK SNAPSHOT", TABLE, ALTER_TABLE) \ @@ -320,6 +322,8 @@ enum class AccessType : uint8_t M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE, "SYSTEM DROP OBJECT STORAGE LIST OBJECTS CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ + M(SYSTEM_DROP_PARQUET_METADATA_CACHE, "SYSTEM DROP PARQUET METADATA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_CACHE, "DROP CACHE", GROUP, SYSTEM) \ M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \ M(SYSTEM_RELOAD_USERS, "RELOAD USERS", GLOBAL, SYSTEM_RELOAD) \ @@ -334,6 +338,7 @@ enum class AccessType : uint8_t M(SYSTEM_TTL_MERGES, "SYSTEM STOP TTL MERGES, SYSTEM START TTL MERGES, STOP TTL MERGES, START TTL MERGES", TABLE, SYSTEM) \ M(SYSTEM_FETCHES, "SYSTEM STOP FETCHES, SYSTEM START FETCHES, STOP FETCHES, START FETCHES", TABLE, SYSTEM) \ M(SYSTEM_MOVES, "SYSTEM STOP MOVES, SYSTEM START MOVES, STOP MOVES, START MOVES", TABLE, SYSTEM) \ + M(SYSTEM_SWARM, "SYSTEM STOP SWARM MODE, SYSTEM START SWARM MODE, STOP SWARM MODE, START SWARM MODE", GLOBAL, SYSTEM) \ M(SYSTEM_PULLING_REPLICATION_LOG, "SYSTEM STOP PULLING REPLICATION LOG, SYSTEM START PULLING REPLICATION LOG", TABLE, SYSTEM) \ M(SYSTEM_CLEANUP, "SYSTEM STOP CLEANUP, SYSTEM START CLEANUP", TABLE, SYSTEM) \ M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, CANCEL VIEW", VIEW, SYSTEM) \ diff --git a/src/Analyzer/FunctionSecretArgumentsFinderTreeNode.h b/src/Analyzer/FunctionSecretArgumentsFinderTreeNode.h index 8bcb6e147420..e4f63192c95b 100644 --- a/src/Analyzer/FunctionSecretArgumentsFinderTreeNode.h +++ b/src/Analyzer/FunctionSecretArgumentsFinderTreeNode.h @@ -71,8 +71,14 @@ class FunctionTreeNodeImpl : public AbstractFunction { public: explicit ArgumentsTreeNode(const QueryTreeNodes * arguments_) : arguments(arguments_) {} - size_t size() const override { return arguments ? 
arguments->size() : 0; } - std::unique_ptr at(size_t n) const override { return std::make_unique(arguments->at(n).get()); } + size_t size() const override + { /// size withous skipped indexes + return arguments ? arguments->size() - skippedSize() : 0; + } + std::unique_ptr at(size_t n) const override + { /// n is relative index, some can be skipped + return std::make_unique(arguments->at(getRealIndex(n)).get()); + } private: const QueryTreeNodes * arguments = nullptr; }; diff --git a/src/Analyzer/Passes/HybridCastsPass.cpp b/src/Analyzer/Passes/HybridCastsPass.cpp new file mode 100644 index 000000000000..f40e7664df50 --- /dev/null +++ b/src/Analyzer/Passes/HybridCastsPass.cpp @@ -0,0 +1,150 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace Setting +{ + extern const SettingsBool hybrid_table_auto_cast_columns; +} + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + +/// Collect Hybrid table expressions that require casts to normalize headers across segments. +/// +/// Hybrid is currently exposed only as an engine (TableNode). If it ever gets a table function +/// wrapper, this visitor must also look at TableFunctionNode and unwrap to the underlying +/// StorageDistributed so cached casts can be picked up there as well. +class HybridCastTablesCollector : public InDepthQueryTreeVisitor +{ +public: + explicit HybridCastTablesCollector(std::unordered_map & cast_map_) + : cast_map(cast_map_) + {} + + static bool needChildVisit(QueryTreeNodePtr &, QueryTreeNodePtr &) { return true; } + + void visitImpl(QueryTreeNodePtr & node) + { + const auto * table = node->as(); + if (!table) + return; + + const auto * storage = table->getStorage().get(); + if (const auto * distributed = typeid_cast(storage)) + { + ColumnsDescription to_cast = distributed->getColumnsToCast(); + if (!to_cast.empty()) + cast_map.emplace(node.get(), std::move(to_cast)); // repeated table_expression can overwrite + } + } + +private: + std::unordered_map & cast_map; +}; + +// Visitor replaces all usages of the column with CAST(column, type) in the query tree. +class HybridCastVisitor : public InDepthQueryTreeVisitor +{ +public: + HybridCastVisitor( + const std::unordered_map & cast_map_, + ContextPtr context_) + : cast_map(cast_map_) + , context(std::move(context_)) + {} + + bool shouldTraverseTopToBottom() const { return false; } + + static bool needChildVisit(QueryTreeNodePtr &, QueryTreeNodePtr & child) + { + /// Traverse all child nodes so casts also apply inside subqueries and UNION branches. 
+ (void)child; + return true; + } + + void visitImpl(QueryTreeNodePtr & node) + { + auto * column_node = node->as(); + if (!column_node) + return; + + auto column_source = column_node->getColumnSourceOrNull(); + if (!column_source) + return; + + auto it = cast_map.find(column_source.get()); + if (it == cast_map.end()) + return; + + const auto & column_name = column_node->getColumnName(); + auto expected_column_opt = it->second.tryGetPhysical(column_name); + if (!expected_column_opt) + return; + + auto column_clone = std::static_pointer_cast(column_node->clone()); + + auto cast_node = buildCastFunction(column_clone, expected_column_opt->type, context); + const auto & alias = node->getAlias(); + if (!alias.empty()) + cast_node->setAlias(alias); + else + cast_node->setAlias(expected_column_opt->name); + + node = cast_node; + } + +private: + const std::unordered_map & cast_map; + ContextPtr context; +}; + + +} // namespace + +void HybridCastsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) +{ + const auto & settings = context->getSettingsRef(); + if (!settings[Setting::hybrid_table_auto_cast_columns]) + return; + + auto * query = query_tree_node->as(); + if (!query) + return; + + std::unordered_map cast_map; + HybridCastTablesCollector collector(cast_map); + collector.visit(query_tree_node); + if (cast_map.empty()) + return; + + HybridCastVisitor visitor(cast_map, context); + visitor.visit(query_tree_node); +} + +} diff --git a/src/Analyzer/Passes/HybridCastsPass.h b/src/Analyzer/Passes/HybridCastsPass.h new file mode 100644 index 000000000000..6b3159d6e925 --- /dev/null +++ b/src/Analyzer/Passes/HybridCastsPass.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Adds CASTs for Hybrid segments when physical types differ from the Hybrid schema +/// +/// It normalizes headers coming from different segments when table structure in some segments +/// differs from the Hybrid table definition. For example column X is UInt32 in the Hybrid table, +/// but Int64 in an additional segment. +/// +/// Without these casts ConvertingActions may fail to reconcile mismatched headers when casts are impossible +/// (e.g. AggregateFunction states carry hashed data tied to their argument type and cannot be recast), for example: +/// "Conversion from AggregateFunction(uniq, Decimal(38, 0)) to AggregateFunction(uniq, UInt64) is not supported" +/// (CANNOT_CONVERT_TYPE). +/// +/// Per-segment casts are not reliable because WithMergeState strips aliases, so merged pipelines +/// from different segments would return different headers (with or without CAST), leading to errors +/// like "Cannot find column `max(value)` in source stream, there are only columns: [max(_CAST(value, 'UInt64'))]" +/// (THERE_IS_NO_COLUMN). 
+class HybridCastsPass : public IQueryTreePass +{ +public: + String getName() override { return "HybridCastsPass"; } + String getDescription() override { return "Inject casts for Hybrid columns to match schema types"; } + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; +}; + +} diff --git a/src/Analyzer/QueryTreePassManager.cpp b/src/Analyzer/QueryTreePassManager.cpp index a818ad348020..3f94444cdc53 100644 --- a/src/Analyzer/QueryTreePassManager.cpp +++ b/src/Analyzer/QueryTreePassManager.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #include namespace DB @@ -309,6 +310,8 @@ void addQueryTreePasses(QueryTreePassManager & manager, bool only_analyze) manager.addPass(std::make_unique()); manager.addPass(std::make_unique()); + + manager.addPass(std::make_unique()); } } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2dcf07466941..411942d62a60 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -130,6 +130,7 @@ add_headers_and_sources(dbms Storages/ObjectStorage/Azure) add_headers_and_sources(dbms Storages/ObjectStorage/S3) add_headers_and_sources(dbms Storages/ObjectStorage/HDFS) add_headers_and_sources(dbms Storages/ObjectStorage/Local) +add_headers_and_sources(dbms Storages/ObjectStorage/MergeTree) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes/Iceberg) add_headers_and_sources(dbms Storages/ObjectStorage/DataLakes/DeltaLake) diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index c4c5010df342..3b1a09631dd9 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -232,7 +232,7 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector & uuid void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFunctionReadTaskResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendClusterFunctionReadTaskResponse(response); } @@ -241,7 +241,7 @@ void MultiplexedConnections::sendClusterFunctionReadTaskResponse(const ClusterFu void MultiplexedConnections::sendMergeTreeReadTaskResponse(const ParallelReadResponse & response) { std::lock_guard lock(cancel_mutex); - if (cancelled) + if (cancelled || !current_connection || !current_connection->isConnected()) return; current_connection->sendMergeTreeReadTaskResponse(response); } @@ -527,9 +527,12 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead void MultiplexedConnections::invalidateReplica(ReplicaState & state) { + Connection * old_connection = state.connection; state.connection = nullptr; state.pool_entry = IConnectionPool::Entry(); --active_connection_count; + if (current_connection == old_connection) + current_connection = nullptr; } void MultiplexedConnections::setAsyncCallback(AsyncCallback async_callback) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 1b4071cbb5dc..da72fd5f4653 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -10,6 +10,7 @@ M(Merge, "Number of executing background merges") \ M(MergeParts, "Number of source parts participating in current background merges") \ M(Move, "Number of currently executing moves") \ + M(Export, "Number of currently executing exports") \ M(PartMutation, "Number of mutations (ALTER DELETE/UPDATE)") \ M(ReplicatedFetch, "Number of data parts 
being fetched from replica") \ M(ReplicatedSend, "Number of data parts being sent to replicas") \ @@ -432,6 +433,7 @@ M(StartupScriptsExecutionState, "State of startup scripts execution: 0 = not finished, 1 = success, 2 = failure.") \ \ M(IsServerShuttingDown, "Indicates if the server is shutting down: 0 = no, 1 = yes") \ + M(IsSwarmModeEnabled, "Indicates if the swarm mode enabled or not: 0 = disabled, 1 = enabled") \ \ M(StatelessWorkerThreads, "Number of threads in the stateless worker thread pool.") \ M(StatelessWorkerThreadsActive, "Number of threads in the stateless worker thread pool running a task.") \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 6e98ca6b7f60..144be0ee3eab 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -28,6 +28,10 @@ M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.", ValueType::Bytes) \ M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.", ValueType::Number) \ M(AsyncInsertCacheHits, "Number of times a duplicate hash id has been found in asynchronous INSERT hash id cache.", ValueType::Number) \ + M(PartsExports, "Number of successful part exports.", ValueType::Number) \ + M(PartsExportFailures, "Number of failed part exports.", ValueType::Number) \ + M(PartsExportDuplicated, "Number of part exports that failed because target already exists.", ValueType::Number) \ + M(PartsExportTotalMilliseconds, "Total time spent on part export operations.", ValueType::Milliseconds) \ M(FailedQuery, "Number of failed queries.", ValueType::Number) \ M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.", ValueType::Number) \ M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.", ValueType::Number) \ @@ -175,6 +179,8 @@ M(MergesThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_merges_bandwidth_for_server' throttling.", ValueType::Microseconds) \ M(MutationsThrottlerBytes, "Bytes passed through 'max_mutations_bandwidth_for_server' throttler.", ValueType::Bytes) \ M(MutationsThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_mutations_bandwidth_for_server' throttling.", ValueType::Microseconds) \ + M(ExportsThrottlerBytes, "Bytes passed through 'max_exports_bandwidth_for_server' throttler.", ValueType::Bytes) \ + M(ExportsThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_exports_bandwidth_for_server' throttling.", ValueType::Microseconds) \ M(QueryRemoteReadThrottlerBytes, "Bytes passed through 'max_remote_read_network_bandwidth' throttler.", ValueType::Bytes) \ M(QueryRemoteReadThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_remote_read_network_bandwidth' throttling.", ValueType::Microseconds) \ M(QueryRemoteWriteThrottlerBytes, "Bytes passed through 'max_remote_write_network_bandwidth' throttler.", ValueType::Bytes) \ @@ -297,6 +303,11 @@ M(IcebergTrivialCountOptimizationApplied, "Trivial count optimization applied while reading from Iceberg", ValueType::Number) \ M(IcebergVersionHintUsed, "Number of times version-hint.text has been used.", ValueType::Number) \ M(IcebergMinMaxIndexPrunedFiles, "Number of skipped files by using MinMax index in Iceberg", ValueType::Number) \ + M(IcebergAvroFileParsing, "Number of times avro metadata files have been parsed.", ValueType::Number) \ + M(IcebergAvroFileParsingMicroseconds, "Time spent for parsing avro metadata files for Iceberg tables.", ValueType::Microseconds) \ + 
M(IcebergJsonFileParsing, "Number of times json metadata files have been parsed.", ValueType::Number) \ + M(IcebergJsonFileParsingMicroseconds, "Time spent for parsing json metadata files for Iceberg tables.", ValueType::Microseconds) \ + \ M(JoinBuildTableRowCount, "Total number of rows in the build table for a JOIN operation.", ValueType::Number) \ M(JoinProbeTableRowCount, "Total number of rows in the probe table for a JOIN operation.", ValueType::Number) \ M(JoinResultRowCount, "Total number of rows in the result of a JOIN operation.", ValueType::Number) \ @@ -599,7 +610,9 @@ The server successfully detected this situation and will download merged part fr M(S3DeleteObjects, "Number of S3 API DeleteObject(s) calls.", ValueType::Number) \ M(S3CopyObject, "Number of S3 API CopyObject calls.", ValueType::Number) \ M(S3ListObjects, "Number of S3 API ListObjects calls.", ValueType::Number) \ + M(S3ListObjectsMicroseconds, "Time of S3 API ListObjects execution.", ValueType::Microseconds) \ M(S3HeadObject, "Number of S3 API HeadObject calls.", ValueType::Number) \ + M(S3HeadObjectMicroseconds, "Time of S3 API HeadObject execution.", ValueType::Microseconds) \ M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.", ValueType::Number) \ M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.", ValueType::Number) \ M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.", ValueType::Number) \ @@ -653,6 +666,7 @@ The server successfully detected this situation and will download merged part fr M(AzureCopyObject, "Number of Azure blob storage API CopyObject calls", ValueType::Number) \ M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.", ValueType::Number) \ M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.", ValueType::Number) \ + M(AzureListObjectsMicroseconds, "Time of Azure blob storage API ListObjects execution.", ValueType::Microseconds) \ M(AzureGetProperties, "Number of Azure blob storage API GetProperties calls.", ValueType::Number) \ M(AzureCreateContainer, "Number of Azure blob storage API CreateContainer calls.", ValueType::Number) \ \ @@ -1133,7 +1147,7 @@ The server successfully detected this situation and will download merged part fr M(MemoryWorkerRun, "Number of runs done by MemoryWorker in background", ValueType::Number) \ M(MemoryWorkerRunElapsedMicroseconds, "Total time spent by MemoryWorker for background work", ValueType::Microseconds) \ \ - M(ParquetFetchWaitTimeMicroseconds, "Time of waiting fetching parquet data", ValueType::Microseconds) \ + M(ParquetFetchWaitTimeMicroseconds, "Time of waiting for parquet file reads from decoding threads (not prefetching threads)", ValueType::Microseconds) \ M(ParquetReadRowGroups, "The total number of row groups read from parquet data", ValueType::Number) \ M(ParquetPrunedRowGroups, "The total number of row groups pruned from parquet data", ValueType::Number) \ M(ParquetDecodingTasks, "Tasks issued by parquet reader", ValueType::Number) \ @@ -1164,7 +1178,12 @@ The server successfully detected this situation and will download merged part fr M(AsyncLoggingErrorFileLogDroppedMessages, "How many messages have been dropped from error file log due to the async log queue being full", ValueType::Number) \ M(AsyncLoggingSyslogDroppedMessages, "How many messages have been dropped from the syslog due to the async log queue being full", ValueType::Number) \ M(AsyncLoggingTextLogDroppedMessages, "How many messages have been dropped from text_log due to the async 
log queue being full", ValueType::Number) \ - + M(ObjectStorageListObjectsCacheHits, "Number of times object storage list objects operation hit the cache.", ValueType::Number) \ + M(ObjectStorageListObjectsCacheMisses, "Number of times object storage list objects operation miss the cache.", ValueType::Number) \ + M(ObjectStorageListObjectsCacheExactMatchHits, "Number of times object storage list objects operation hit the cache with an exact match.", ValueType::Number) \ + M(ObjectStorageListObjectsCachePrefixMatchHits, "Number of times object storage list objects operation miss the cache using prefix matching.", ValueType::Number) \ + M(ParquetMetaDataCacheHits, "Number of times the read from filesystem cache hit the cache.", ValueType::Number) \ + M(ParquetMetaDataCacheMisses, "Number of times the read from filesystem cache miss the cache.", ValueType::Number) \ #ifdef APPLY_FOR_EXTERNAL_EVENTS #define APPLY_FOR_EVENTS(M) APPLY_FOR_BUILTIN_EVENTS(M) APPLY_FOR_EXTERNAL_EVENTS(M) diff --git a/src/Common/TTLCachePolicy.h b/src/Common/TTLCachePolicy.h index 79e34dc2ebd5..140dcc59f1b8 100644 --- a/src/Common/TTLCachePolicy.h +++ b/src/Common/TTLCachePolicy.h @@ -271,10 +271,10 @@ class TTLCachePolicy : public ICachePolicy; Cache cache; - +private: /// TODO To speed up removal of stale entries, we could also add another container sorted on expiry times which maps keys to iterators /// into the cache. To insert an entry, add it to the cache + add the iterator to the sorted container. To remove stale entries, do a /// binary search on the sorted container and erase all left of the found key. diff --git a/src/Common/futex.h b/src/Common/futex.h index f86cfacdc3d3..9ee0ae95d956 100644 --- a/src/Common/futex.h +++ b/src/Common/futex.h @@ -19,6 +19,16 @@ inline Int64 futexWait(void * address, UInt32 value) return syscall(SYS_futex, address, FUTEX_WAIT_PRIVATE, value, nullptr, nullptr, 0); } +inline Int64 futexTimedWait(void * address, UInt32 value, UInt64 nanos) +{ + const UInt64 nanos_per_sec = 1'000'000'000; + UInt64 sec = nanos / nanos_per_sec; + struct timespec timeout; + timeout.tv_sec = time_t(std::min(sec, UInt64(std::numeric_limits::max()))); + timeout.tv_nsec = int64_t(nanos % nanos_per_sec); + return syscall(SYS_futex, address, FUTEX_WAIT_PRIVATE, value, &timeout, nullptr, 0); +} + inline Int64 futexWake(void * address, int count) { return syscall(SYS_futex, address, FUTEX_WAKE_PRIVATE, count, nullptr, nullptr, 0); @@ -37,7 +47,7 @@ inline void futexWakeOne(std::atomic & address) inline void futexWakeAll(std::atomic & address) { - futexWake(&address, INT_MAX); + futexWake(&address, INT_MAX); } constexpr UInt32 lowerHalf(UInt64 value) diff --git a/src/Common/threadPoolCallbackRunner.cpp b/src/Common/threadPoolCallbackRunner.cpp index d33349cbaedb..81117d22ba3d 100644 --- a/src/Common/threadPoolCallbackRunner.cpp +++ b/src/Common/threadPoolCallbackRunner.cpp @@ -20,14 +20,6 @@ void ThreadPoolCallbackRunnerFast::initThreadPool(ThreadPool & pool_, size_t max max_threads = max_threads_; thread_name = thread_name_; thread_group = thread_group_; - - /// We could dynamically add and remove threads based on load, but it's not clear whether it's - /// worth the added complexity. 
- for (size_t i = 0; i < max_threads; ++i) - { - pool->scheduleOrThrowOnError([this] { threadFunction(); }); - ++threads; // only if scheduleOrThrowOnError didn't throw - } } ThreadPoolCallbackRunnerFast::ThreadPoolCallbackRunnerFast(Mode mode_) : mode(mode_) @@ -58,19 +50,30 @@ void ThreadPoolCallbackRunnerFast::shutdown() chassert(active_tasks.load() == queue.size()); } +void ThreadPoolCallbackRunnerFast::startMoreThreadsIfNeeded(size_t active_tasks_, std::unique_lock &) +{ + while (threads < max_threads && threads < active_tasks_ && !shutdown_requested) + { + pool->scheduleOrThrow([this] { threadFunction(); }); + ++threads; // only if scheduleOrThrow didn't throw + } +} + void ThreadPoolCallbackRunnerFast::operator()(std::function f) { if (mode == Mode::Disabled) throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread pool runner is not initialized"); + size_t active_tasks_ = 1 + active_tasks.fetch_add(1, std::memory_order_relaxed); + { std::unique_lock lock(mutex); queue.push_back(std::move(f)); + startMoreThreadsIfNeeded(active_tasks_, lock); } if (mode == Mode::ThreadPool) { - active_tasks.fetch_add(1, std::memory_order_relaxed); #ifdef OS_LINUX UInt32 prev_size = queue_size.fetch_add(1, std::memory_order_release); if (prev_size < max_threads) @@ -89,14 +92,16 @@ void ThreadPoolCallbackRunnerFast::bulkSchedule(std::vector switcher; + switcher.emplace(thread_group, thread_name.c_str()); + + while (true) { - ThreadGroupSwitcher switcher(thread_group, thread_name.c_str()); + bool timed_out = false; +#ifdef OS_LINUX + UInt32 x = queue_size.load(std::memory_order_relaxed); while (true) { - #ifdef OS_LINUX - UInt32 x = queue_size.load(std::memory_order_relaxed); - while (true) + if (x == 0) { - if (x == 0) + Int64 waited = futexTimedWait(&queue_size, 0, THREAD_IDLE_TIMEOUT_NS); + x = queue_size.load(std::memory_order_relaxed); + + if (waited < 0 && errno == ETIMEDOUT && x == 0) { - futexWait(&queue_size, 0); - x = queue_size.load(std::memory_order_relaxed); - } - else if (queue_size.compare_exchange_weak( - x, x - 1, std::memory_order_acquire, std::memory_order_relaxed)) + timed_out = true; break; + } } - #endif + else if (queue_size.compare_exchange_weak( + x, x - 1, std::memory_order_acquire, std::memory_order_relaxed)) + break; + } +#endif - std::function f; - { - std::unique_lock lock(mutex); + std::function f; + { + std::unique_lock lock(mutex); - #ifndef OS_LINUX - queue_cv.wait(lock, [&] { return shutdown_requested || !queue.empty(); }); - #endif +#ifdef OS_LINUX + /// Important to never stop the last thread if queue is not empty (checked under the + /// same `lock` as decrementing `threads`). Otherwise we'll deadlock like this: + /// 0. `threads` == 1, queue is empty. + /// 1. The worker thread times out; it didn't lock mutex or decrement `threads` yet. + /// 2. A manager thread enqueues a task. It sees active_tasks == 1 and `threads` == 1, + /// so it doesn't start another thread. + /// 3. The worker thread exits. + /// 4. There are no threads, but the queue is not empty, oops. + if (timed_out && !queue.empty() && !shutdown_requested) + /// We can't just proceed to `queue.pop_front()` here because we haven't + /// decremented queue_size. + continue; +#else + timed_out = !queue_cv.wait_for( + lock, std::chrono::nanoseconds(THREAD_IDLE_TIMEOUT_NS), + [&] { return shutdown_requested || !queue.empty(); }); +#endif - if (shutdown_requested) - break; + if (shutdown_requested || timed_out) + { + /// Important that we destroy the `ThreadGroupSwitcher` before decrementing `threads`. 
+ /// Otherwise ~ThreadGroupSwitcher may access global Context after the query is + /// finished, which may race with mutating Context (specifically, Settings) at the + /// start of next query. + switcher.reset(); - chassert(!queue.empty()); + threads -= 1; + if (threads == 0) + shutdown_cv.notify_all(); - f = std::move(queue.front()); - queue.pop_front(); + return; } - try - { - f(); + chassert(!queue.empty()); - CurrentThread::updatePerformanceCountersIfNeeded(); - } - catch (...) - { - tryLogCurrentException("FastThreadPool"); - chassert(false); - } + f = std::move(queue.front()); + queue.pop_front(); + } + + try + { + f(); - active_tasks.fetch_sub(1, std::memory_order_relaxed); + CurrentThread::updatePerformanceCountersIfNeeded(); + } + catch (...) + { + tryLogCurrentException("FastThreadPool"); + chassert(false); } - } - /// Important that we destroy the `ThreadGroupSwitcher` before decrementing `threads`. - /// Otherwise ~ThreadGroupSwitcher may access global Context after the query is finished, which - /// may race with mutating Context (specifically, Settings) at the start of next query. - { - std::unique_lock lock(mutex); - threads -= 1; - if (threads == 0) - shutdown_cv.notify_all(); + active_tasks.fetch_sub(1, std::memory_order_relaxed); } + + chassert(false); } bool ShutdownHelper::try_lock_shared() diff --git a/src/Common/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h index 33a9a63ba91e..443e07c0584f 100644 --- a/src/Common/threadPoolCallbackRunner.h +++ b/src/Common/threadPoolCallbackRunner.h @@ -249,8 +249,6 @@ class ThreadPoolCallbackRunnerFast Disabled, }; - /// TODO [parquet]: Add metrics for queue size and active threads, and maybe event for tasks executed. - ThreadPoolCallbackRunnerFast(); void initManual() @@ -282,6 +280,9 @@ class ThreadPoolCallbackRunnerFast bool isIdle() const { return active_tasks.load(std::memory_order_relaxed) == 0; } private: + /// Stop thread if it had nothing to do for this long. + static constexpr UInt64 THREAD_IDLE_TIMEOUT_NS = 3'000'000; // 3 ms + Mode mode = Mode::Disabled; ThreadPool * pool = nullptr; size_t max_threads = 0; @@ -309,6 +310,25 @@ class ThreadPoolCallbackRunnerFast std::condition_variable queue_cv; #endif + /// We dynamically start more threads when queue grows and stop idle threads after a timeout. + /// + /// Interestingly, this is required for correctness, not just performance. + /// If we kept max_threads threads at all times, we may deadlock because the "threads" that we + /// schedule on ThreadPool are not necessarily running, they may be sitting in ThreadPool's + /// queue, blocking other "threads" from running. E.g. this may happen: + /// 1. Iceberg reader creates many parquet readers, and their ThreadPoolCallbackRunnerFast(s) + /// occupy all slots in the shared ThreadPool (getFormatParsingThreadPool()). + /// 2. Iceberg reader creates some more parquet readers for positional deletes, using separate + /// ThreadPoolCallbackRunnerFast-s (because the ones from above are mildly inconvenient to + /// propagate to that code site). Those ThreadPoolCallbackRunnerFast-s make + /// pool->scheduleOrThrowOnError calls, but ThreadPool just adds them to queue, no actual + /// ThreadPoolCallbackRunnerFast::threadFunction()-s are started. + /// 3. The readers from step 2 are stuck because their ThreadPoolCallbackRunnerFast-s have no + /// threads. 
The readers from step 1 are idle but not destroyed (keep occupying threads) + /// because the iceberg reader is waiting for positional deletes to be read (by readers + /// from step 2). We're stuck. + void startMoreThreadsIfNeeded(size_t active_tasks_, std::unique_lock &); + void threadFunction(); }; diff --git a/src/Core/FormatFactorySettings.h b/src/Core/FormatFactorySettings.h index 41a2bbaa98a0..e3e69899cee2 100644 --- a/src/Core/FormatFactorySettings.h +++ b/src/Core/FormatFactorySettings.h @@ -168,7 +168,7 @@ Ignore case when matching ORC columns with CH columns. Ignore case when matching Parquet columns with CH columns. )", 0) \ DECLARE(Bool, input_format_parquet_preserve_order, false, R"( -Avoid reordering rows when reading from Parquet files. Usually makes it much slower. Not recommended as row ordering is generally not guaranteed, and other parts of query pipeline may break it. +Avoid reordering rows when reading from Parquet files. Not recommended as row ordering is generally not guaranteed, and other parts of query pipeline may break it. Use `ORDER BY _row_number` instead. )", 0) \ DECLARE(Bool, input_format_parquet_filter_push_down, true, R"( When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata. @@ -1458,8 +1458,7 @@ Use geo column parser to convert Array(UInt8) into Point/Linestring/Polygon/Mult DECLARE(Bool, output_format_parquet_geometadata, true, R"( Allow to write information about geo columns in parquet metadata and encode columns in WKB format. )", 0) \ - - + DECLARE(Bool, input_format_parquet_use_metadata_cache, true, R"(Enable parquet file metadata caching)", 0) \ // End of FORMAT_FACTORY_SETTINGS #define OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \ diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index 1db06e0f916d..9db589f4ca34 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -96,8 +96,10 @@ namespace Protocol MergeTreeReadTaskRequest = 16, /// Request from a MergeTree replica to a coordinator TimezoneUpdate = 17, /// Receive server's (session-wide) default timezone SSHChallenge = 18, /// Return challenge for SSH signature signing + MAX = SSHChallenge, + ConnectionLost = 255, /// Exception that occurred on the client side. 
}; /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10 diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h index 5ed5ab179b19..95258a2b320c 100644 --- a/src/Core/ProtocolDefines.h +++ b/src/Core/ProtocolDefines.h @@ -35,7 +35,8 @@ static constexpr auto DBMS_MIN_REVISION_WITH_AGGREGATE_FUNCTIONS_VERSIONING = 54 static constexpr auto DBMS_CLUSTER_INITIAL_PROCESSING_PROTOCOL_VERSION = 1; static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_METADATA = 2; -static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = 2; +static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA = 3; +static constexpr auto DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION = 3; static constexpr auto DBMS_MIN_SUPPORTED_PARALLEL_REPLICAS_PROTOCOL_VERSION = 3; static constexpr auto DBMS_PARALLEL_REPLICAS_MIN_VERSION_WITH_MARK_SEGMENT_SIZE_FIELD = 4; diff --git a/src/Core/Range.cpp b/src/Core/Range.cpp index 139fb8db76c9..6d037d7e9004 100644 --- a/src/Core/Range.cpp +++ b/src/Core/Range.cpp @@ -151,6 +151,13 @@ bool Range::isInfinite() const return left.isNegativeInfinity() && right.isPositiveInfinity(); } +/// [x, x] +bool Range::isPoint() const +{ + return fullBounded() && left_included && right_included && equals(left, right) + && !left.isNegativeInfinity() && !left.isPositiveInfinity(); +} + bool Range::intersectsRange(const Range & r) const { /// r to the left of me. diff --git a/src/Core/Range.h b/src/Core/Range.h index 6072795db0a9..921e1e6aa3f0 100644 --- a/src/Core/Range.h +++ b/src/Core/Range.h @@ -94,6 +94,8 @@ struct Range bool isBlank() const; + bool isPoint() const; + bool intersectsRange(const Range & r) const; bool containsRange(const Range & r) const; diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 004f8a16098c..28a660647541 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -112,6 +112,7 @@ namespace DB DECLARE(UInt64, max_unexpected_parts_loading_thread_pool_size, 8, R"(The number of threads to load inactive set of data parts (Unexpected ones) at startup.)", 0) \ DECLARE(UInt64, max_parts_cleaning_thread_pool_size, 128, R"(The number of threads for concurrent removal of inactive data parts.)", 0) \ DECLARE(UInt64, max_mutations_bandwidth_for_server, 0, R"(The maximum read speed of all mutations on server in bytes per second. Zero means unlimited.)", 0) \ + DECLARE(UInt64, max_exports_bandwidth_for_server, 0, R"(The maximum read speed of all exports on server in bytes per second. Zero means unlimited.)", 0) \ DECLARE(UInt64, max_merges_bandwidth_for_server, 0, R"(The maximum read speed of all merges on server in bytes per second. Zero means unlimited.)", 0) \ DECLARE(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, R"(The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.)", 0) \ DECLARE(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, R"(The maximum speed of data exchange over the network in bytes per second for replicated sends. 
Zero means unlimited.)", 0) \ @@ -1139,8 +1140,11 @@ The policy on how to perform a scheduling of CPU slots specified by `concurrent_ DECLARE(UInt64, threadpool_local_fs_reader_queue_size, 1000000, R"(The maximum number of jobs that can be scheduled on the thread pool for reading from local filesystem.)", 0) \ DECLARE(NonZeroUInt64, threadpool_remote_fs_reader_pool_size, 250, R"(Number of threads in the Thread pool used for reading from remote filesystem when `remote_filesystem_read_method = 'threadpool'`.)", 0) \ DECLARE(UInt64, threadpool_remote_fs_reader_queue_size, 1000000, R"(The maximum number of jobs that can be scheduled on the thread pool for reading from remote filesystem.)", 0) \ - - + DECLARE(UInt64, object_storage_list_objects_cache_size, 500000000, "Maximum size of ObjectStorage list objects cache in bytes. Zero means disabled.", 0) \ + DECLARE(UInt64, object_storage_list_objects_cache_max_entries, 1000, "Maximum number of entries in ObjectStorage list objects cache. Zero means disabled.", 0) \ + DECLARE(UInt64, object_storage_list_objects_cache_ttl, 3600, "Time to live of records in ObjectStorage list objects cache in seconds. Zero means unlimited.", 0) \ + DECLARE(UInt64, input_format_parquet_metadata_cache_max_size, 500000000, "Maximum size of the Parquet file metadata cache.", 0) \ + DECLARE(Bool, enable_experimental_export_merge_tree_partition_feature, false, "Enable the export replicated merge tree partition feature. It is experimental and not yet ready for production use.", 0) \ // clang-format on /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in dumpToSystemServerSettingsColumns below diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 903d984dee49..2042e998ec31 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -1760,6 +1760,22 @@ Possible values: - `global` — Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` - `allow` — Allows the use of these types of subqueries. )", IMPORTANT) \ + DECLARE(ObjectStorageClusterJoinMode, object_storage_cluster_join_mode, ObjectStorageClusterJoinMode::ALLOW, R"( +Changes the behaviour of object storage cluster functions and tables. + +ClickHouse applies this setting when the query contains the product of object storage cluster functions or tables, i.e. when a query for an object storage cluster function or table contains a non-GLOBAL subquery for an object storage cluster function or table. + +Restrictions: + +- Applied only to JOIN subqueries. +- Applied only if the FROM section uses an object storage cluster function or table. + +Possible values: + +- `local` — Replaces the database and table in the subquery with local ones for the destination server (shard), leaving the normal `IN`/`JOIN.` +- `global` — Unsupported for now. Replaces the `IN`/`JOIN` query with `GLOBAL IN`/`GLOBAL JOIN.` +- `allow` — Default value. Allows the use of these types of subqueries. +)", 0) \ \ DECLARE(UInt64, max_concurrent_queries_for_all_users, 0, R"( Throw exception if the value of this setting is less or equal than the current number of simultaneously processed queries.
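A minimal usage sketch for the new `object_storage_cluster_join_mode` setting follows; it is not part of the patch, and the cluster name, bucket URL, and local `users` table are illustrative placeholders:

```sql
-- Hedged example: with 'local', the joined table in the subquery is resolved
-- as the local `users` table on each destination shard of the cluster.
SELECT l.user_id, count() AS hits
FROM s3Cluster('my_cluster', 'https://my-bucket.s3.amazonaws.com/logs/*.parquet', 'Parquet') AS l
INNER JOIN users AS u ON u.id = l.user_id
GROUP BY l.user_id
SETTINGS object_storage_cluster_join_mode = 'local';
```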
@@ -6629,13 +6645,13 @@ The timeout in milliseconds for connecting to a remote replica during query exec DECLARE(Bool, parallel_replicas_for_cluster_engines, true, R"( Replace table function engines with their -Cluster alternatives )", 0) \ - DECLARE_WITH_ALIAS(Bool, allow_experimental_database_iceberg, false, R"( + DECLARE_WITH_ALIAS(Bool, allow_experimental_database_iceberg, true, R"( Allow experimental database engine DataLakeCatalog with catalog_type = 'iceberg' )", BETA, allow_database_iceberg) \ - DECLARE_WITH_ALIAS(Bool, allow_experimental_database_unity_catalog, false, R"( + DECLARE_WITH_ALIAS(Bool, allow_experimental_database_unity_catalog, true, R"( Allow experimental database engine DataLakeCatalog with catalog_type = 'unity' )", BETA, allow_database_unity_catalog) \ - DECLARE_WITH_ALIAS(Bool, allow_experimental_database_glue_catalog, false, R"( + DECLARE_WITH_ALIAS(Bool, allow_experimental_database_glue_catalog, true, R"( Allow experimental database engine DataLakeCatalog with catalog_type = 'glue' )", BETA, allow_database_glue_catalog) \ DECLARE_WITH_ALIAS(Bool, allow_experimental_analyzer, true, R"( @@ -6854,6 +6870,35 @@ Possible values: )", 0) \ DECLARE(Bool, use_roaring_bitmap_iceberg_positional_deletes, false, R"( Use roaring bitmap for iceberg positional deletes. +)", 0) \ + DECLARE(Bool, export_merge_tree_part_overwrite_file_if_exists, false, R"( +Overwrite file if it already exists when exporting a merge tree part +)", 0) \ + DECLARE(Bool, export_merge_tree_partition_force_export, false, R"( +Ignore existing partition export and overwrite the zookeeper entry +)", 0) \ + DECLARE(UInt64, export_merge_tree_partition_max_retries, 3, R"( +Maximum number of retries for exporting a merge tree part in an export partition task +)", 0) \ + DECLARE(UInt64, export_merge_tree_partition_manifest_ttl, 180, R"( +Determines how long the manifest will live in ZooKeeper. It prevents the same partition from being exported twice to the same destination. +This setting does not affect / delete in progress tasks. It'll only cleanup the completed ones. +)", 0) \ + DECLARE(MergeTreePartExportFileAlreadyExistsPolicy, export_merge_tree_part_file_already_exists_policy, MergeTreePartExportFileAlreadyExistsPolicy::skip, R"( +Possible values: +- skip - Skip the file if it already exists. +- error - Throw an error if the file already exists. +- overwrite - Overwrite the file. +)", 0) \ + DECLARE(Timezone, iceberg_timezone_for_timestamptz, "UTC", R"( +Timezone for Iceberg timestamptz field. + +Possible values: + +- Any valid timezone, e.g. `Europe/Berlin`, `UTC` or `Zulu` +- `` (empty value) - use session timezone + +Default value is `UTC`. )", 0) \ \ /* ####################################################### */ \ @@ -6881,6 +6926,12 @@ Allows creation of tables with the [TimeSeries](../../engines/table-engines/inte - 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. - 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. )", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_hybrid_table, false, R"( +Allows creation of tables with the [Hybrid](../../engines/table-engines/special/hybrid.md) table engine. +)", EXPERIMENTAL) \ + DECLARE(Bool, hybrid_table_auto_cast_columns, true, R"( +Automatically cast columns to the schema defined in Hybrid tables when remote segments expose different physical types. Works only with analyzer. 
Enabled by default, does nothing if (experimental) Hybrid tables are disabled; disable it if it causes issues. Segment schemas are cached when the Hybrid table is created or attached; if a segment schema changes later, detach/attach or recreate the Hybrid table so the cached headers stay in sync. +)", 0) \ DECLARE(Bool, allow_experimental_codecs, false, R"( If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing). )", EXPERIMENTAL) \ @@ -6970,6 +7021,15 @@ Enable PRQL - an alternative to SQL. )", EXPERIMENTAL) \ DECLARE(Bool, enable_adaptive_memory_spill_scheduler, false, R"( Trigger processor to spill data into external storage adpatively. grace join is supported at present. +)", EXPERIMENTAL) \ + DECLARE(String, object_storage_cluster, "", R"( +Cluster to make distributed requests to object storages with alternative syntax. +)", EXPERIMENTAL) \ + DECLARE(UInt64, object_storage_max_nodes, 0, R"( +Limit on the number of hosts used for a request in object storage cluster table functions - azureBlobStorageCluster, s3Cluster, hdfsCluster, etc. +Possible values: +- Positive integer. +- 0 — All hosts in the cluster. )", EXPERIMENTAL) \ DECLARE(Bool, allow_experimental_delta_kernel_rs, true, R"( Allow experimental delta-kernel-rs implementation. @@ -6985,6 +7045,9 @@ Write full paths (including s3://) into iceberg metadata files. )", EXPERIMENTAL) \ DECLARE(String, iceberg_metadata_compression_method, "", R"( Method to compress `.metadata.json` file. +)", EXPERIMENTAL) \ + DECLARE(Bool, use_object_storage_list_objects_cache, false, R"( +Cache the list of objects returned by list objects calls in object storage. )", EXPERIMENTAL) \ DECLARE(Bool, make_distributed_plan, false, R"( Make distributed query plan. @@ -7001,6 +7064,19 @@ Default number of tasks for parallel reading in distributed query. Tasks are spr DECLARE(Bool, distributed_plan_optimize_exchanges, true, R"( Removes unnecessary exchanges in distributed query plan. Disable it for debugging. )", 0) \ + DECLARE(UInt64, lock_object_storage_task_distribution_ms, 500, R"( +In distributed object storage queries, do not distribute tasks to non-prefetched nodes while the prefetched node is active. +Determines how long a free executor node (one that has finished processing all of its assigned tasks) should wait before "stealing" tasks from the queues of currently busy executor nodes. + +Possible values: + +- 0 - steal tasks immediately after freeing up. +- >0 - wait for the specified period of time before stealing tasks. + +Having this `>0` helps with cache reuse and might improve overall query time, because a busy node might have warmed-up caches for this specific task, while a free node would need to fetch lots of data from S3, which might take longer than just waiting for the busy node and would generate extra traffic. +)", EXPERIMENTAL) \ DECLARE(String, distributed_plan_force_exchange_kind, "", R"( Force specified kind of Exchange operators between distributed query stages. @@ -7024,12 +7100,24 @@ DECLARE(Bool, allow_experimental_ytsaurus_dictionary_source, false, R"( )", EXPERIMENTAL) \ DECLARE(Bool, distributed_plan_force_shuffle_aggregation, false, R"( Use Shuffle aggregation strategy instead of PartialAggregation + Merge in distributed query plan. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_iceberg_read_optimization, true, R"( +Allow Iceberg read optimization based on Iceberg metadata.
+)", EXPERIMENTAL) \ + DECLARE(Bool, allow_retries_in_cluster_requests, false, R"( +Allow retries in cluster request, when one node goes offline +)", EXPERIMENTAL) \ + DECLARE(Bool, object_storage_remote_initiator, false, R"( +Execute request to object storage as remote on one of object_storage_cluster nodes. )", EXPERIMENTAL) \ \ /** Experimental timeSeries* aggregate functions. */ \ DECLARE_WITH_ALIAS(Bool, allow_experimental_time_series_aggregate_functions, false, R"( Experimental timeSeries* aggregate functions for Prometheus-like timeseries resampling, rate, delta calculation. )", EXPERIMENTAL, allow_experimental_ts_to_grid_aggregate_function) \ + DECLARE(Bool, allow_experimental_export_merge_tree_part, true, R"( +Experimental export merge tree part. +)", EXPERIMENTAL) \ \ DECLARE(String, promql_database, "", R"( Specifies the database name used by the 'promql' dialect. Empty string means the current database. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7f284dd78b4e..c70767e21aca 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -58,6 +58,7 @@ class WriteBuffer; M(CLASS_NAME, DistributedCachePoolBehaviourOnLimit) /* Cloud only */ \ M(CLASS_NAME, DistributedDDLOutputMode) \ M(CLASS_NAME, DistributedProductMode) \ + M(CLASS_NAME, ObjectStorageClusterJoinMode) \ M(CLASS_NAME, Double) \ M(CLASS_NAME, EscapingRule) \ M(CLASS_NAME, Float) \ @@ -80,6 +81,7 @@ class WriteBuffer; M(CLASS_NAME, LogsLevel) \ M(CLASS_NAME, Map) \ M(CLASS_NAME, MaxThreads) \ + M(CLASS_NAME, MergeTreePartExportFileAlreadyExistsPolicy) \ M(CLASS_NAME, Milliseconds) \ M(CLASS_NAME, MsgPackUUIDRepresentation) \ M(CLASS_NAME, MySQLDataTypesSupport) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 1ba731c37bbf..3e3f5e3f7608 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -39,6 +39,24 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() /// controls new feature and it's 'true' by default, use 'false' as previous_value). /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) /// Note: please check if the key already exists to prevent duplicate entries. + addSettingsChanges(settings_changes_history, "25.8.9.2000", + { + {"allow_experimental_iceberg_read_optimization", true, true, "New setting."}, + {"object_storage_cluster_join_mode", "allow", "allow", "New setting"}, + {"lock_object_storage_task_distribution_ms", 500, 500, "Raised the value to 500 to avoid hoping tasks between executors."}, + {"allow_retries_in_cluster_requests", false, false, "New setting"}, + {"object_storage_remote_initiator", false, false, "New setting."}, + {"allow_experimental_export_merge_tree_part", false, false, "New setting."}, + {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, + {"allow_experimental_export_merge_tree_part", false, true, "Turned ON by default for Antalya."}, + {"export_merge_tree_partition_force_export", false, false, "New setting."}, + {"export_merge_tree_partition_max_retries", 3, 3, "New setting."}, + {"export_merge_tree_partition_manifest_ttl", 180, 180, "New setting."}, + {"export_merge_tree_part_file_already_exists_policy", "skip", "skip", "New setting."}, + {"iceberg_timezone_for_timestamptz", "UTC", "UTC", "New setting."}, + {"hybrid_table_auto_cast_columns", true, true, "New setting to automatically cast Hybrid table columns when segments disagree on types. 
Default enabled."}, + {"allow_experimental_hybrid_table", false, false, "Added new setting to allow the Hybrid table engine."} + }); addSettingsChanges(settings_changes_history, "25.8", { {"output_format_json_quote_64bit_integers", true, false, "Disable quoting of the 64 bit integers in JSON by default"}, @@ -110,6 +128,7 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"allow_experimental_lightweight_update", false, true, "Lightweight updates were moved to Beta."}, {"s3_slow_all_threads_after_retryable_error", false, false, "Added an alias for setting `backup_slow_all_threads_after_retryable_s3_error`"}, {"iceberg_metadata_log_level", "none", "none", "New setting."}, + {"use_object_storage_list_objects_cache", false, false, "New setting."}, }); addSettingsChanges(settings_changes_history, "25.7", { @@ -133,6 +152,18 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"allow_experimental_insert_into_iceberg", false, false, "New setting."}, /// RELEASE CLOSED }); + addSettingsChanges(settings_changes_history, "25.6.5.2000", + { + {"allow_experimental_database_iceberg", false, true, "Turned ON by default for Antalya"}, + {"allow_experimental_database_unity_catalog", false, true, "Turned ON by default for Antalya"}, + {"allow_experimental_database_glue_catalog", false, true, "Turned ON by default for Antalya"}, + {"output_format_parquet_enum_as_byte_array", true, true, "Enable writing Enum as byte array in Parquet by default"}, + {"object_storage_cluster", "", "", "New setting"}, + {"object_storage_max_nodes", 0, 0, "New setting"}, + {"allow_experimental_export_merge_tree_part", false, false, "New setting."}, + {"export_merge_tree_part_overwrite_file_if_exists", false, false, "New setting."}, + {"lock_object_storage_task_distribution_ms", 0, 0, "New setting."}, + }); addSettingsChanges(settings_changes_history, "25.6", { /// RELEASE CLOSED @@ -254,6 +285,11 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory() {"parallel_hash_join_threshold", 0, 0, "New setting"}, /// Release closed. Please use 25.4 }); + addSettingsChanges(settings_changes_history, "24.12.2.20000", + { + // Altinity Antalya modifications atop of 24.12 + {"input_format_parquet_use_metadata_cache", true, true, "New setting, turned ON by default"}, // https://github.com/Altinity/ClickHouse/pull/586 + }); addSettingsChanges(settings_changes_history, "25.2", { /// Release closed. 
Please use 25.3 diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 7a586f51168e..2fb4f1668ed4 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -90,6 +90,11 @@ IMPLEMENT_SETTING_ENUM(DistributedProductMode, ErrorCodes::UNKNOWN_DISTRIBUTED_P {"global", DistributedProductMode::GLOBAL}, {"allow", DistributedProductMode::ALLOW}}) +IMPLEMENT_SETTING_ENUM(ObjectStorageClusterJoinMode, ErrorCodes::BAD_ARGUMENTS, + {{"local", ObjectStorageClusterJoinMode::LOCAL}, + {"global", ObjectStorageClusterJoinMode::GLOBAL}, + {"allow", ObjectStorageClusterJoinMode::ALLOW}}) + IMPLEMENT_SETTING_ENUM(QueryResultCacheNondeterministicFunctionHandling, ErrorCodes::BAD_ARGUMENTS, {{"throw", QueryResultCacheNondeterministicFunctionHandling::Throw}, @@ -365,4 +370,6 @@ IMPLEMENT_SETTING_ENUM( {"manifest_list_entry", IcebergMetadataLogLevel::ManifestListEntry}, {"manifest_file_metadata", IcebergMetadataLogLevel::ManifestFileMetadata}, {"manifest_file_entry", IcebergMetadataLogLevel::ManifestFileEntry}}) + +IMPLEMENT_SETTING_AUTO_ENUM(MergeTreePartExportFileAlreadyExistsPolicy, ErrorCodes::BAD_ARGUMENTS); } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 18873a0790ae..935e0ee8b615 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -163,6 +163,16 @@ enum class DistributedProductMode : uint8_t DECLARE_SETTING_ENUM(DistributedProductMode) +/// The setting for executing object storage cluster function or table JOIN sections. +enum class ObjectStorageClusterJoinMode : uint8_t +{ + LOCAL, /// Convert to local query + GLOBAL, /// Convert to global query + ALLOW /// Enable +}; + +DECLARE_SETTING_ENUM(ObjectStorageClusterJoinMode) + /// How the query result cache handles queries with non-deterministic functions, e.g. now() enum class QueryResultCacheNondeterministicFunctionHandling : uint8_t { @@ -470,4 +480,14 @@ enum class IcebergMetadataLogLevel : uint8_t }; DECLARE_SETTING_ENUM(IcebergMetadataLogLevel) + +enum class MergeTreePartExportFileAlreadyExistsPolicy : uint8_t +{ + skip, + error, + overwrite, +}; + +DECLARE_SETTING_ENUM(MergeTreePartExportFileAlreadyExistsPolicy) + } diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index e31f23e18c59..d6380d2fdbca 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -48,11 +48,8 @@ std::string concatenateName(const std::string & nested_table_name, const std::st */ std::pair splitName(const std::string & name, bool reverse) { - auto idx = (reverse ? name.find_last_of('.') : name.find_first_of('.')); - if (idx == std::string::npos || idx == 0 || idx + 1 == name.size()) - return {name, {}}; - - return {name.substr(0, idx), name.substr(idx + 1)}; + auto res = splitName(std::string_view(name), reverse); + return {std::string(res.first), std::string(res.second)}; } std::pair splitName(std::string_view name, bool reverse) diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 8f2ab20ac879..894af62092ba 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -17,6 +17,8 @@ namespace Nested std::string concatenateName(const std::string & nested_table_name, const std::string & nested_field_name); /// Splits name of compound identifier by first/last dot (depending on 'reverse' parameter). + /// If the name is not nested (no dot or dot at start/end), + /// returns {name, ""}. 
std::pair splitName(const std::string & name, bool reverse = false); std::pair splitName(std::string_view name, bool reverse = false); diff --git a/src/Databases/DataLake/Common.cpp b/src/Databases/DataLake/Common.cpp index 681dd957b43f..8946d3412d70 100644 --- a/src/Databases/DataLake/Common.cpp +++ b/src/Databases/DataLake/Common.cpp @@ -61,14 +61,14 @@ std::vector splitTypeArguments(const String & type_str) return args; } -DB::DataTypePtr getType(const String & type_name, bool nullable, const String & prefix) +DB::DataTypePtr getType(const String & type_name, bool nullable, DB::ContextPtr context, const String & prefix) { String name = trim(type_name); if (name.starts_with("array<") && name.ends_with(">")) { String inner = name.substr(6, name.size() - 7); - return std::make_shared(getType(inner, nullable)); + return std::make_shared(getType(inner, nullable, context)); } if (name.starts_with("map<") && name.ends_with(">")) @@ -79,7 +79,7 @@ DB::DataTypePtr getType(const String & type_name, bool nullable, const String & if (args.size() != 2) throw DB::Exception(DB::ErrorCodes::DATALAKE_DATABASE_ERROR, "Invalid data type {}", type_name); - return std::make_shared(getType(args[0], false), getType(args[1], nullable)); + return std::make_shared(getType(args[0], false, context), getType(args[1], nullable, context)); } if (name.starts_with("struct<") && name.ends_with(">")) @@ -101,13 +101,13 @@ DB::DataTypePtr getType(const String & type_name, bool nullable, const String & String full_field_name = prefix.empty() ? field_name : prefix + "." + field_name; field_names.push_back(full_field_name); - field_types.push_back(getType(field_type, nullable, full_field_name)); + field_types.push_back(getType(field_type, nullable, context, full_field_name)); } return std::make_shared(field_types, field_names); } - return nullable ? DB::makeNullable(DB::Iceberg::IcebergSchemaProcessor::getSimpleType(name)) - : DB::Iceberg::IcebergSchemaProcessor::getSimpleType(name); + return nullable ? 
DB::makeNullable(DB::Iceberg::IcebergSchemaProcessor::getSimpleType(name, context)) + : DB::Iceberg::IcebergSchemaProcessor::getSimpleType(name, context); } std::pair parseTableName(const std::string & name) diff --git a/src/Databases/DataLake/Common.h b/src/Databases/DataLake/Common.h index cd4b6214e343..9b0dd7c626a6 100644 --- a/src/Databases/DataLake/Common.h +++ b/src/Databases/DataLake/Common.h @@ -2,6 +2,7 @@ #include #include +#include namespace DataLake { @@ -10,7 +11,7 @@ String trim(const String & str); std::vector splitTypeArguments(const String & type_str); -DB::DataTypePtr getType(const String & type_name, bool nullable, const String & prefix = ""); +DB::DataTypePtr getType(const String & type_name, bool nullable, DB::ContextPtr context, const String & prefix = ""); /// Parse a string, containing at least one dot, into a two substrings: /// A.B.C.D.E -> A.B.C.D and E, where diff --git a/src/Databases/DataLake/DataLakeConstants.h b/src/Databases/DataLake/DataLakeConstants.h index eaa8f5a276e6..02f6a7dcfcd7 100644 --- a/src/Databases/DataLake/DataLakeConstants.h +++ b/src/Databases/DataLake/DataLakeConstants.h @@ -8,6 +8,7 @@ namespace DataLake { static constexpr auto DATABASE_ENGINE_NAME = "DataLakeCatalog"; +static constexpr auto DATABASE_ALIAS_NAME = "Iceberg"; static constexpr std::string_view FILE_PATH_PREFIX = "file:/"; /// Some catalogs (Unity or Glue) may store not only Iceberg/DeltaLake tables but other kinds of "tables" diff --git a/src/Databases/DataLake/DatabaseDataLake.cpp b/src/Databases/DataLake/DatabaseDataLake.cpp index 15ca4b9dd3e7..df0114095f61 100644 --- a/src/Databases/DataLake/DatabaseDataLake.cpp +++ b/src/Databases/DataLake/DatabaseDataLake.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ namespace DatabaseDataLakeSetting extern const DatabaseDataLakeSettingsString oauth_server_uri; extern const DatabaseDataLakeSettingsBool oauth_server_use_request_body; extern const DatabaseDataLakeSettingsBool vended_credentials; + extern const DatabaseDataLakeSettingsString object_storage_cluster; extern const DatabaseDataLakeSettingsString aws_access_key_id; extern const DatabaseDataLakeSettingsString aws_secret_access_key; extern const DatabaseDataLakeSettingsString region; @@ -176,7 +178,7 @@ std::shared_ptr DatabaseDataLake::getCatalog() const return catalog_impl; } -std::shared_ptr DatabaseDataLake::getConfiguration( +StorageObjectStorageConfigurationPtr DatabaseDataLake::getConfiguration( DatabaseDataLakeStorageType type, DataLakeStorageSettingsPtr storage_settings) const { @@ -324,7 +326,7 @@ StoragePtr DatabaseDataLake::tryGetTableImpl(const String & name, ContextPtr con auto [namespace_name, table_name] = DataLake::parseTableName(name); - if (!catalog->tryGetTableMetadata(namespace_name, table_name, table_metadata)) + if (!catalog->tryGetTableMetadata(namespace_name, table_name, context_, table_metadata)) return nullptr; if (ignore_if_not_iceberg && !table_metadata.isDefaultReadableTable()) @@ -423,25 +425,27 @@ StoragePtr DatabaseDataLake::tryGetTableImpl(const String & name, ContextPtr con /// with_table_structure = false: because there will be /// no table structure in table definition AST. 
- StorageObjectStorageConfiguration::initialize(*configuration, args, context_copy, /* with_table_structure */false); + configuration->initialize(args, context_copy, /* with_table_structure */false); - return std::make_shared( + auto cluster_name = settings[DatabaseDataLakeSetting::object_storage_cluster].value; + + return std::make_shared( + cluster_name, configuration, configuration->createObjectStorage(context_copy, /* is_readonly */ false), - context_copy, StorageID(getDatabaseName(), name), - /* columns */columns, - /* constraints */ConstraintsDescription{}, - /* comment */"", + /* columns */ columns, + /* constraints */ ConstraintsDescription{}, + /* partition_by */ nullptr, + context_copy, + /* comment */ "", getFormatSettings(context_copy), LoadingStrictnessLevel::CREATE, getCatalog(), - /* if_not_exists*/true, - /* is_datalake_query*/true, - /* distributed_processing */false, - /* partition_by */nullptr, - /* is_table_function */false, - /* lazy_init */true); + /* if_not_exists */ true, + /* is_datalake_query */ true, + /* is_table_function */ true, + /* lazy_init */ true); } void DatabaseDataLake::dropTable( /// NOLINT @@ -628,7 +632,7 @@ ASTPtr DatabaseDataLake::getCreateDatabaseQuery() const ASTPtr DatabaseDataLake::getCreateTableQueryImpl( const String & name, - ContextPtr /* context_ */, + ContextPtr context_, bool throw_on_error) const { auto catalog = getCatalog(); @@ -636,7 +640,7 @@ ASTPtr DatabaseDataLake::getCreateTableQueryImpl( const auto [namespace_name, table_name] = DataLake::parseTableName(name); - if (!catalog->tryGetTableMetadata(namespace_name, table_name, table_metadata)) + if (!catalog->tryGetTableMetadata(namespace_name, table_name, context_, table_metadata)) { if (throw_on_error) throw Exception(ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY, "Table `{}` doesn't exist", name); @@ -729,6 +733,11 @@ void registerDatabaseDataLake(DatabaseFactory & factory) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Engine `{}` must have arguments", database_engine_name); } + if (database_engine_name == "Iceberg" && catalog_type != DatabaseDataLakeCatalogType::ICEBERG_REST) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Engine `Iceberg` must have `rest` catalog type only"); + } + for (auto & engine_arg : engine_args) engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, args.context); @@ -810,6 +819,7 @@ void registerDatabaseDataLake(DatabaseFactory & factory) args.uuid); }; factory.registerDatabase("DataLakeCatalog", create_fn, { .supports_arguments = true, .supports_settings = true }); + factory.registerDatabase("Iceberg", create_fn, { .supports_arguments = true, .supports_settings = true }); } } diff --git a/src/Databases/DataLake/DatabaseDataLake.h b/src/Databases/DataLake/DatabaseDataLake.h index c46b0eb3dcd2..547ec23f1bcc 100644 --- a/src/Databases/DataLake/DatabaseDataLake.h +++ b/src/Databases/DataLake/DatabaseDataLake.h @@ -88,7 +88,7 @@ class DatabaseDataLake final : public IDatabase, WithContext void validateSettings(); std::shared_ptr getCatalog() const; - std::shared_ptr getConfiguration( + StorageObjectStorageConfigurationPtr getConfiguration( DatabaseDataLakeStorageType type, DataLakeStorageSettingsPtr storage_settings) const; diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index c0233f362ec7..59a256d581a4 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -275,6 +275,7 @@ bool GlueCatalog::existsTable(const std::string & database_name, const std::stri 
bool GlueCatalog::tryGetTableMetadata( const std::string & database_name, const std::string & table_name, + DB::ContextPtr /* context_ */, TableMetadata & result) const { Aws::Glue::Model::GetTableRequest request; @@ -316,11 +317,31 @@ bool GlueCatalog::tryGetTableMetadata( { result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = table_params.at("metadata_location")}); } + else if (table_outcome.GetStorageDescriptor().LocationHasBeenSet()) + { + const auto & location = table_outcome.GetStorageDescriptor().GetLocation(); + + std::string location_with_slash = location; + if (!location_with_slash.ends_with('/')) + location_with_slash += '/'; + + // Resolve the actual metadata file path based on table location + std::string resolved_metadata_path = resolveMetadataPathFromTableLocation(location_with_slash, result); + if (resolved_metadata_path.empty()) + { + result.setTableIsNotReadable(fmt::format("Could not determine metadata_location of table `{}`. ", + database_name + "." + table_name)); + } + else + { + result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path}); + } + } else { - result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \ - "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters", - database_name + "." + table_name)); + result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \ + "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters", + database_name + "." + table_name)); } }; @@ -351,7 +372,7 @@ bool GlueCatalog::tryGetTableMetadata( column_type = "timestamptz"; } - schema.push_back({column.GetName(), getType(column_type, can_be_nullable)}); + schema.push_back({column.GetName(), getType(column_type, can_be_nullable, getContext())}); } result.setSchema(schema); } @@ -373,9 +394,10 @@ bool GlueCatalog::tryGetTableMetadata( void GlueCatalog::getTableMetadata( const std::string & database_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const { - if (!tryGetTableMetadata(database_name, table_name, result)) + if (!tryGetTableMetadata(database_name, table_name, context_, result)) { throw DB::Exception( DB::ErrorCodes::DATALAKE_DATABASE_ERROR, @@ -414,43 +436,47 @@ bool GlueCatalog::empty() const bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const { String metadata_path; + String metadata_uri; if (auto table_specific_properties = table_metadata.getDataLakeSpecificProperties(); table_specific_properties.has_value()) { metadata_path = table_specific_properties->iceberg_metadata_file_location; + metadata_uri = metadata_path; if (metadata_path.starts_with("s3:/")) metadata_path = metadata_path.substr(5); - // Delete bucket + // Delete bucket from path std::size_t pos = metadata_path.find('/'); if (pos != std::string::npos) metadata_path = metadata_path.substr(pos + 1); } else - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Metadata specific properties should be defined"); + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to read table metadata, reason why table is unreadable: {}", table_metadata.getReasonWhyTableIsUnreadable()); - if (!metadata_objects.get(metadata_path)) + if (!metadata_objects.get(metadata_uri)) { 
DB::ASTStorage * storage = table_engine_definition->as(); DB::ASTs args = storage->engine->arguments->children; - auto table_endpoint = settings.storage_endpoint; + String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : metadata_uri; + if (args.empty()) - args.emplace_back(std::make_shared(table_endpoint)); + args.emplace_back(std::make_shared(storage_endpoint)); else - args[0] = std::make_shared(table_endpoint); + args[0] = std::make_shared(storage_endpoint); - if (args.size() == 1 && table_metadata.hasStorageCredentials()) + if (args.size() == 1) { - auto storage_credentials = table_metadata.getStorageCredentials(); - if (storage_credentials) - storage_credentials->addCredentialsToEngineArgs(args); + if (table_metadata.hasStorageCredentials()) + table_metadata.getStorageCredentials()->addCredentialsToEngineArgs(args); + else if (!credentials.IsExpiredOrEmpty()) + DataLake::S3Credentials(credentials.GetAWSAccessKeyId(), credentials.GetAWSSecretKey(), credentials.GetSessionToken()).addCredentialsToEngineArgs(args); } auto storage_settings = std::make_shared(); storage_settings->loadFromSettingsChanges(settings.allChanged()); auto configuration = std::make_shared(storage_settings); - DB::StorageObjectStorageConfiguration::initialize(*configuration, args, getContext(), false); + configuration->initialize(args, getContext(), false); auto object_storage = configuration->createObjectStorage(getContext(), true); const auto & read_settings = getContext()->getReadSettings(); @@ -458,14 +484,14 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet DB::StoredObject metadata_stored_object(metadata_path); auto read_buf = object_storage->readObject(metadata_stored_object, read_settings); String metadata_file; - readString(metadata_file, *read_buf); + readStringUntilEOF(metadata_file, *read_buf); Poco::JSON::Parser parser; Poco::Dynamic::Var result = parser.parse(metadata_file); auto metadata_object = result.extract(); - metadata_objects.set(metadata_path, std::make_shared(metadata_object)); + metadata_objects.set(metadata_uri, std::make_shared(metadata_object)); } - auto metadata_object = *metadata_objects.get(metadata_path); + auto metadata_object = *metadata_objects.get(metadata_uri); auto current_schema_id = metadata_object->getValue("current-schema-id"); auto schemas = metadata_object->getArray(DB::Iceberg::f_schemas); for (size_t i = 0; i < schemas->size(); ++i) @@ -486,6 +512,125 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet return false; } +/// This function tries to resolve the metadata file path by the following means: +/// 1. Tries to read version-hint.text to get the latest version. +/// 2. Lists all *.metadata.json files in the metadata directory and takes the most recent one. +String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const +{ + // Construct path to version-hint.text + String version_hint_path = table_location + "metadata/version-hint.text"; + + DB::ASTStorage * storage = table_engine_definition->as(); + DB::ASTs args = storage->engine->arguments->children; + + String storage_endpoint = !settings.storage_endpoint.empty() ?
settings.storage_endpoint : table_location; + if (args.empty()) + args.emplace_back(std::make_shared(storage_endpoint)); + else + args[0] = std::make_shared(storage_endpoint); + + if (args.size() == 1 && table_metadata.hasStorageCredentials()) + { + auto storage_credentials = table_metadata.getStorageCredentials(); + if (storage_credentials) + storage_credentials->addCredentialsToEngineArgs(args); + } + + auto storage_settings = std::make_shared(); + storage_settings->loadFromSettingsChanges(settings.allChanged()); + auto configuration = std::make_shared(storage_settings); + configuration->initialize(args, getContext(), false); + + auto object_storage = configuration->createObjectStorage(getContext(), true); + const auto & read_settings = getContext()->getReadSettings(); + + try + { + // Try to read version-hint.text to get the latest version + String version_hint_object_path = version_hint_path; + if (version_hint_object_path.starts_with("s3://")) + { + version_hint_object_path = version_hint_object_path.substr(5); + // Remove bucket from path + std::size_t pos = version_hint_object_path.find('/'); + if (pos != std::string::npos) + version_hint_object_path = version_hint_object_path.substr(pos + 1); + } + + DB::StoredObject version_hint_stored_object(version_hint_object_path); + auto version_hint_buf = object_storage->readObject(version_hint_stored_object, read_settings); + String version_str; + readString(version_str, *version_hint_buf); + + boost::algorithm::trim(version_str); + + LOG_TRACE(log, "Read version {} from version-hint.text for table location '{}'", version_str, table_location); + + return table_location + "metadata/v" + version_str + "-metadata.json"; + } + catch (...) + { + LOG_TRACE(log, "Could not read version-hint.text from '{}', trying to find latest metadata file", version_hint_path); + + try + { + String bucket_with_prefix; + String metadata_dir = table_location + "metadata/"; + String metadata_dir_path = metadata_dir; + + if (metadata_dir_path.starts_with("s3://")) + { + metadata_dir_path = metadata_dir_path.substr(5); + // Remove bucket from path + std::size_t pos = metadata_dir_path.find('/'); + if (pos != std::string::npos) + { + metadata_dir_path = metadata_dir_path.substr(pos + 1); + bucket_with_prefix = table_location.substr(0, pos + 6); + } + } + else + return ""; + + // List all files in metadata directory + DB::PathsWithMetadata files; + object_storage->listObjects(metadata_dir_path, files, 0); + + // Filter for .metadata.json files and find the most recent one + String latest_metadata_file; + std::optional latest_metadata; + + for (const auto & file : files) + { + if (file->getPath().ends_with(".metadata.json")) + { + // Get file metadata to check last modified time + if (!latest_metadata.has_value() || + (file->metadata->last_modified > latest_metadata->last_modified)) + { + latest_metadata_file = file->getPath(); + latest_metadata = file->metadata; + } + } + } + + if (!latest_metadata_file.empty()) + { + LOG_TRACE(log, "Found latest metadata file: {}", latest_metadata_file); + return bucket_with_prefix + latest_metadata_file; + } + + LOG_TRACE(log, "No <...>.metadata.json files found,"); + return ""; + } + catch (...) 
+ { + LOG_TRACE(log, "Failed to list metadata directory"); + return ""; + } + } +} + void GlueCatalog::createNamespaceIfNotExists(const String & namespace_name) const { Aws::Glue::Model::CreateDatabaseRequest create_request; diff --git a/src/Databases/DataLake/GlueCatalog.h b/src/Databases/DataLake/GlueCatalog.h index bed6e93c5dcc..bcecfd2368ca 100644 --- a/src/Databases/DataLake/GlueCatalog.h +++ b/src/Databases/DataLake/GlueCatalog.h @@ -40,11 +40,13 @@ class GlueCatalog final : public ICatalog, private DB::WithContext void getTableMetadata( const std::string & database_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; bool tryGetTableMetadata( const std::string & database_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; std::optional getStorageType() const override @@ -81,6 +83,8 @@ class GlueCatalog final : public ICatalog, private DB::WithContext /// This method allows to clarify the actual type of the timestamp column. bool classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const; + String resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const; + mutable DB::CacheBase metadata_objects; }; diff --git a/src/Databases/DataLake/HiveCatalog.cpp b/src/Databases/DataLake/HiveCatalog.cpp index d174034349b0..7a80571fce56 100644 --- a/src/Databases/DataLake/HiveCatalog.cpp +++ b/src/Databases/DataLake/HiveCatalog.cpp @@ -96,13 +96,21 @@ bool HiveCatalog::existsTable(const std::string & namespace_name, const std::str return true; } -void HiveCatalog::getTableMetadata(const std::string & namespace_name, const std::string & table_name, TableMetadata & result) const +void HiveCatalog::getTableMetadata( + const std::string & namespace_name, + const std::string & table_name, + DB::ContextPtr context_, + TableMetadata & result) const { - if (!tryGetTableMetadata(namespace_name, table_name, result)) + if (!tryGetTableMetadata(namespace_name, table_name, context_, result)) throw DB::Exception(DB::ErrorCodes::DATALAKE_DATABASE_ERROR, "No response from iceberg catalog"); } -bool HiveCatalog::tryGetTableMetadata(const std::string & namespace_name, const std::string & table_name, TableMetadata & result) const +bool HiveCatalog::tryGetTableMetadata( + const std::string & namespace_name, + const std::string & table_name, + DB::ContextPtr context_, + TableMetadata & result) const { Apache::Hadoop::Hive::Table table; @@ -130,7 +138,7 @@ bool HiveCatalog::tryGetTableMetadata(const std::string & namespace_name, const auto columns = table.sd.cols; for (const auto & column : columns) { - schema.push_back({column.name, getType(column.type, true)}); + schema.push_back({column.name, getType(column.type, true, context_)}); } result.setSchema(schema); } diff --git a/src/Databases/DataLake/HiveCatalog.h b/src/Databases/DataLake/HiveCatalog.h index 29b4e6ce6c63..0fba0e132486 100644 --- a/src/Databases/DataLake/HiveCatalog.h +++ b/src/Databases/DataLake/HiveCatalog.h @@ -38,9 +38,17 @@ class HiveCatalog final : public ICatalog, private DB::WithContext bool existsTable(const std::string & namespace_name, const std::string & table_name) const override; - void getTableMetadata(const std::string & namespace_name, const std::string & table_name, TableMetadata & result) const override; - - bool tryGetTableMetadata(const std::string & namespace_name, const std::string & table_name, TableMetadata & result) const override; + void 
getTableMetadata( + const std::string & namespace_name, + const std::string & table_name, + DB::ContextPtr context_, + TableMetadata & result) const override; + + bool tryGetTableMetadata( + const std::string & namespace_name, + const std::string & table_name, + DB::ContextPtr context_, + TableMetadata & result) const override; std::optional getStorageType() const override; diff --git a/src/Databases/DataLake/ICatalog.cpp b/src/Databases/DataLake/ICatalog.cpp index 363293933d53..c7ab78030411 100644 --- a/src/Databases/DataLake/ICatalog.cpp +++ b/src/Databases/DataLake/ICatalog.cpp @@ -84,13 +84,19 @@ void TableMetadata::setLocation(const std::string & location_) auto pos_to_path = location_.substr(pos_to_bucket).find('/'); if (pos_to_path == std::string::npos) - throw DB::Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "Unexpected location format: {}", location_); - - pos_to_path = pos_to_bucket + pos_to_path; + { // empty path + location_without_path = location_; + path.clear(); + bucket = location_.substr(pos_to_bucket); + } + else + { + pos_to_path = pos_to_bucket + pos_to_path; - location_without_path = location_.substr(0, pos_to_path); - path = location_.substr(pos_to_path + 1); - bucket = location_.substr(pos_to_bucket, pos_to_path - pos_to_bucket); + location_without_path = location_.substr(0, pos_to_path); + path = location_.substr(pos_to_path + 1); + bucket = location_.substr(pos_to_bucket, pos_to_path - pos_to_bucket); + } LOG_TEST(getLogger("TableMetadata"), "Parsed location without path: {}, path: {}", diff --git a/src/Databases/DataLake/ICatalog.h b/src/Databases/DataLake/ICatalog.h index ce3970d6d046..26964aa36433 100644 --- a/src/Databases/DataLake/ICatalog.h +++ b/src/Databases/DataLake/ICatalog.h @@ -8,6 +8,14 @@ #include #include +namespace DB +{ + +class Context; +using ContextPtr = std::shared_ptr; + +} + namespace DataLake { @@ -150,6 +158,7 @@ class ICatalog virtual void getTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context, TableMetadata & result) const = 0; /// Get table metadata in the given namespace. @@ -157,6 +166,7 @@ class ICatalog virtual bool tryGetTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context, TableMetadata & result) const = 0; /// Get storage type, where Iceberg tables' data is stored. diff --git a/src/Databases/DataLake/RestCatalog.cpp b/src/Databases/DataLake/RestCatalog.cpp index 3006c68e6083..39297f8b33a3 100644 --- a/src/Databases/DataLake/RestCatalog.cpp +++ b/src/Databases/DataLake/RestCatalog.cpp @@ -272,7 +272,7 @@ DB::ReadWriteBufferFromHTTPPtr RestCatalog::createReadBuffer( { const auto & context = getContext(); - Poco::URI url(base_url / endpoint); + Poco::URI url(base_url / endpoint, false); if (!params.empty()) url.setQueryParameters(params); @@ -511,7 +511,9 @@ DB::Names RestCatalog::parseTables(DB::ReadBuffer & buf, const std::string & bas for (size_t i = 0; i < identifiers_object->size(); ++i) { const auto current_table_json = identifiers_object->get(static_cast(i)).extract(); - const auto table_name = current_table_json->get("name").extract(); + const auto table_name_raw = current_table_json->get("name").extract(); + std::string table_name; + Poco::URI::encode(table_name_raw, "/", table_name); tables.push_back(base_namespace + "." 
+ table_name); if (limit && tables.size() >= limit) @@ -530,17 +532,18 @@ DB::Names RestCatalog::parseTables(DB::ReadBuffer & buf, const std::string & bas bool RestCatalog::existsTable(const std::string & namespace_name, const std::string & table_name) const { TableMetadata table_metadata; - return tryGetTableMetadata(namespace_name, table_name, table_metadata); + return tryGetTableMetadata(namespace_name, table_name, getContext(), table_metadata); } bool RestCatalog::tryGetTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const { try { - return getTableMetadataImpl(namespace_name, table_name, result); + return getTableMetadataImpl(namespace_name, table_name, context_, result); } catch (...) { @@ -552,15 +555,17 @@ bool RestCatalog::tryGetTableMetadata( void RestCatalog::getTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const { - if (!getTableMetadataImpl(namespace_name, table_name, result)) + if (!getTableMetadataImpl(namespace_name, table_name, context_, result)) throw DB::Exception(DB::ErrorCodes::DATALAKE_DATABASE_ERROR, "No response from iceberg catalog"); } bool RestCatalog::getTableMetadataImpl( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const { LOG_TEST(log, "Checking table {} in namespace {}", table_name, namespace_name); @@ -621,8 +626,8 @@ bool RestCatalog::getTableMetadataImpl( if (result.requiresSchema()) { // int format_version = metadata_object->getValue("format-version"); - auto schema_processor = DB::Iceberg::IcebergSchemaProcessor(); - auto id = DB::IcebergMetadata::parseTableSchema(metadata_object, schema_processor, log); + auto schema_processor = DB::Iceberg::IcebergSchemaProcessor(context_); + auto id = DB::IcebergMetadata::parseTableSchema(metadata_object, schema_processor, context_, log); auto schema = schema_processor.getClickhouseTableSchemaById(id); result.setSchema(*schema); } @@ -700,7 +705,7 @@ void RestCatalog::sendRequest(const String & endpoint, Poco::JSON::Object::Ptr r }; } - Poco::URI url(endpoint); + Poco::URI url(endpoint, false); auto wb = DB::BuilderRWBufferFromHTTP(url) .withConnectionGroup(DB::HTTPConnectionGroupType::HTTP) .withMethod(method) diff --git a/src/Databases/DataLake/RestCatalog.h b/src/Databases/DataLake/RestCatalog.h index 182e3de0ae55..a98e719ff09d 100644 --- a/src/Databases/DataLake/RestCatalog.h +++ b/src/Databases/DataLake/RestCatalog.h @@ -42,11 +42,13 @@ class RestCatalog final : public ICatalog, private DB::WithContext void getTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; bool tryGetTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; std::optional getStorageType() const override; @@ -126,6 +128,7 @@ class RestCatalog final : public ICatalog, private DB::WithContext bool getTableMetadataImpl( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const; Config loadConfig(); diff --git a/src/Databases/DataLake/UnityCatalog.cpp b/src/Databases/DataLake/UnityCatalog.cpp index 18c20931e608..8054d971c9e0 100644 --- a/src/Databases/DataLake/UnityCatalog.cpp +++ b/src/Databases/DataLake/UnityCatalog.cpp @@ -91,9 +91,10 @@ DB::Names 
UnityCatalog::getTables() const void UnityCatalog::getTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const { - if (!tryGetTableMetadata(namespace_name, table_name, result)) + if (!tryGetTableMetadata(namespace_name, table_name, context_, result)) throw DB::Exception(DB::ErrorCodes::DATALAKE_DATABASE_ERROR, "No response from unity catalog"); } @@ -136,6 +137,7 @@ void UnityCatalog::getCredentials(const std::string & table_id, TableMetadata & bool UnityCatalog::tryGetTableMetadata( const std::string & schema_name, const std::string & table_name, + DB::ContextPtr /* context_ */, TableMetadata & result) const { auto full_table_name = warehouse + "." + schema_name + "." + table_name; diff --git a/src/Databases/DataLake/UnityCatalog.h b/src/Databases/DataLake/UnityCatalog.h index 2e6262d6e5d7..9d4dc0a74877 100644 --- a/src/Databases/DataLake/UnityCatalog.h +++ b/src/Databases/DataLake/UnityCatalog.h @@ -34,11 +34,13 @@ class UnityCatalog final : public ICatalog, private DB::WithContext void getTableMetadata( const std::string & namespace_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; bool tryGetTableMetadata( const std::string & schema_name, const std::string & table_name, + DB::ContextPtr context_, TableMetadata & result) const override; std::optional getStorageType() const override { return std::nullopt; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index f99884741315..c736639608f6 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -2255,7 +2255,8 @@ bool DatabaseReplicated::shouldReplicateQuery(const ContextPtr & query_context, if (const auto * alter = query_ptr->as()) { if (alter->isAttachAlter() || alter->isFetchAlter() || alter->isDropPartitionAlter() - || is_keeper_map_table(query_ptr) || alter->isFreezeAlter() || alter->isUnlockSnapshot()) + || is_keeper_map_table(query_ptr) || alter->isFreezeAlter() || alter->isUnlockSnapshot() + || alter->isExportPartAlter()) return false; if (has_many_shards() || !is_replicated_table(query_ptr)) diff --git a/src/Databases/enableAllExperimentalSettings.cpp b/src/Databases/enableAllExperimentalSettings.cpp index 705a27c0905e..215ed5ad9248 100644 --- a/src/Databases/enableAllExperimentalSettings.cpp +++ b/src/Databases/enableAllExperimentalSettings.cpp @@ -61,6 +61,7 @@ void enableAllExperimentalSettings(ContextMutablePtr context) context->setSetting("allow_experimental_ytsaurus_table_engine", 1); context->setSetting("allow_experimental_ytsaurus_dictionary_source", 1); context->setSetting("allow_experimental_time_series_aggregate_functions", 1); + context->setSetting("allow_experimental_hybrid_table", 1); context->setSetting("allow_experimental_lightweight_update", 1); context->setSetting("allow_experimental_insert_into_iceberg", 1); context->setSetting("allow_experimental_iceberg_compaction", 1); diff --git a/src/Disks/DiskType.cpp b/src/Disks/DiskType.cpp index 0dafa86ce4fa..dbfba8e5686e 100644 --- a/src/Disks/DiskType.cpp +++ b/src/Disks/DiskType.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -MetadataStorageType metadataTypeFromString(const String & type) +MetadataStorageType metadataTypeFromString(const std::string & type) { auto check_type = Poco::toLower(type); if (check_type == "local") @@ -60,25 +60,49 @@ std::string DataSourceDescription::toString() const case DataSourceType::RAM: 
return "memory"; case DataSourceType::ObjectStorage: - { - switch (object_storage_type) - { - case ObjectStorageType::S3: - return "s3"; - case ObjectStorageType::HDFS: - return "hdfs"; - case ObjectStorageType::Azure: - return "azure_blob_storage"; - case ObjectStorageType::Local: - return "local_blob_storage"; - case ObjectStorageType::Web: - return "web"; - case ObjectStorageType::None: - return "none"; - case ObjectStorageType::Max: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected object storage type: Max"); - } - } + return DB::toString(object_storage_type); } } + +ObjectStorageType objectStorageTypeFromString(const std::string & type) +{ + auto check_type = Poco::toLower(type); + if (check_type == "s3") + return ObjectStorageType::S3; + if (check_type == "hdfs") + return ObjectStorageType::HDFS; + if (check_type == "azure_blob_storage" || check_type == "azure") + return ObjectStorageType::Azure; + if (check_type == "local_blob_storage" || check_type == "local") + return ObjectStorageType::Local; + if (check_type == "web") + return ObjectStorageType::Web; + if (check_type == "none") + return ObjectStorageType::None; + + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, + "Unknown object storage type: {}", type); +} + +std::string toString(ObjectStorageType type) +{ + switch (type) + { + case ObjectStorageType::S3: + return "s3"; + case ObjectStorageType::HDFS: + return "hdfs"; + case ObjectStorageType::Azure: + return "azure_blob_storage"; + case ObjectStorageType::Local: + return "local_blob_storage"; + case ObjectStorageType::Web: + return "web"; + case ObjectStorageType::None: + return "none"; + case ObjectStorageType::Max: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected object storage type: Max"); + } +} + } diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h index f8eb31b2f988..9662a0dc7b4e 100644 --- a/src/Disks/DiskType.h +++ b/src/Disks/DiskType.h @@ -36,8 +36,10 @@ enum class MetadataStorageType : uint8_t Memory, }; -MetadataStorageType metadataTypeFromString(const String & type); -String toString(DataSourceType data_source_type); +MetadataStorageType metadataTypeFromString(const std::string & type); + +ObjectStorageType objectStorageTypeFromString(const std::string & type); +std::string toString(ObjectStorageType type); struct DataSourceDescription { diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 83ca6950da69..dbd62f35f9e0 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace CurrentMetrics @@ -26,6 +27,7 @@ namespace CurrentMetrics namespace ProfileEvents { extern const Event AzureListObjects; + extern const Event AzureListObjectsMicroseconds; extern const Event DiskAzureListObjects; extern const Event AzureDeleteObjects; extern const Event DiskAzureDeleteObjects; @@ -72,11 +74,12 @@ class AzureIteratorAsync final : public IObjectStorageIteratorAsync } private: - bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) override + bool getBatchAndCheckNext(PathsWithMetadata & batch) override { ProfileEvents::increment(ProfileEvents::AzureListObjects); if (client->IsClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskAzureListObjects); + ProfileEventTimeIncrement watch(ProfileEvents::AzureListObjectsMicroseconds); chassert(batch.empty()); auto blob_list_response = 
client->ListBlobs(options); @@ -85,7 +88,7 @@ class AzureIteratorAsync final : public IObjectStorageIteratorAsync for (const auto & blob : blobs_list) { - batch.emplace_back(std::make_shared( + batch.emplace_back(std::make_shared( blob.Name, ObjectMetadata{ static_cast(blob.BlobSize), @@ -167,7 +170,7 @@ ObjectStorageIteratorPtr AzureObjectStorage::iterate(const std::string & path_pr return std::make_shared(path_prefix, client_ptr, max_keys ? max_keys : settings_ptr->list_object_keys_size); } -void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const +void AzureObjectStorage::listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const { auto client_ptr = client.get(); @@ -184,12 +187,16 @@ void AzureObjectStorage::listObjects(const std::string & path, RelativePathsWith if (client_ptr->IsClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskAzureListObjects); - blob_list_response = client_ptr->ListBlobs(options); + { + ProfileEventTimeIncrement watch(ProfileEvents::AzureListObjectsMicroseconds); + blob_list_response = client_ptr->ListBlobs(options); + } + const auto & blobs_list = blob_list_response.Blobs; for (const auto & blob : blobs_list) { - children.emplace_back(std::make_shared( + children.emplace_back(std::make_shared( blob.Name, ObjectMetadata{ static_cast(blob.BlobSize), diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 5a23deb9b65b..1e4ed18b4605 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -35,7 +35,9 @@ class AzureObjectStorage : public IObjectStorage const String & description_, const String & common_key_prefix_); - void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + bool supportsListObjectsCache() override { return true; } + + void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const override; /// Sanitizer build may crash with max_keys=1; this looks like a false positive. 
ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const override; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index 8675555f2668..8d483a5581dd 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -193,7 +193,7 @@ void CachedObjectStorage::copyObject( // NOLINT object_storage->copyObject(object_from, object_to, read_settings, write_settings, object_to_attributes); } -void CachedObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const +void CachedObjectStorage::listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const { object_storage->listObjects(path, children, max_keys); } diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 68cce9f2ccb6..34c2b7f2054f 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -64,7 +64,7 @@ class CachedObjectStorage final : public IObjectStorage IObjectStorage & object_storage_to, std::optional object_to_attributes = {}) override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const override; ObjectMetadata getObjectMetadata(const std::string & path) const override; diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 5dea86c49027..443a8b4c6920 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -167,7 +167,7 @@ ObjectMetadata HDFSObjectStorage::getObjectMetadata(const std::string & path) co return metadata; } -void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const +void HDFSObjectStorage::listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const { initializeHDFSFS(); LOG_TEST(log, "Trying to list files for {}", path); @@ -203,7 +203,7 @@ void HDFSObjectStorage::listObjects(const std::string & path, RelativePathsWithM } else { - children.emplace_back(std::make_shared( + children.emplace_back(std::make_shared( String(file_path), ObjectMetadata{ static_cast(ls.file_info[i].mSize), diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 733407e236ef..c3fe9ce0cf6c 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -92,7 +92,7 @@ class HDFSObjectStorage : public IObjectStorage, public HDFSErrorWrapper const WriteSettings & write_settings, std::optional object_to_attributes = {}) override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const override; String getObjectsNamespace() const override { return ""; } diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index acbd3e1fa2c0..2541d71c49f2 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ 
b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -7,6 +7,11 @@ #include #include #include +#include + +#include +#include +#include namespace DB @@ -25,12 +30,12 @@ const MetadataStorageMetrics & IObjectStorage::getMetadataStorageMetrics() const bool IObjectStorage::existsOrHasAnyChild(const std::string & path) const { - RelativePathsWithMetadata files; + PathsWithMetadata files; listObjects(path, files, 1); return !files.empty(); } -void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata &, size_t) const +void IObjectStorage::listObjects(const std::string &, PathsWithMetadata &, size_t) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "listObjects() is not supported"); } @@ -38,7 +43,7 @@ void IObjectStorage::listObjects(const std::string &, RelativePathsWithMetadata ObjectStorageIteratorPtr IObjectStorage::iterate(const std::string & path_prefix, size_t max_keys) const { - RelativePathsWithMetadata files; + PathsWithMetadata files; listObjects(path_prefix, files, max_keys); return std::make_shared(std::move(files)); @@ -97,11 +102,59 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return write_settings; } -std::string RelativePathWithMetadata::getPathOrPathToArchiveIfArchive() const +std::string PathWithMetadata::getPathOrPathToArchiveIfArchive() const { if (isArchive()) return getPathToArchive(); return getPath(); } +PathWithMetadata::CommandInTaskResponse::CommandInTaskResponse(const std::string & task) +{ + Poco::JSON::Parser parser; + try + { + auto json = parser.parse(task).extract(); + if (!json) + return; + + is_valid = true; + + if (json->has("retry_after_us")) + retry_after_us = json->getValue("retry_after_us"); + } + catch (const Poco::JSON::JSONException &) + { /// Not a JSON + return; + } +} + +std::string PathWithMetadata::CommandInTaskResponse::toString() const +{ + Poco::JSON::Object json; + if (retry_after_us.has_value()) + json.set("retry_after_us", retry_after_us.value()); + + std::ostringstream oss; + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); +} + + +void PathWithMetadata::loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file) +{ + if (!metadata) + { + const auto & path = isArchive() ? getPathToArchive() : getPath(); + + auto storage_to_use = object_storage_to_use ? 
object_storage_to_use : object_storage; + + if (ignore_non_existent_file) + metadata = storage_to_use->tryGetObjectMetadata(path); + else + metadata = storage_to_use->getObjectMetadata(path); + } +} + } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 1f3a4278f135..292a8de13537 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -107,33 +107,87 @@ struct ObjectMetadata ObjectAttributes attributes; }; + +class DataFileMetaInfo; +using DataFileMetaInfoPtr = std::shared_ptr; + struct DataLakeObjectMetadata; -struct RelativePathWithMetadata +struct PathWithMetadata { + class CommandInTaskResponse + { + public: + CommandInTaskResponse() = default; + explicit CommandInTaskResponse(const std::string & task); + + bool isValid() const { return is_valid; } + void setRetryAfterUs(Poco::Timestamp::TimeDiff time_us) + { + retry_after_us = time_us; + is_valid = true; + } + + std::string toString() const; + + std::optional getRetryAfterUs() const { return retry_after_us; } + + private: + bool is_valid = false; + std::optional retry_after_us; + }; + String relative_path; /// Object metadata: size, modification time, etc. std::optional metadata; /// Delta lake related object metadata. std::optional data_lake_metadata; - - RelativePathWithMetadata() = default; - - explicit RelativePathWithMetadata(String relative_path_, std::optional metadata_ = std::nullopt) - : relative_path(std::move(relative_path_)) + /// Information about columns + std::optional file_meta_info; + /// Retry request after short pause + CommandInTaskResponse command; + std::optional absolute_path; + ObjectStoragePtr object_storage_to_use = nullptr; + + PathWithMetadata() = default; + + explicit PathWithMetadata( + const String & command_or_path, + std::optional metadata_ = std::nullopt, + std::optional absolute_path_ = std::nullopt, + ObjectStoragePtr object_storage_to_use_ = nullptr) + : relative_path(std::move(command_or_path)) , metadata(std::move(metadata_)) - {} + , command(relative_path) + , absolute_path((absolute_path_.has_value() && !absolute_path_.value().empty()) ? 
absolute_path_ : std::nullopt) + , object_storage_to_use(object_storage_to_use_) + { + if (command.isValid()) + relative_path = ""; + } - RelativePathWithMetadata(const RelativePathWithMetadata & other) = default; + PathWithMetadata(const PathWithMetadata & other) = default; - virtual ~RelativePathWithMetadata() = default; + virtual ~PathWithMetadata() = default; virtual std::string getFileName() const { return std::filesystem::path(relative_path).filename(); } + virtual std::string getFileNameWithoutExtension() const { return std::filesystem::path(relative_path).stem(); } + virtual std::string getPath() const { return relative_path; } + virtual std::optional getAbsolutePath() const { return absolute_path; } virtual bool isArchive() const { return false; } virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } virtual std::string getPathOrPathToArchiveIfArchive() const; + + void setFileMetaInfo(std::optional file_meta_info_ ) { file_meta_info = file_meta_info_; } + std::optional getFileMetaInfo() const { return file_meta_info; } + + const CommandInTaskResponse & getCommand() const { return command; } + + void loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file = true); + + ObjectStoragePtr getObjectStorage() const { return object_storage_to_use; } }; struct ObjectKeyWithMetadata @@ -149,8 +203,8 @@ struct ObjectKeyWithMetadata {} }; -using RelativePathWithMetadataPtr = std::shared_ptr; -using RelativePathsWithMetadata = std::vector; +using PathWithMetadataPtr = std::shared_ptr; +using PathsWithMetadata = std::vector; using ObjectKeysWithMetadata = std::vector; class IObjectStorageIterator; @@ -191,7 +245,7 @@ class IObjectStorage virtual bool existsOrHasAnyChild(const std::string & path) const; /// List objects recursively by certain prefix. - virtual void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const; + virtual void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const; /// List objects recursively by certain prefix. Use it instead of listObjects, if you want to list objects lazily. 
virtual ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const; @@ -327,6 +381,14 @@ class IObjectStorage } virtual std::shared_ptr tryGetS3StorageClient() { return nullptr; } #endif + + + virtual bool supportsListObjectsCache() { return false; } + +private: + mutable std::mutex throttlers_mutex; + ThrottlerPtr remote_read_throttler; + ThrottlerPtr remote_write_throttler; }; using ObjectStoragePtr = std::shared_ptr; diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 2c444954d538..40e83286e416 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -151,7 +151,7 @@ ObjectMetadata LocalObjectStorage::getObjectMetadata(const std::string & path) c return object_metadata; } -void LocalObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t/* max_keys */) const +void LocalObjectStorage::listObjects(const std::string & path, PathsWithMetadata & children, size_t/* max_keys */) const { if (!fs::exists(path) || !fs::is_directory(path)) return; @@ -164,7 +164,7 @@ void LocalObjectStorage::listObjects(const std::string & path, RelativePathsWith continue; } - children.emplace_back(std::make_shared(entry.path(), getObjectMetadata(entry.path()))); + children.emplace_back(std::make_shared(entry.path(), getObjectMetadata(entry.path()))); } } diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h index a8a9fe321894..3b2a8b31d562 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h @@ -62,7 +62,7 @@ class LocalObjectStorage : public IObjectStorage ObjectMetadata getObjectMetadata(const std::string & path) const override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const override; bool existsOrHasAnyChild(const std::string & path) const override; diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index 336c2bca91ca..2609d0a123a8 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -124,7 +124,7 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co { auto key_prefix = object_storage->generateObjectKeyForPath(path, std::nullopt /* key_prefix */).serialize(); - RelativePathsWithMetadata files; + PathsWithMetadata files; std::string absolute_key = key_prefix; if (!absolute_key.ends_with('/')) absolute_key += '/'; diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp index 3d939ce92302..bec76452b2d5 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.cpp @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -RelativePathWithMetadataPtr ObjectStorageIteratorFromList::current() +PathWithMetadataPtr ObjectStorageIteratorFromList::current() { if (!isValid()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator"); diff --git a/src/Disks/ObjectStorages/ObjectStorageIterator.h 
b/src/Disks/ObjectStorages/ObjectStorageIterator.h index d814514ddcc9..b62992f6a719 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIterator.h +++ b/src/Disks/ObjectStorages/ObjectStorageIterator.h @@ -16,10 +16,10 @@ class IObjectStorageIterator virtual bool isValid() = 0; /// Return the current element. - virtual RelativePathWithMetadataPtr current() = 0; + virtual PathWithMetadataPtr current() = 0; /// This will initiate prefetching the next batch in background, so it can be obtained faster when needed. - virtual std::optional getCurrentBatchAndScheduleNext() = 0; + virtual std::optional getCurrentBatchAndScheduleNext() = 0; /// Returns the number of elements in the batches that were fetched so far. virtual size_t getAccumulatedSize() const = 0; @@ -36,7 +36,7 @@ class IObjectStorageIterator /// Return the current batch of elements. /// It is unspecified how batches are formed. /// But this method can be used for more efficient processing. - virtual RelativePathsWithMetadata currentBatch() = 0; + virtual PathsWithMetadata currentBatch() = 0; }; using ObjectStorageIteratorPtr = std::shared_ptr; @@ -45,7 +45,7 @@ class ObjectStorageIteratorFromList : public IObjectStorageIterator { public: /// Everything is represented by just a single batch. - explicit ObjectStorageIteratorFromList(RelativePathsWithMetadata && batch_) + explicit ObjectStorageIteratorFromList(PathsWithMetadata && batch_) : batch(std::move(batch_)) , batch_iterator(batch.begin()) {} @@ -59,11 +59,11 @@ class ObjectStorageIteratorFromList : public IObjectStorageIterator bool isValid() override { return batch_iterator != batch.end(); } - RelativePathWithMetadataPtr current() override; + PathWithMetadataPtr current() override; - RelativePathsWithMetadata currentBatch() override { return batch; } + PathsWithMetadata currentBatch() override { return batch; } - std::optional getCurrentBatchAndScheduleNext() override + std::optional getCurrentBatchAndScheduleNext() override { if (batch.empty()) return {}; @@ -76,8 +76,8 @@ class ObjectStorageIteratorFromList : public IObjectStorageIterator size_t getAccumulatedSize() const override { return batch.size(); } private: - RelativePathsWithMetadata batch; - RelativePathsWithMetadata::iterator batch_iterator; + PathsWithMetadata batch; + PathsWithMetadata::iterator batch_iterator; }; } diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 2d2e8cd2c1a5..c488e8596aaa 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -112,7 +112,7 @@ bool IObjectStorageIteratorAsync::isValid() return !is_finished; } -RelativePathWithMetadataPtr IObjectStorageIteratorAsync::current() +PathWithMetadataPtr IObjectStorageIteratorAsync::current() { std::lock_guard lock(mutex); @@ -123,7 +123,7 @@ RelativePathWithMetadataPtr IObjectStorageIteratorAsync::current() } -RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() +PathsWithMetadata IObjectStorageIteratorAsync::currentBatch() { std::lock_guard lock(mutex); @@ -133,7 +133,7 @@ RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch() return current_batch; } -std::optional IObjectStorageIteratorAsync::getCurrentBatchAndScheduleNext() +std::optional IObjectStorageIteratorAsync::getCurrentBatchAndScheduleNext() { std::lock_guard lock(mutex); diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h index 
013714151245..b8a0d6e0249a 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.h @@ -23,24 +23,24 @@ class IObjectStorageIteratorAsync : public IObjectStorageIterator bool isValid() override; - RelativePathWithMetadataPtr current() override; - RelativePathsWithMetadata currentBatch() override; + PathWithMetadataPtr current() override; + PathsWithMetadata currentBatch() override; void next() override; void nextBatch() override; size_t getAccumulatedSize() const override; - std::optional getCurrentBatchAndScheduleNext() override; + std::optional getCurrentBatchAndScheduleNext() override; void deactivate(); protected: /// This method fetches the next batch, and returns true if there are more batches after it. - virtual bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) = 0; + virtual bool getBatchAndCheckNext(PathsWithMetadata & batch) = 0; struct BatchAndHasNext { - RelativePathsWithMetadata batch; + PathsWithMetadata batch; bool has_next; }; @@ -55,8 +55,8 @@ class IObjectStorageIteratorAsync : public IObjectStorageIterator ThreadPool list_objects_pool; ThreadPoolCallbackRunnerUnsafe list_objects_scheduler; std::future outcome_future; - RelativePathsWithMetadata current_batch; - RelativePathsWithMetadata::iterator current_batch_iterator; + PathsWithMetadata current_batch; + PathsWithMetadata::iterator current_batch_iterator; std::atomic accumulated_size = 0; }; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 320173bd7a49..f1cb2561674c 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -30,11 +30,13 @@ #include #include #include +#include namespace ProfileEvents { extern const Event S3ListObjects; + extern const Event S3ListObjectsMicroseconds; extern const Event DiskS3DeleteObjects; extern const Event DiskS3ListObjects; } @@ -132,12 +134,17 @@ class S3IteratorAsync final : public IObjectStorageIteratorAsync } private: - bool getBatchAndCheckNext(RelativePathsWithMetadata & batch) override + bool getBatchAndCheckNext(PathsWithMetadata & batch) override { ProfileEvents::increment(ProfileEvents::S3ListObjects); ProfileEvents::increment(ProfileEvents::DiskS3ListObjects); - auto outcome = client->ListObjectsV2(*request); + Aws::S3::Model::ListObjectsV2Outcome outcome; + + { + ProfileEventTimeIncrement watch(ProfileEvents::S3ListObjectsMicroseconds); + outcome = client->ListObjectsV2(*request); + } /// Outcome failure will be handled on the caller side. 
if (outcome.IsSuccess()) @@ -148,7 +155,7 @@ class S3IteratorAsync final : public IObjectStorageIteratorAsync for (const auto & object : objects) { ObjectMetadata metadata{static_cast(object.GetSize()), Poco::Timestamp::fromEpochTime(object.GetLastModified().Seconds()), object.GetETag(), {}}; - batch.emplace_back(std::make_shared(object.GetKey(), std::move(metadata))); + batch.emplace_back(std::make_shared(object.GetKey(), std::move(metadata))); } /// It returns false when all objects were returned @@ -246,7 +253,7 @@ ObjectStorageIteratorPtr S3ObjectStorage::iterate(const std::string & path_prefi return std::make_shared(uri.bucket, path_prefix, client.get(), max_keys); } -void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const +void S3ObjectStorage::listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const { auto settings_ptr = s3_settings.get(); @@ -264,7 +271,11 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet ProfileEvents::increment(ProfileEvents::S3ListObjects); ProfileEvents::increment(ProfileEvents::DiskS3ListObjects); - outcome = client.get()->ListObjectsV2(request); + { + ProfileEventTimeIncrement watch(ProfileEvents::S3ListObjectsMicroseconds); + outcome = client.get()->ListObjectsV2(request); + } + throwIfError(outcome); auto result = outcome.GetResult(); @@ -274,7 +285,7 @@ void S3ObjectStorage::listObjects(const std::string & path, RelativePathsWithMet break; for (const auto & object : objects) - children.emplace_back(std::make_shared( + children.emplace_back(std::make_shared( object.GetKey(), ObjectMetadata{ static_cast(object.GetSize()), diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 7b64d3b0f9fe..fbeb32916280 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -61,6 +61,8 @@ class S3ObjectStorage : public IObjectStorage ObjectStorageType getType() const override { return ObjectStorageType::S3; } + bool supportsListObjectsCache() override { return true; } + bool exists(const StoredObject & object) const override; std::unique_ptr readObject( /// NOLINT @@ -77,7 +79,7 @@ class S3ObjectStorage : public IObjectStorage size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void listObjects(const std::string & path, RelativePathsWithMetadata & children, size_t max_keys) const override; + void listObjects(const std::string & path, PathsWithMetadata & children, size_t max_keys) const override; ObjectStorageIteratorPtr iterate(const std::string & path_prefix, size_t max_keys) const override; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 365df6585ae0..fb8204b7fcb3 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -178,7 +178,7 @@ class FormatFactory final : private boost::noncopyable UInt64 max_block_size, const std::optional & format_settings = std::nullopt, FormatParserSharedResourcesPtr parser_shared_resources = nullptr, - FormatFilterInfoPtr format_filter_info = std::make_shared(), + FormatFilterInfoPtr format_filter_info = nullptr, // affects things like buffer sizes and parallel reading bool is_remote_fs = false, // allows to do: buf -> parallel read -> decompression, diff --git a/src/Formats/FormatFilterInfo.cpp b/src/Formats/FormatFilterInfo.cpp index 8d25deefb397..ac130771fbb9 100644 --- 
a/src/Formats/FormatFilterInfo.cpp +++ b/src/Formats/FormatFilterInfo.cpp @@ -15,11 +15,16 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int ICEBERG_SPECIFICATION_VIOLATION; } void ColumnMapper::setStorageColumnEncoding(std::unordered_map && storage_encoding_) { + chassert(storage_encoding.empty()); storage_encoding = std::move(storage_encoding_); + for (const auto & [column_name, field_id] : storage_encoding) + if (!field_id_to_clickhouse_name.emplace(field_id, column_name).second) + throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Duplicate field id {}", field_id); } std::pair, std::unordered_map> ColumnMapper::makeMapping( diff --git a/src/Formats/FormatFilterInfo.h b/src/Formats/FormatFilterInfo.h index 3d37a95c93b2..024c4e5321aa 100644 --- a/src/Formats/FormatFilterInfo.h +++ b/src/Formats/FormatFilterInfo.h @@ -18,14 +18,20 @@ class ColumnMapper { public: /// clickhouse_column_name -> field_id + /// For tuples, the map contains both the tuple itself and all its elements, e.g. {t, t.x, t.y}. + /// Note that the parquet schema reader has to apply the mapping to all tuple fields recursively + /// even if the whole tuple was requested, because the names of the fields may be different. void setStorageColumnEncoding(std::unordered_map && storage_encoding_); const std::unordered_map & getStorageColumnEncoding() const { return storage_encoding; } + const std::unordered_map & getFieldIdToClickHouseName() const { return field_id_to_clickhouse_name; } + /// clickhouse_column_name -> format_column_name (just join the maps above by field_id). std::pair, std::unordered_map> makeMapping(const std::unordered_map & format_encoding); private: std::unordered_map storage_encoding; + std::unordered_map field_id_to_clickhouse_name; }; using ColumnMapperPtr = std::shared_ptr; diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 5055f548a0ee..d3c8137e6611 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -154,6 +154,11 @@ uint64_t generateSnowflakeID() return fromSnowflakeId(snowflake_id); } +std::string generateSnowflakeIDString() +{ + return std::to_string(generateSnowflakeID()); +} + class FunctionGenerateSnowflakeID : public IFunction { public: diff --git a/src/Functions/generateSnowflakeID.h b/src/Functions/generateSnowflakeID.h index 38fa684a9b4b..4fc173dcf1be 100644 --- a/src/Functions/generateSnowflakeID.h +++ b/src/Functions/generateSnowflakeID.h @@ -7,4 +7,6 @@ namespace DB uint64_t generateSnowflakeID(); +std::string generateSnowflakeIDString(); + } diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 06e9ed3885ca..c3a3d9f9ff86 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -445,6 +445,12 @@ Aws::S3::Model::GetObjectResult ReadBufferFromS3::sendRequest(size_t attempt, si log, "Read S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}", bucket, key, version_id.empty() ? "Latest" : version_id, range_begin); } + else + { + LOG_TEST( + log, "Read S3 object. Bucket: {}, Key: {}, Version: {}", + bucket, key, version_id.empty() ? 
"Latest" : version_id); + } ProfileEvents::increment(ProfileEvents::S3GetObject); if (client_ptr->isClientForDisk()) diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index c46d1456c417..831ce12e4277 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -454,7 +454,7 @@ Model::HeadObjectOutcome Client::HeadObject(HeadObjectRequest & request) const auto bucket_uri = getURIForBucket(bucket); if (!bucket_uri) { - if (auto maybe_error = updateURIForBucketForHead(bucket); maybe_error.has_value()) + if (auto maybe_error = updateURIForBucketForHead(bucket, request.GetKey()); maybe_error.has_value()) return *maybe_error; if (auto region = getRegionForBucket(bucket); !region.empty()) @@ -659,7 +659,6 @@ Client::doRequest(RequestType & request, RequestFn request_fn) const if (auto uri = getURIForBucket(bucket); uri.has_value()) request.overrideURI(std::move(*uri)); - bool found_new_endpoint = false; // if we found correct endpoint after 301 responses, update the cache for future requests SCOPE_EXIT( @@ -693,19 +692,35 @@ Client::doRequest(RequestType & request, RequestFn request_fn) const continue; } - if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY) + /// IllegalLocationConstraintException may indicate that we are working with an opt-in region (e.g. me-south-1) + /// In that case, we need to update the region and try again + bool is_illegal_constraint_exception = error.GetExceptionName() == "IllegalLocationConstraintException"; + if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY && !is_illegal_constraint_exception) return result; // maybe we detect a correct region - if (!detect_region) + bool new_region_detected = false; + if (!detect_region || is_illegal_constraint_exception) { if (auto region = GetErrorMarshaller()->ExtractRegion(error); !region.empty() && region != explicit_region) { + LOG_INFO(log, "Detected new region: {}", region); request.overrideRegion(region); insertRegionOverride(bucket, region); + new_region_detected = true; } } + /// special handling for opt-in regions + if (new_region_detected && is_illegal_constraint_exception && initial_endpoint.substr(11) == "amazonaws.com") + { + S3::URI new_uri(initial_endpoint); + new_uri.addRegionToURI(request.getRegionOverride()); + found_new_endpoint = true; + request.overrideURI(new_uri); + continue; + } + // we possibly got new location, need to try with that one auto new_uri = getURIFromError(error); if (!new_uri) @@ -1006,12 +1021,15 @@ std::optional Client::getURIFromError(const Aws::S3::S3Error & error) c } // Do a list request because head requests don't have body in response -std::optional Client::updateURIForBucketForHead(const std::string & bucket) const +// S3 Tables don't support ListObjects, so as a workaround we use GetObject instead +std::optional Client::updateURIForBucketForHead(const std::string & bucket, const std::string & key) const { - ListObjectsV2Request req; + GetObjectRequest req; req.SetBucket(bucket); - req.SetMaxKeys(1); - auto result = ListObjectsV2(req); + req.SetKey(key); + req.SetRange("bytes=0-1"); + auto result = GetObject(req); + if (result.IsSuccess()) return std::nullopt; return result.GetError(); diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 16457c7a511f..2701a3094cbc 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -287,7 +287,7 @@ class Client : private Aws::S3::S3Client void updateURIForBucket(const std::string & bucket, S3::URI new_uri) const; std::optional getURIFromError(const Aws::S3::S3Error & 
error) const; - std::optional updateURIForBucketForHead(const std::string & bucket) const; + std::optional updateURIForBucketForHead(const std::string & bucket, const std::string & key) const; std::optional getURIForBucket(const std::string & bucket) const; diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp index f5848fac75df..480160a86c5f 100644 --- a/src/IO/S3/URI.cpp +++ b/src/IO/S3/URI.cpp @@ -158,10 +158,72 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax) validateKey(key, uri); } +bool URI::isAWSRegion(std::string_view region) +{ + /// List from https://docs.aws.amazon.com/general/latest/gr/s3.html + static const std::unordered_set regions = { + "us-east-2", + "us-east-1", + "us-west-1", + "us-west-2", + "af-south-1", + "ap-east-1", + "ap-south-2", + "ap-southeast-3", + "ap-southeast-5", + "ap-southeast-4", + "ap-south-1", + "ap-northeast-3", + "ap-northeast-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-east-2", + "ap-southeast-7", + "ap-northeast-1", + "ca-central-1", + "ca-west-1", + "eu-central-1", + "eu-west-1", + "eu-west-2", + "eu-south-1", + "eu-west-3", + "eu-south-2", + "eu-north-1", + "eu-central-2", + "il-central-1", + "mx-central-1", + "me-south-1", + "me-central-1", + "sa-east-1", + "us-gov-east-1", + "us-gov-west-1" + }; + + /// 's3-us-west-2' is a legacy region format for S3 storage, equivalent to 'us-west-2' + /// See https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#VirtualHostingBackwardsCompatibility + if (region.substr(0, 3) == "s3-") + region = region.substr(3); + + return regions.contains(region); +} + void URI::addRegionToURI(const std::string & region) { if (auto pos = endpoint.find(".amazonaws.com"); pos != std::string::npos) + { + if (pos > 0) + { /// Check if the region is already in the endpoint to avoid adding it a second time + auto prev_pos = endpoint.find_last_of("/.", pos - 1); + if (prev_pos == std::string::npos) + prev_pos = 0; + else + ++prev_pos; + std::string_view endpoint_region = std::string_view(endpoint).substr(prev_pos, pos - prev_pos); + if (isAWSRegion(endpoint_region)) + return; + } endpoint = endpoint.substr(0, pos) + "." 
+ region + endpoint.substr(pos); + } } void URI::validateBucket(const String & bucket, const Poco::URI & uri) diff --git a/src/IO/S3/URI.h b/src/IO/S3/URI.h index 9220a8209045..8af05c177807 100644 --- a/src/IO/S3/URI.h +++ b/src/IO/S3/URI.h @@ -41,6 +41,10 @@ struct URI static void validateBucket(const std::string & bucket, const Poco::URI & uri); static void validateKey(const std::string & key, const Poco::URI & uri); + + /// Returns true if 'region' string is an AWS S3 region + /// https://docs.aws.amazon.com/general/latest/gr/s3.html + static bool isAWSRegion(std::string_view region); }; } diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index b69f2a23a0dc..b85743e4c5fe 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #if USE_AWS_S3 @@ -15,6 +16,7 @@ namespace ProfileEvents extern const Event S3GetObject; extern const Event S3GetObjectAttributes; extern const Event S3HeadObject; + extern const Event S3HeadObjectMicroseconds; extern const Event DiskS3GetObject; extern const Event DiskS3GetObjectAttributes; extern const Event DiskS3HeadObject; @@ -32,6 +34,7 @@ namespace ProfileEvents::increment(ProfileEvents::S3HeadObject); if (client.isClientForDisk()) ProfileEvents::increment(ProfileEvents::DiskS3HeadObject); + ProfileEventTimeIncrement watch(ProfileEvents::S3HeadObjectMicroseconds); S3::HeadObjectRequest req; req.SetBucket(bucket); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index f12de6a7b546..cc321ba7af8c 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -19,16 +19,6 @@ #include -namespace ProfileEvents -{ - extern const Event S3GetObjectAttributes; - extern const Event S3GetObjectMetadata; - extern const Event S3HeadObject; - extern const Event DiskS3GetObjectAttributes; - extern const Event DiskS3GetObjectMetadata; - extern const Event DiskS3HeadObject; -} - namespace DB { diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index c360e723641a..025f84bdd4f9 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -139,6 +139,7 @@ class ClientInfo NOT_A_BACKGROUND_OPERATION = 0, MERGE = 1, MUTATION = 2, + EXPORT_PART = 3, }; /// It's ClientInfo and context created for background operation (not real query) diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index a0e4e4999b72..e067cf87b067 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ -732,9 +732,9 @@ void Cluster::initMisc() } } -std::unique_ptr Cluster::getClusterWithReplicasAsShards(const Settings & settings, size_t max_replicas_from_shard) const +std::unique_ptr Cluster::getClusterWithReplicasAsShards(const Settings & settings, size_t max_replicas_from_shard, size_t max_hosts) const { - return std::unique_ptr{ new Cluster(ReplicasAsShardsTag{}, *this, settings, max_replicas_from_shard)}; + return std::unique_ptr{ new Cluster(ReplicasAsShardsTag{}, *this, settings, max_replicas_from_shard, max_hosts)}; } std::unique_ptr Cluster::getClusterWithSingleShard(size_t index) const @@ -783,7 +783,7 @@ void shuffleReplicas(std::vector & replicas, const Settings & } -Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Settings & settings, size_t max_replicas_from_shard) +Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Settings & settings, size_t max_replicas_from_shard, size_t max_hosts) { if (from.addresses_with_failover.empty()) throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Cluster is empty"); @@ -805,6 +805,7 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti if (address.is_local) info.local_addresses.push_back(address); + addresses_with_failover.emplace_back(Addresses({address})); auto pool = ConnectionPoolFactory::instance().get( static_cast(settings[Setting::distributed_connections_pool_size]), @@ -828,9 +829,6 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti info.per_replica_pools = {std::move(pool)}; info.default_database = address.default_database; - addresses_with_failover.emplace_back(Addresses{address}); - - slot_to_shard.insert(std::end(slot_to_shard), info.weight, shards_info.size()); shards_info.emplace_back(std::move(info)); } }; @@ -852,10 +850,37 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti secret = from.secret; name = from.name; + constrainShardInfoAndAddressesToMaxHosts(max_hosts); + + for (size_t i = 0; i < shards_info.size(); ++i) + slot_to_shard.insert(std::end(slot_to_shard), shards_info[i].weight, i); + initMisc(); } +void Cluster::constrainShardInfoAndAddressesToMaxHosts(size_t max_hosts) +{ + if (max_hosts == 0 || shards_info.size() <= max_hosts) + return; + + pcg64_fast gen{randomSeed()}; + std::shuffle(shards_info.begin(), shards_info.end(), gen); + shards_info.resize(max_hosts); + + AddressesWithFailover addresses_with_failover_; + + UInt32 shard_num = 0; + for (auto & shard_info : shards_info) + { + addresses_with_failover_.push_back(addresses_with_failover[shard_info.shard_num - 1]); + shard_info.shard_num = ++shard_num; + } + + addresses_with_failover.swap(addresses_with_failover_); +} + + Cluster::Cluster(Cluster::SubclusterTag, const Cluster & from, const std::vector & indices) { for (size_t index : indices) diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index b5a4c51c11db..f9b581034ef7 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -270,7 +270,7 @@ class Cluster std::unique_ptr getClusterWithMultipleShards(const std::vector & indices) const; /// Get a new Cluster that contains all servers (all shards with all replicas) from existing cluster as independent shards. - std::unique_ptr getClusterWithReplicasAsShards(const Settings & settings, size_t max_replicas_from_shard = 0) const; + std::unique_ptr getClusterWithReplicasAsShards(const Settings & settings, size_t max_replicas_from_shard = 0, size_t max_hosts = 0) const; /// Returns false if cluster configuration doesn't allow to use it for cross-replication. /// NOTE: true does not mean, that it's actually a cross-replication cluster. 
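The `max_hosts` capping introduced above in `Cluster::constrainShardInfoAndAddressesToMaxHosts` boils down to: shuffle the shard list, keep the first `max_hosts` entries, and renumber the survivors. The sketch below is illustrative only and not part of the patch; `ShardStub` is a hypothetical stand-in for `Cluster::ShardInfo`, and `std::mt19937` is used in place of the patch's `pcg64_fast` seeded with `randomSeed()`.

```cpp
#include <algorithm>
#include <random>
#include <vector>

// Hypothetical stand-in for Cluster::ShardInfo; only the field relevant here.
struct ShardStub { unsigned shard_num = 0; };

// Keep at most max_hosts randomly chosen shards and renumber them,
// mirroring the logic above (max_hosts == 0 means "no limit").
void constrainToMaxHosts(std::vector<ShardStub> & shards, size_t max_hosts)
{
    if (max_hosts == 0 || shards.size() <= max_hosts)
        return;

    std::mt19937 gen{std::random_device{}()};
    std::shuffle(shards.begin(), shards.end(), gen); // pick a random subset
    shards.resize(max_hosts);

    unsigned shard_num = 0;
    for (auto & shard : shards)                      // renumber the survivors 1..max_hosts
        shard.shard_num = ++shard_num;
}
```

The real code additionally rebuilds `addresses_with_failover` and re-populates `slot_to_shard` after trimming, so the surviving shards stay aligned with their connection pools and weights.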
@@ -296,7 +296,7 @@ class Cluster /// For getClusterWithReplicasAsShards implementation struct ReplicasAsShardsTag {}; - Cluster(ReplicasAsShardsTag, const Cluster & from, const Settings & settings, size_t max_replicas_from_shard); + Cluster(ReplicasAsShardsTag, const Cluster & from, const Settings & settings, size_t max_replicas_from_shard, size_t max_hosts); void addShard( const Settings & settings, @@ -308,6 +308,9 @@ class Cluster ShardInfoInsertPathForInternalReplication insert_paths = {}, bool internal_replication = false); + /// Reduce size of cluster to max_hosts + void constrainShardInfoAndAddressesToMaxHosts(size_t max_hosts); + /// Inter-server secret String secret; diff --git a/src/Interpreters/ClusterDiscovery.cpp b/src/Interpreters/ClusterDiscovery.cpp index 033f1379543c..6c328a3f7beb 100644 --- a/src/Interpreters/ClusterDiscovery.cpp +++ b/src/Interpreters/ClusterDiscovery.cpp @@ -108,6 +108,13 @@ class ClusterDiscovery::Flags cv.notify_one(); } + void wakeup() + { + std::unique_lock lk(mu); + any_need_update = true; + cv.notify_one(); + } + private: std::condition_variable cv; std::mutex mu; @@ -391,7 +398,9 @@ bool ClusterDiscovery::upsertCluster(ClusterInfo & cluster_info) return true; }; - if (!cluster_info.current_node_is_observer && !contains(node_uuids, current_node_name)) + if (!cluster_info.current_node_is_observer + && context->isSwarmModeEnabled() + && !contains(node_uuids, current_node_name)) { LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name); registerInZk(zk, cluster_info); @@ -455,12 +464,30 @@ void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & inf return; } + if (!context->isSwarmModeEnabled()) + { + LOG_DEBUG(log, "STOP SWARM MODE called, skip self-registering current node {} in cluster {}", current_node_name, info.name); + return; + } + LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name); zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral); LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name); } +void ClusterDiscovery::unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info) +{ + if (info.current_node_is_observer) + return; + + String node_path = getShardsListPath(info.zk_root) / current_node_name; + LOG_DEBUG(log, "Removing current node {} from cluster {}", current_node_name, info.name); + + zk->remove(node_path); + LOG_DEBUG(log, "Current node {} removed from cluster {}", current_node_name, info.name); +} + void ClusterDiscovery::initialUpdate() { LOG_DEBUG(log, "Initializing"); @@ -506,6 +533,18 @@ void ClusterDiscovery::initialUpdate() is_initialized = true; } +void ClusterDiscovery::registerAll() +{ + register_change_flag = RegisterChangeFlag::RCF_REGISTER_ALL; + clusters_to_update->wakeup(); +} + +void ClusterDiscovery::unregisterAll() +{ + register_change_flag = RegisterChangeFlag::RCF_UNREGISTER_ALL; + clusters_to_update->wakeup(); +} + void ClusterDiscovery::findDynamicClusters( std::unordered_map & info, std::unordered_set * unchanged_roots) @@ -729,6 +768,27 @@ bool ClusterDiscovery::runMainThread(std::function up_to_date_callback) { up_to_date_callback(); } + + RegisterChangeFlag flag = register_change_flag.exchange(RegisterChangeFlag::RCF_NONE); + + if (flag == RegisterChangeFlag::RCF_REGISTER_ALL) + { + LOG_DEBUG(log, "Register in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = 
context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + registerInZk(zk, info); + } + } + else if (flag == RegisterChangeFlag::RCF_UNREGISTER_ALL) + { + LOG_DEBUG(log, "Unregister in all dynamic clusters"); + for (auto & [_, info] : clusters_info) + { + auto zk = context->getDefaultOrAuxiliaryZooKeeper(info.zk_name); + unregisterFromZk(zk, info); + } + } } LOG_DEBUG(log, "Worker thread stopped"); return finished; diff --git a/src/Interpreters/ClusterDiscovery.h b/src/Interpreters/ClusterDiscovery.h index c0e4af3b86f3..2d3bbe489f4e 100644 --- a/src/Interpreters/ClusterDiscovery.h +++ b/src/Interpreters/ClusterDiscovery.h @@ -38,6 +38,9 @@ class ClusterDiscovery ~ClusterDiscovery(); + void registerAll(); + void unregisterAll(); + private: struct NodeInfo { @@ -125,6 +128,7 @@ class ClusterDiscovery void initialUpdate(); void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); + void unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info); Strings getNodeNames(zkutil::ZooKeeperPtr & zk, const String & zk_root, @@ -207,6 +211,15 @@ class ClusterDiscovery std::shared_ptr>> multicluster_discovery_paths; MultiVersion::Version macros; + + enum RegisterChangeFlag + { + RCF_NONE, + RCF_REGISTER_ALL, + RCF_UNREGISTER_ALL, + }; + + std::atomic register_change_flag = RegisterChangeFlag::RCF_NONE; }; } diff --git a/src/Interpreters/ClusterFunctionReadTask.cpp b/src/Interpreters/ClusterFunctionReadTask.cpp index 9a91f8549cfa..319505917f2f 100644 --- a/src/Interpreters/ClusterFunctionReadTask.cpp +++ b/src/Interpreters/ClusterFunctionReadTask.cpp @@ -19,9 +19,11 @@ namespace ErrorCodes namespace Setting { extern const SettingsBool cluster_function_process_archive_on_multiple_nodes; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(ObjectInfoPtr object, const ContextPtr & context) + : iceberg_read_optimization_enabled(context->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]) { if (!object) throw Exception(ErrorCodes::LOGICAL_ERROR, "`object` cannot be null"); @@ -29,8 +31,16 @@ ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(ObjectInfoPtr o if (object->data_lake_metadata.has_value()) data_lake_metadata = object->data_lake_metadata.value(); - const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; - path = send_over_whole_archive ? object->getPathOrPathToArchiveIfArchive() : object->getPath(); + file_meta_info = object->file_meta_info; + + if (object->getCommand().isValid()) + path = object->getCommand().toString(); + else + { + const bool send_over_whole_archive = !context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]; + path = send_over_whole_archive ? 
object->getPathOrPathToArchiveIfArchive() : object->getPath(); + absolute_path = object->getAbsolutePath(); + } } ClusterFunctionReadTaskResponse::ClusterFunctionReadTaskResponse(const std::string & path_) @@ -45,6 +55,10 @@ ObjectInfoPtr ClusterFunctionReadTaskResponse::getObjectInfo() const auto object = std::make_shared(path); object->data_lake_metadata = data_lake_metadata; + object->file_meta_info = file_meta_info; + if (absolute_path.has_value() && !absolute_path.value().empty()) + object->absolute_path = absolute_path; + return object; } @@ -61,6 +75,15 @@ void ClusterFunctionReadTaskResponse::serialize(WriteBuffer & out, size_t protoc else ActionsDAG().serialize(out, registry); } + + if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA) + { + /// This info is not used when optimization is disabled, so there is no need to send it. + if (iceberg_read_optimization_enabled && file_meta_info.has_value()) + file_meta_info.value()->serialize(out); + else + DataFileMetaInfo().serialize(out); + } } void ClusterFunctionReadTaskResponse::deserialize(ReadBuffer & in) @@ -87,6 +110,14 @@ void ClusterFunctionReadTaskResponse::deserialize(ReadBuffer & in) data_lake_metadata.transform = std::move(transform); } } + + if (protocol_version >= DBMS_CLUSTER_PROCESSING_PROTOCOL_VERSION_WITH_DATA_LAKE_COLUMNS_METADATA) + { + auto info = std::make_shared(DataFileMetaInfo::deserialize(in)); + + if (!path.empty() && !info->empty()) + file_meta_info = info; + } } } diff --git a/src/Interpreters/ClusterFunctionReadTask.h b/src/Interpreters/ClusterFunctionReadTask.h index 21e3238e0fb9..b5e2123cbc4f 100644 --- a/src/Interpreters/ClusterFunctionReadTask.h +++ b/src/Interpreters/ClusterFunctionReadTask.h @@ -18,8 +18,14 @@ struct ClusterFunctionReadTaskResponse /// Data path (object path, in case of object storage). String path; + /// Absolute path (including storage type prefix). + std::optional absolute_path; /// Object metadata path, in case of data lake object. DataLakeObjectMetadata data_lake_metadata; + /// File's columns info + std::optional file_meta_info; + + const bool iceberg_read_optimization_enabled = false; /// Convert received response into ObjectInfo. 
ObjectInfoPtr getObjectInfo() const; diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index f57fae899df5..8259d57fa7f0 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -67,7 +67,8 @@ ASTPtr rewriteSelectQuery( const ASTPtr & query, const std::string & remote_database, const std::string & remote_table, - ASTPtr table_function_ptr) + ASTPtr table_function_ptr, + ASTPtr additional_filter) { auto modified_query_ast = query->clone(); @@ -80,8 +81,33 @@ ASTPtr rewriteSelectQuery( if (!context->getSettingsRef()[Setting::allow_experimental_analyzer]) { + // Apply additional filter if provided + if (additional_filter) + { + if (select_query.where()) + { + /// WHERE AND + select_query.setExpression( + ASTSelectQuery::Expression::WHERE, + makeASTFunction("and", select_query.where(), additional_filter->clone())); + } + else + { + /// No WHERE – simply set it + select_query.setExpression( + ASTSelectQuery::Expression::WHERE, additional_filter->clone()); + } + } + if (table_function_ptr) - select_query.addTableFunction(table_function_ptr); + { + select_query.addTableFunction(table_function_ptr->clone()); + + // Reset semantic table information for all column identifiers to prevent + // RestoreQualifiedNamesVisitor from adding wrong table names + ResetSemanticTableVisitor::Data data; + ResetSemanticTableVisitor(data).visit(modified_query_ast); + } else select_query.replaceDatabaseAndTable(remote_database, remote_table); @@ -93,6 +119,7 @@ ASTPtr rewriteSelectQuery( data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query->as(), 0)); data.remote_table.database = remote_database; data.remote_table.table = remote_table; + RestoreQualifiedNamesVisitor(data).visit(modified_query_ast); } } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index b61f7a0012af..47646a80e1aa 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -41,7 +41,8 @@ ASTPtr rewriteSelectQuery( const ASTPtr & query, const std::string & remote_database, const std::string & remote_table, - ASTPtr table_function_ptr = nullptr); + ASTPtr table_function_ptr = nullptr, + ASTPtr additional_filter = nullptr); using ColumnsDescriptionByShardNum = std::unordered_map; using AdditionalShardFilterGenerator = std::function; diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 4d15eb28c8fd..5ed4c2862d01 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -333,7 +334,8 @@ void executeQuery( const std::string & sharding_key_column_name, const DistributedSettings & distributed_settings, AdditionalShardFilterGenerator shard_filter_generator, - bool is_remote_function) + bool is_remote_function, + std::span additional_query_infos) { const Settings & settings = context->getSettingsRef(); @@ -361,6 +363,7 @@ void executeQuery( new_context->increaseDistributedDepth(); const size_t shards = cluster->getShardCount(); + const bool has_additional_query_infos = !additional_query_infos.empty(); if (context->getSettingsRef()[Setting::allow_experimental_analyzer]) { @@ -464,11 +467,35 @@ void executeQuery( not_optimized_cluster->getName()); 
read_from_remote->setStepDescription("Read from remote replica"); + read_from_remote->setIsRemoteFunction(is_remote_function); plan->addStep(std::move(read_from_remote)); plan->addInterpreterContext(new_context); plans.emplace_back(std::move(plan)); } + if (has_additional_query_infos) + { + if (!header) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Header is not initialized for local hybrid plan creation"); + + const Block & header_block = *header; + for (const auto & additional_query_info : additional_query_infos) + { + auto additional_plan = createLocalPlan( + additional_query_info.query, + header_block, + context, + processed_stage, + 0, /// shard_num is not applicable for local hybrid plans + 1, /// shard_count is not applicable for local hybrid plans + false, + false, + ""); + + plans.emplace_back(std::move(additional_plan)); + } + } + if (plans.empty()) return; @@ -484,6 +511,8 @@ void executeQuery( input_headers.emplace_back(plan->getCurrentHeader()); auto union_step = std::make_unique(std::move(input_headers)); + if (has_additional_query_infos) + union_step->setStepDescription("Hybrid"); query_plan.unitePlans(std::move(union_step), std::move(plans)); } diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index 0142e57a9120..8f1b8be42182 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -5,6 +5,7 @@ #include #include +#include namespace DB { @@ -88,7 +89,8 @@ void executeQuery( const std::string & sharding_key_column_name, const DistributedSettings & distributed_settings, AdditionalShardFilterGenerator shard_filter_generator, - bool is_remote_function); + bool is_remote_function, + std::span additional_query_infos = {}); std::optional executeInsertSelectWithParallelReplicas( const ASTInsertQuery & query_ast, diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e1a1e3b677c1..902c6a551460 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,8 @@ namespace ProfileEvents extern const Event BackupThrottlerSleepMicroseconds; extern const Event MergesThrottlerBytes; extern const Event MergesThrottlerSleepMicroseconds; + extern const Event ExportsThrottlerBytes; + extern const Event ExportsThrottlerSleepMicroseconds; extern const Event MutationsThrottlerBytes; extern const Event MutationsThrottlerSleepMicroseconds; extern const Event QueryLocalReadThrottlerBytes; @@ -244,6 +247,7 @@ namespace CurrentMetrics extern const Metric UncompressedCacheCells; extern const Metric IndexUncompressedCacheBytes; extern const Metric IndexUncompressedCacheCells; + extern const Metric IsSwarmModeEnabled; } @@ -344,6 +348,7 @@ namespace ServerSetting extern const ServerSettingsUInt64 max_local_write_bandwidth_for_server; extern const ServerSettingsUInt64 max_merges_bandwidth_for_server; extern const ServerSettingsUInt64 max_mutations_bandwidth_for_server; + extern const ServerSettingsUInt64 max_exports_bandwidth_for_server; extern const ServerSettingsUInt64 max_remote_read_network_bandwidth_for_server; extern const ServerSettingsUInt64 max_remote_write_network_bandwidth_for_server; extern const ServerSettingsUInt64 max_replicated_fetches_network_bandwidth_for_server; @@ -528,6 +533,7 @@ struct ContextSharedPart : boost::noncopyable GlobalOvercommitTracker global_overcommit_tracker; MergeList merge_list; /// The list of executable merge (for 
(Replicated)?MergeTree) MovesList moves_list; /// The list of executing moves (for (Replicated)?MergeTree) + ExportsList exports_list; /// The list of executing exports (for (Replicated)?MergeTree) ReplicatedFetchList replicated_fetch_list; RefreshSet refresh_set; /// The list of active refreshes (for MaterializedView) ConfigurationPtr users_config TSA_GUARDED_BY(mutex); /// Config with the users, profiles and quotas sections. @@ -569,6 +575,8 @@ struct ContextSharedPart : boost::noncopyable mutable ThrottlerPtr mutations_throttler; /// A server-wide throttler for mutations mutable ThrottlerPtr merges_throttler; /// A server-wide throttler for merges + mutable ThrottlerPtr exports_throttler; /// A server-wide throttler for exports + MultiVersion macros; /// Substitutions extracted from config. std::unique_ptr ddl_worker TSA_GUARDED_BY(mutex); /// Process ddl commands from zk. LoadTaskPtr ddl_worker_startup_task; /// To postpone `ddl_worker->startup()` after all tables startup @@ -645,6 +653,7 @@ struct ContextSharedPart : boost::noncopyable std::map server_ports; std::atomic shutdown_called = false; + std::atomic swarm_mode_enabled = true; Stopwatch uptime_watch TSA_GUARDED_BY(mutex); @@ -813,6 +822,8 @@ struct ContextSharedPart : boost::noncopyable */ void shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { + swarm_mode_enabled = false; + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); bool is_shutdown_called = shutdown_called.exchange(true); if (is_shutdown_called) return; @@ -1075,6 +1086,9 @@ struct ContextSharedPart : boost::noncopyable if (auto bandwidth = server_settings[ServerSetting::max_merges_bandwidth_for_server]) merges_throttler = std::make_shared(bandwidth, ProfileEvents::MergesThrottlerBytes, ProfileEvents::MergesThrottlerSleepMicroseconds); + + if (auto bandwidth = server_settings[ServerSetting::max_exports_bandwidth_for_server]) + exports_throttler = std::make_shared(bandwidth, ProfileEvents::ExportsThrottlerBytes, ProfileEvents::ExportsThrottlerSleepMicroseconds); } }; @@ -1232,6 +1246,8 @@ MergeList & Context::getMergeList() { return shared->merge_list; } const MergeList & Context::getMergeList() const { return shared->merge_list; } MovesList & Context::getMovesList() { return shared->moves_list; } const MovesList & Context::getMovesList() const { return shared->moves_list; } +ExportsList & Context::getExportsList() { return shared->exports_list; } +const ExportsList & Context::getExportsList() const { return shared->exports_list; } ReplicatedFetchList & Context::getReplicatedFetchList() { return shared->replicated_fetch_list; } const ReplicatedFetchList & Context::getReplicatedFetchList() const { return shared->replicated_fetch_list; } RefreshSet & Context::getRefreshSet() { return shared->refresh_set; } @@ -3051,8 +3067,11 @@ void Context::setCurrentQueryId(const String & query_id) client_info.current_query_id = query_id_to_set; - if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY + && (getApplicationType() != ApplicationType::SERVER || client_info.initial_query_id.empty())) + { client_info.initial_query_id = client_info.current_query_id; + } } void Context::setBackgroundOperationTypeForContext(ClientInfo::BackgroundOperationType background_operation) @@ -3190,6 +3209,13 @@ void Context::makeQueryContextForMutate(const MergeTreeSettings & merge_tree_set = merge_tree_settings[MergeTreeSetting::mutation_workload].value.empty() ? 
getMutationWorkload() : merge_tree_settings[MergeTreeSetting::mutation_workload]; } +void Context::makeQueryContextForExportPart() +{ + makeQueryContext(); + classifier.reset(); // It is assumed that there are no active queries running using this classifier, otherwise this will lead to crashes + // Export part operations don't have a specific workload setting, so we leave the default workload +} + void Context::makeSessionContext() { session_context = shared_from_this(); @@ -4221,6 +4247,11 @@ ThrottlerPtr Context::getMergesThrottler() const return shared->merges_throttler; } +ThrottlerPtr Context::getExportsThrottler() const +{ + return shared->exports_throttler; +} + void Context::reloadRemoteThrottlerConfig(size_t read_bandwidth, size_t write_bandwidth) const { if (read_bandwidth) @@ -4898,7 +4929,6 @@ std::shared_ptr Context::getCluster(const std::string & cluster_name) c throw Exception(ErrorCodes::CLUSTER_DOESNT_EXIST, "Requested cluster '{}' not found", cluster_name); } - std::shared_ptr Context::tryGetCluster(const std::string & cluster_name) const { std::shared_ptr res = nullptr; @@ -4917,6 +4947,21 @@ std::shared_ptr Context::tryGetCluster(const std::string & cluster_name return res; } +void Context::unregisterInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->unregisterAll(); +} + +void Context::registerInAutodiscoveryClusters() +{ + std::lock_guard lock(shared->clusters_mutex); + if (!shared->cluster_discovery) + return; + shared->cluster_discovery->registerAll(); +} void Context::reloadClusterConfig() const { @@ -5833,12 +5878,35 @@ void Context::stopServers(const ServerType & server_type) const shared->stop_servers_callback(server_type); } - void Context::shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS { shared->shutdown(); } +bool Context::stopSwarmMode() +{ + bool expected_is_enabled = true; + bool is_stopped_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, false); + if (is_stopped_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); + // return true if stop successful + return is_stopped_now; +} + +bool Context::startSwarmMode() +{ + bool expected_is_enabled = false; + bool is_started_now = shared->swarm_mode_enabled.compare_exchange_strong(expected_is_enabled, true); + if (is_started_now) + CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 1); + // return true if start successful + return is_started_now; +} + +bool Context::isSwarmModeEnabled() const +{ + return shared->swarm_mode_enabled; +} Context::ApplicationType Context::getApplicationType() const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 626705099e22..28a228fdf843 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -89,6 +89,7 @@ class AsynchronousMetrics; class BackgroundSchedulePool; class MergeList; class MovesList; +class ExportsList; class ReplicatedFetchList; class RefreshSet; class Cluster; @@ -1134,6 +1135,7 @@ class Context: public ContextData, public std::enable_shared_from_this void makeQueryContext(); void makeQueryContextForMerge(const MergeTreeSettings & merge_tree_settings); void makeQueryContextForMutate(const MergeTreeSettings & merge_tree_settings); + void makeQueryContextForExportPart(); void makeSessionContext(); void makeGlobalContext(); @@ -1166,6 +1168,9 @@ class Context: public ContextData, public std::enable_shared_from_this MovesList & getMovesList(); const MovesList & getMovesList() const; + 
ExportsList & getExportsList(); + const ExportsList & getExportsList() const; + ReplicatedFetchList & getReplicatedFetchList(); const ReplicatedFetchList & getReplicatedFetchList() const; @@ -1319,6 +1324,8 @@ class Context: public ContextData, public std::enable_shared_from_this size_t getClustersVersion() const; void startClusterDiscovery(); + void registerInAutodiscoveryClusters(); + void unregisterInAutodiscoveryClusters(); /// Sets custom cluster, but doesn't update configuration void setCluster(const String & cluster_name, const std::shared_ptr & cluster); @@ -1434,6 +1441,15 @@ class Context: public ContextData, public std::enable_shared_from_this void shutdown(); + /// Stop some work to allow graceful shutdown later. + /// Returns true if stop successful. + bool stopSwarmMode(); + /// Resume some work if we change our mind. + /// Returns true if start successful. + bool startSwarmMode(); + /// Return current swarm mode state. + bool isSwarmModeEnabled() const; + bool isInternalQuery() const { return is_internal_query; } void setInternalQuery(bool internal) { is_internal_query = internal; } @@ -1660,6 +1676,7 @@ class Context: public ContextData, public std::enable_shared_from_this ThrottlerPtr getMutationsThrottler() const; ThrottlerPtr getMergesThrottler() const; + ThrottlerPtr getExportsThrottler() const; void reloadRemoteThrottlerConfig(size_t read_bandwidth, size_t write_bandwidth) const; void reloadLocalThrottlerConfig(size_t read_bandwidth, size_t write_bandwidth) const; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index b1922a694919..70454dd9a062 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -752,7 +752,8 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, const Stora alter->isFreezeAlter() || alter->isUnlockSnapshot() || alter->isMovePartitionToDiskOrVolumeAlter() || - alter->isCommentAlter()) + alter->isCommentAlter() || + alter->isExportPartAlter()) return false; } diff --git a/src/Interpreters/IcebergMetadataLog.cpp b/src/Interpreters/IcebergMetadataLog.cpp index 7388e55a4fcb..7f36e95b1bdf 100644 --- a/src/Interpreters/IcebergMetadataLog.cpp +++ b/src/Interpreters/IcebergMetadataLog.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -91,7 +92,7 @@ void IcebergMetadataLogElement::appendToBlock(MutableColumns & columns) const void insertRowToLogTable( const ContextPtr & local_context, - String row, + std::function get_row, IcebergMetadataLogLevel row_log_level, const String & table_path, const String & file_path, @@ -112,7 +113,7 @@ void insertRowToLogTable( .content_type = row_log_level, .table_path = table_path, .file_path = file_path, - .metadata_content = row, + .metadata_content = get_row(), .row_in_file = row_in_file, .pruning_status = pruning_status}); } diff --git a/src/Interpreters/IcebergMetadataLog.h b/src/Interpreters/IcebergMetadataLog.h index b43e2cfa47b2..8c48c563bbd3 100644 --- a/src/Interpreters/IcebergMetadataLog.h +++ b/src/Interpreters/IcebergMetadataLog.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace DB @@ -25,9 +26,11 @@ struct IcebergMetadataLogElement void appendToBlock(MutableColumns & columns) const; }; +/// Here the `get_row` function is used instead of a `row` string so that the string is calculated only when required. +/// Inside `insertRowToLogTable` the code can exit immediately after the `iceberg_metadata_log_level` setting check.
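The switch from a ready-made `row` string to a `get_row` callable is the usual lazy-argument pattern: the cost of building the string is only paid when the log-level check lets the row through. A self-contained sketch of the pattern with plain standard library types (logRow and LogLevel are illustrative, not the ClickHouse API):

```cpp
#include <functional>
#include <iostream>
#include <string>

enum class LogLevel { None = 0, Metadata = 1 };

// The row is passed as a callable, so serialization only happens when the
// configured level actually asks for it.
void logRow(LogLevel configured, LogLevel row_level, const std::function<std::string()> & get_row)
{
    if (configured < row_level)
        return;                       // cheap early exit, get_row() never runs
    std::cout << get_row() << '\n';
}

int main()
{
    auto expensive_row = [] { return std::string("{\"manifest\":\"...\"}"); }; // imagine a large JSON dump
    logRow(LogLevel::None, LogLevel::Metadata, expensive_row);     // skipped, nothing is serialized
    logRow(LogLevel::Metadata, LogLevel::Metadata, expensive_row); // serialized and printed
}
```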
void insertRowToLogTable( const ContextPtr & local_context, - String row, + std::function get_row, IcebergMetadataLogLevel row_log_level, const String & table_path, const String & file_path, diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 841e9b768d44..e280cd6318f9 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -539,6 +539,18 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS required_access.emplace_back(AccessType::ALTER_DELETE | AccessType::INSERT, database, table); break; } + case ASTAlterCommand::EXPORT_PART: + { + required_access.emplace_back(AccessType::ALTER_EXPORT_PART, database, table); + required_access.emplace_back(AccessType::INSERT, command.to_database, command.to_table); + break; + } + case ASTAlterCommand::EXPORT_PARTITION: + { + required_access.emplace_back(AccessType::ALTER_EXPORT_PARTITION, database, table); + required_access.emplace_back(AccessType::INSERT, command.to_database, command.to_table); + break; + } case ASTAlterCommand::FETCH_PARTITION: { required_access.emplace_back(AccessType::ALTER_FETCH_PARTITION, database, table); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0e859d775db3..f9b12506019d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1937,8 +1937,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, auto table_function_ast = create.as_table_function->ptr(); auto table_function = TableFunctionFactory::instance().get(table_function_ast, getContext()); - if (!table_function->canBeUsedToCreateTable()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}' cannot be used to create a table", table_function->getName()); + table_function->validateUseToCreateTable(); /// In case of CREATE AS table_function() query we should use global context /// in storage creation because there will be no query context on server startup diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 619a2799a0bf..76f7f850c73c 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -779,6 +779,9 @@ InterpreterInsertQuery::distributedWriteIntoReplicatedMergeTreeFromClusterStorag if (!src_storage_cluster) return {}; + if (src_storage_cluster->getOriginalClusterName().empty()) + return {}; + if (!isInsertSelectTrivialEnoughForDistributedExecution(query)) return {}; diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 51ea9051ddfc..4054b1e7211a 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -37,11 +38,17 @@ namespace Setting extern const SettingsUInt64 max_parser_depth; } +namespace ServerSetting +{ + extern const ServerSettingsBool enable_experimental_export_merge_tree_partition_feature; +} + namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int ACCESS_DENIED; extern const int NOT_IMPLEMENTED; + extern const int SUPPORT_IS_DISABLED; } @@ -250,6 +257,82 @@ BlockIO InterpreterKillQueryQuery::execute() break; } + case ASTKillQueryQuery::Type::ExportPartition: + { + if 
(!getContext()->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Exporting merge tree partition is experimental. Set the server setting `enable_experimental_export_merge_tree_partition_feature` to enable it"); + } + + Block exports_block = getSelectResult( + "source_database, source_table, transaction_id, destination_database, destination_table, partition_id", + "system.replicated_partition_exports"); + if (exports_block.empty()) + return res_io; + + const ColumnString & src_db_col = typeid_cast(*exports_block.getByName("source_database").column); + const ColumnString & src_table_col = typeid_cast(*exports_block.getByName("source_table").column); + const ColumnString & dst_db_col = typeid_cast(*exports_block.getByName("destination_database").column); + const ColumnString & dst_table_col = typeid_cast(*exports_block.getByName("destination_table").column); + const ColumnString & tx_col = typeid_cast(*exports_block.getByName("transaction_id").column); + + auto header = exports_block.cloneEmpty(); + header.insert(0, {ColumnString::create(), std::make_shared(), "kill_status"}); + + MutableColumns res_columns = header.cloneEmptyColumns(); + AccessRightsElements required_access_rights; + auto access = getContext()->getAccess(); + bool access_denied = false; + + for (size_t i = 0; i < exports_block.rows(); ++i) + { + const auto src_database = src_db_col.getDataAt(i).toString(); + const auto src_table = src_table_col.getDataAt(i).toString(); + const auto dst_database = dst_db_col.getDataAt(i).toView(); + const auto dst_table = dst_table_col.getDataAt(i).toView(); + + const auto table_id = StorageID{src_database, src_table}; + const auto transaction_id = tx_col.getDataAt(i).toString(); + + CancellationCode code = CancellationCode::Unknown; + if (!query.test) + { + auto storage = DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + if (!storage) + code = CancellationCode::NotFound; + else + { + ASTAlterCommand alter_command{}; + alter_command.type = ASTAlterCommand::EXPORT_PARTITION; + alter_command.move_destination_type = DataDestinationType::TABLE; + alter_command.from_database = src_database; + alter_command.from_table = src_table; + alter_command.to_database = dst_database; + alter_command.to_table = dst_table; + + required_access_rights = InterpreterAlterQuery::getRequiredAccessForCommand( + alter_command, table_id.database_name, table_id.table_name); + if (!access->isGranted(required_access_rights)) + { + access_denied = true; + continue; + } + code = storage->killExportPartition(transaction_id); + } + } + + insertResultRow(i, code, exports_block, header, res_columns); + } + + if (res_columns[0]->empty() && access_denied) + throw Exception(ErrorCodes::ACCESS_DENIED, "Not allowed to kill export partition. 
" + "To execute this query, it's necessary to have the grant {}", required_access_rights.toString()); + + res_io.pipeline = QueryPipeline(Pipe(std::make_shared(std::make_shared(header.cloneWithColumns(std::move(res_columns)))))); + + break; + } case ASTKillQueryQuery::Type::Mutation: { Block mutations_block = getSelectResult("database, table, mutation_id, command", "system.mutations"); @@ -462,6 +545,9 @@ AccessRightsElements InterpreterKillQueryQuery::getRequiredAccessForDDLOnCluster | AccessType::ALTER_MATERIALIZE_COLUMN | AccessType::ALTER_MATERIALIZE_TTL ); + /// todo arthur think about this + else if (query.type == ASTKillQueryQuery::Type::ExportPartition) + required_access.emplace_back(AccessType::ALTER_EXPORT_PARTITION); return required_access; } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index e0ffd41fcdd3..7b9aebf2433c 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ #include #include #include +#include #include #include @@ -195,6 +197,7 @@ namespace Setting extern const SettingsUInt64 max_rows_to_transfer; extern const SettingsOverflowMode transfer_overflow_mode; extern const SettingsString implicit_table_at_top_level; + extern const SettingsBool use_hive_partitioning; } namespace ServerSetting @@ -1976,6 +1979,22 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

(query_plan.getRootNode()->step.get())) + { + auto object_filter_step = std::make_unique( + query_plan.getCurrentHeader(), + expressions.before_where->dag.clone(), + getSelectQuery().where()->getColumnName()); + + object_filter_step->setStepDescription("WHERE"); + query_plan.addStep(std::move(object_filter_step)); + } + } + if (from_aggregation_stage) { /// No need to aggregate anything, since this was done on remote shards. diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index f02a6e8ddd6c..4aace28a6340 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,10 @@ #include #endif +#if USE_PARQUET +#include +#endif + #if USE_AWS_S3 #include #endif @@ -436,6 +441,22 @@ BlockIO InterpreterSystemQuery::execute() getContext()->clearQueryResultCache(query.query_result_cache_tag); break; } + case Type::DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE: + { + getContext()->checkAccess(AccessType::SYSTEM_DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE); + ObjectStorageListObjectsCache::instance().clear(); + break; + } + case Type::DROP_PARQUET_METADATA_CACHE: + { +#if USE_PARQUET + getContext()->checkAccess(AccessType::SYSTEM_DROP_PARQUET_METADATA_CACHE); + ParquetFileMetaDataCache::instance()->clear(); + break; +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "The server was compiled without the support for Parquet"); +#endif + } case Type::DROP_COMPILED_EXPRESSION_CACHE: #if USE_EMBEDDED_COMPILER getContext()->checkAccess(AccessType::SYSTEM_DROP_COMPILED_EXPRESSION_CACHE); @@ -700,6 +721,20 @@ BlockIO InterpreterSystemQuery::execute() case Type::START_MOVES: startStopAction(ActionLocks::PartsMove, true); break; + case Type::STOP_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->stopSwarmMode()) + getContext()->unregisterInAutodiscoveryClusters(); + break; + } + case Type::START_SWARM_MODE: + { + getContext()->checkAccess(AccessType::SYSTEM_SWARM); + if (getContext()->startSwarmMode()) + getContext()->registerInAutodiscoveryClusters(); + break; + } case Type::STOP_FETCHES: startStopAction(ActionLocks::PartsFetch, false); break; @@ -1573,6 +1608,8 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() case Type::DROP_PAGE_CACHE: case Type::DROP_SCHEMA_CACHE: case Type::DROP_FORMAT_SCHEMA_CACHE: + case Type::DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE: + case Type::DROP_PARQUET_METADATA_CACHE: case Type::DROP_S3_CLIENT_CACHE: { required_access.emplace_back(AccessType::SYSTEM_DROP_CACHE); @@ -1641,6 +1678,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() required_access.emplace_back(AccessType::SYSTEM_MOVES, query.getDatabase(), query.getTable()); break; } + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: + { + required_access.emplace_back(AccessType::SYSTEM_SWARM); + break; + } case Type::STOP_PULLING_REPLICATION_LOG: case Type::START_PULLING_REPLICATION_LOG: { diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 02c6f6e573b0..aca3b4cf6870 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -69,6 +69,7 @@ ColumnsDescription PartLogElement::getColumnsDescription() {"MovePart", static_cast(MOVE_PART)}, {"MergePartsStart", static_cast(MERGE_PARTS_START)}, {"MutatePartStart", static_cast(MUTATE_PART_START)}, + {"ExportPart", static_cast(EXPORT_PART)}, } 
); @@ -109,7 +110,8 @@ ColumnsDescription PartLogElement::getColumnsDescription() "RemovePart — Removing or detaching a data part using [DETACH PARTITION](/sql-reference/statements/alter/partition#detach-partitionpart)." "MutatePartStart — Mutating of a data part has started, " "MutatePart — Mutating of a data part has finished, " - "MovePart — Moving the data part from the one disk to another one."}, + "MovePart — Moving the data part from the one disk to another one, " + "ExportPart — Exporting the data part from a MergeTree table into a target table that represents external storage (e.g., object storage or a data lake)."}, {"merge_reason", std::move(merge_reason_datatype), "The reason for the event with type MERGE_PARTS. Can have one of the following values: " "NotAMerge — The current event has the type other than MERGE_PARTS, " diff --git a/src/Interpreters/PartLog.h b/src/Interpreters/PartLog.h index 44d2fb413c5f..4f58069dae55 100644 --- a/src/Interpreters/PartLog.h +++ b/src/Interpreters/PartLog.h @@ -30,6 +30,7 @@ struct PartLogElement MOVE_PART = 6, MERGE_PARTS_START = 7, MUTATE_PART_START = 8, + EXPORT_PART = 9, }; /// Copy of MergeAlgorithm since values are written to disk. diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 7ef39f82cc29..0eb3c4fcefc3 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -399,4 +399,15 @@ void RestoreQualifiedNamesMatcher::visit(ASTIdentifier & identifier, ASTPtr &, D } } +void ResetSemanticTableMatcher::visit(ASTPtr & ast, Data & data) +{ + if (auto * t = ast->as()) + visit(*t, ast, data); +} + +void ResetSemanticTableMatcher::visit(ASTIdentifier & identifier, ASTPtr &, Data &) +{ + identifier.resetSemanticTable(); +} + } diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.h b/src/Interpreters/TranslateQualifiedNamesVisitor.h index 00c85d08873f..becff4845755 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.h +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.h @@ -80,4 +80,33 @@ struct RestoreQualifiedNamesMatcher using RestoreQualifiedNamesVisitor = InDepthNodeVisitor; + +/// Reset semantic->table for all column identifiers in the AST. +/// +/// PROBLEM DESCRIPTION: +/// When an AST is passed through multiple query rewrites (e.g., in Hybrid -> remote), +/// the semantic->table information attached to ASTIdentifier nodes can become stale and +/// cause incorrect column qualification. This happens because: +/// +/// 1. During initial parsing, semantic->table is populated with the original table name +/// 2. When the query is rewritten (e.g., FROM clause changed from table to remote() function inside Hybrid), +/// the AST structure is modified but semantic->table information is preserved +/// 3. Subsequent visitors like RestoreQualifiedNamesVisitor called in remote() function over the same AST +/// may use this stale semantic->table information to incorrectly qualify column names with the original table name +/// +/// SOLUTION: +/// This visitor clears semantic->table for all column identifiers, ensuring that subsequent +/// visitors work with clean semantic information and don't apply stale table qualifications.
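For illustration, the fix described above amounts to walking the tree and dropping the cached owner before re-qualifying. A toy sketch of that idea with a hand-rolled node type (Identifier and resetSemanticTable here are stand-ins, not the real ASTIdentifier or InDepthNodeVisitor machinery):

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Toy identifier node: `semantic_table` mimics the cached semantic->table
// annotation that becomes stale once the FROM clause is rewritten.
struct Identifier
{
    std::string name;
    std::string semantic_table;
    std::vector<std::shared_ptr<Identifier>> children;
};

// Clear the cached table on every node so a later qualification pass starts
// from a clean slate instead of re-applying the old table name.
void resetSemanticTable(Identifier & node)
{
    node.semantic_table.clear();
    for (const auto & child : node.children)
        resetSemanticTable(*child);
}

int main()
{
    Identifier root{"x", "old_table", {std::make_shared<Identifier>(Identifier{"y", "old_table", {}})}};
    resetSemanticTable(root);
    std::cout << (root.semantic_table.empty() && root.children[0]->semantic_table.empty()) << '\n'; // 1
}
```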
+struct ResetSemanticTableMatcher +{ + // No data needed for this visitor + struct Data {}; + + static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; } + static void visit(ASTPtr & ast, Data & data); + static void visit(ASTIdentifier & identifier, ASTPtr &, Data & data); +}; + +using ResetSemanticTableVisitor = InDepthNodeVisitor; + } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 46ef1aaafee3..eb2315253a0f 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -53,6 +53,8 @@ bool isSupportedAlterTypeForOnClusterDDLQuery(int type) ASTAlterCommand::ATTACH_PARTITION, /// Usually followed by ATTACH PARTITION ASTAlterCommand::FETCH_PARTITION, + /// Data operation that should be executed locally on each replica + ASTAlterCommand::EXPORT_PART, /// Logical error ASTAlterCommand::NO_TYPE, }; diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 32f2156b5cde..57d81c92898f 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -358,6 +358,37 @@ void ASTAlterCommand::formatImpl(WriteBuffer & ostr, const FormatSettings & sett ostr << quoteString(move_destination_name); } } + else if (type == ASTAlterCommand::EXPORT_PART) + { + ostr << "EXPORT PART "; + partition->format(ostr, settings, state, frame); + ostr << " TO "; + switch (move_destination_type) + { + case DataDestinationType::TABLE: + ostr << "TABLE "; + if (!to_database.empty()) + { + ostr << backQuoteIfNeed(to_database) << "."; + } + ostr << backQuoteIfNeed(to_table); + return; + default: + break; + } + + } + else if (type == ASTAlterCommand::EXPORT_PARTITION) + { + ostr << "EXPORT PARTITION "; + partition->format(ostr, settings, state, frame); + ostr << " TO TABLE "; + if (!to_database.empty()) + { + ostr << backQuoteIfNeed(to_database) << "."; + } + ostr << backQuoteIfNeed(to_table); + } else if (type == ASTAlterCommand::REPLACE_PARTITION) { ostr << (replace ? "REPLACE" : "ATTACH") << " PARTITION " @@ -627,6 +658,11 @@ bool ASTAlterQuery::isMovePartitionToDiskOrVolumeAlter() const return false; } +bool ASTAlterQuery::isExportPartAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::EXPORT_PART); +} + /** Get the text that identifies this element. 
*/ String ASTAlterQuery::getID(char delim) const diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index 3867a86cf797..7683b2e11c3d 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -71,6 +71,8 @@ class ASTAlterCommand : public IAST FREEZE_ALL, UNFREEZE_PARTITION, UNFREEZE_ALL, + EXPORT_PART, + EXPORT_PARTITION, DELETE, UPDATE, @@ -263,6 +265,8 @@ class ASTAlterQuery : public ASTQueryWithTableAndOutput, public ASTQueryWithOnCl bool isMovePartitionToDiskOrVolumeAlter() const; + bool isExportPartAlter() const; + bool isCommentAlter() const; String getID(char) const override; diff --git a/src/Parsers/ASTIdentifier.cpp b/src/Parsers/ASTIdentifier.cpp index 490116cfc6cf..0767516aa74a 100644 --- a/src/Parsers/ASTIdentifier.cpp +++ b/src/Parsers/ASTIdentifier.cpp @@ -167,6 +167,17 @@ void ASTIdentifier::restoreTable() } } +void ASTIdentifier::resetSemanticTable() +{ + // Only reset semantic table for column identifiers (not table identifiers) + if (semantic && !semantic->special) + { + semantic->table.clear(); + semantic->can_be_alias = true; + semantic->membership = std::nullopt; + } +} + std::shared_ptr ASTIdentifier::createTable() const { if (name_parts.size() == 1) return std::make_shared(name_parts[0]); diff --git a/src/Parsers/ASTIdentifier.h b/src/Parsers/ASTIdentifier.h index 72dde7f644fb..3ea66264ca24 100644 --- a/src/Parsers/ASTIdentifier.h +++ b/src/Parsers/ASTIdentifier.h @@ -52,6 +52,7 @@ class ASTIdentifier : public ASTWithAlias void updateTreeHashImpl(SipHash & hash_state, bool ignore_alias) const override; void restoreTable(); // TODO(ilezhankin): get rid of this + void resetSemanticTable(); // Reset semantic to empty string (see ResetSemanticTableVisitor) std::shared_ptr createTable() const; // returns |nullptr| if identifier is not table. 
String full_name; diff --git a/src/Parsers/ASTKillQueryQuery.cpp b/src/Parsers/ASTKillQueryQuery.cpp index 0334b78d559e..9911e60b5ed9 100644 --- a/src/Parsers/ASTKillQueryQuery.cpp +++ b/src/Parsers/ASTKillQueryQuery.cpp @@ -27,6 +27,9 @@ void ASTKillQueryQuery::formatQueryImpl(WriteBuffer & ostr, const FormatSettings case Type::Transaction: ostr << "TRANSACTION"; break; + case Type::ExportPartition: + ostr << "EXPORT PARTITION"; + break; } formatOnCluster(ostr, settings); diff --git a/src/Parsers/ASTKillQueryQuery.h b/src/Parsers/ASTKillQueryQuery.h index 99a14c56d72b..13d2811534f0 100644 --- a/src/Parsers/ASTKillQueryQuery.h +++ b/src/Parsers/ASTKillQueryQuery.h @@ -13,6 +13,7 @@ class ASTKillQueryQuery : public ASTQueryWithOutput, public ASTQueryWithOnCluste { Query, /// KILL QUERY Mutation, /// KILL MUTATION + ExportPartition, /// KILL EXPORT_PARTITION PartMoveToShard, /// KILL PART_MOVE_TO_SHARD Transaction, /// KILL TRANSACTION }; diff --git a/src/Parsers/ASTSetQuery.cpp b/src/Parsers/ASTSetQuery.cpp index 94e2ee5fbfc2..f5962b64c954 100644 --- a/src/Parsers/ASTSetQuery.cpp +++ b/src/Parsers/ASTSetQuery.cpp @@ -129,7 +129,8 @@ void ASTSetQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & format, return true; } - if (DataLake::DATABASE_ENGINE_NAME == state.create_engine_name) + if (DataLake::DATABASE_ENGINE_NAME == state.create_engine_name + || DataLake::DATABASE_ALIAS_NAME == state.create_engine_name) { if (DataLake::SETTINGS_TO_HIDE.contains(change.name)) { diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 9cdd034f2ca3..00925e561219 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -163,7 +163,13 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti print_keyword("SYSTEM") << " "; print_keyword(typeToString(type)); - if (!cluster.empty()) + + std::unordered_set queries_with_on_cluster_at_end = { + Type::DROP_FILESYSTEM_CACHE, + Type::SYNC_FILESYSTEM_CACHE, + }; + + if (!queries_with_on_cluster_at_end.contains(type) && !cluster.empty()) formatOnCluster(ostr, settings); switch (type) @@ -484,6 +490,8 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::DROP_COMPILED_EXPRESSION_CACHE: case Type::DROP_S3_CLIENT_CACHE: case Type::DROP_ICEBERG_METADATA_CACHE: + case Type::DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE: + case Type::DROP_PARQUET_METADATA_CACHE: case Type::RESET_COVERAGE: case Type::RESTART_REPLICAS: case Type::JEMALLOC_PURGE: @@ -510,11 +518,16 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti case Type::DROP_PAGE_CACHE: case Type::STOP_REPLICATED_DDL_QUERIES: case Type::START_REPLICATED_DDL_QUERIES: + case Type::STOP_SWARM_MODE: + case Type::START_SWARM_MODE: break; case Type::UNKNOWN: case Type::END: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown SYSTEM command"); } + + if (queries_with_on_cluster_at_end.contains(type) && !cluster.empty()) + formatOnCluster(ostr, settings); } diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index cb21d3d12ba2..22a0dab53e25 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -41,6 +41,8 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster DROP_SCHEMA_CACHE, DROP_FORMAT_SCHEMA_CACHE, DROP_S3_CLIENT_CACHE, + DROP_OBJECT_STORAGE_LIST_OBJECTS_CACHE, + DROP_PARQUET_METADATA_CACHE, STOP_LISTEN, START_LISTEN, RESTART_REPLICAS, @@ -80,6 +82,8 @@ class ASTSystemQuery : public IAST, public 
ASTQueryWithOnCluster START_FETCHES, STOP_MOVES, START_MOVES, + STOP_SWARM_MODE, + START_SWARM_MODE, STOP_REPLICATED_SENDS, START_REPLICATED_SENDS, STOP_REPLICATION_QUEUES, diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index 057aad6fffea..c846a12eab02 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -332,6 +332,8 @@ namespace DB MR_MACROS(MONTHS, "MONTHS") \ MR_MACROS(MOVE_PART, "MOVE PART") \ MR_MACROS(MOVE_PARTITION, "MOVE PARTITION") \ + MR_MACROS(EXPORT_PART, "EXPORT PART") \ + MR_MACROS(EXPORT_PARTITION, "EXPORT PARTITION") \ MR_MACROS(MOVE, "MOVE") \ MR_MACROS(MS, "MS") \ MR_MACROS(MUTATION, "MUTATION") \ diff --git a/src/Parsers/FunctionSecretArgumentsFinder.h b/src/Parsers/FunctionSecretArgumentsFinder.h index 7e045dc636c6..b614dcb393f6 100644 --- a/src/Parsers/FunctionSecretArgumentsFinder.h +++ b/src/Parsers/FunctionSecretArgumentsFinder.h @@ -3,9 +3,12 @@ #include #include #include +#include +#include #include #include #include +#include namespace DB @@ -29,6 +32,21 @@ class AbstractFunction virtual ~Arguments() = default; virtual size_t size() const = 0; virtual std::unique_ptr at(size_t n) const = 0; + void skipArgument(size_t n) { skipped_indexes.insert(n); } + void unskipArguments() { skipped_indexes.clear(); } + size_t getRealIndex(size_t n) const + { + for (auto idx : skipped_indexes) + { + if (n < idx) + break; + ++n; + } + return n; + } + size_t skippedSize() const { return skipped_indexes.size(); } + private: + std::set skipped_indexes; }; virtual ~AbstractFunction() = default; @@ -75,14 +93,15 @@ class FunctionSecretArgumentsFinder { if (index >= function->arguments->size()) return; + auto real_index = function->arguments->getRealIndex(index); if (!result.count) { - result.start = index; + result.start = real_index; result.are_named = argument_is_named; } - chassert(index >= result.start); /// We always check arguments consecutively + chassert(real_index >= result.start); /// We always check arguments consecutively chassert(result.replacement.empty()); /// We shouldn't use replacement with masking other arguments - result.count = index + 1 - result.start; + result.count = real_index + 1 - result.start; if (!argument_is_named) result.are_named = false; } @@ -100,8 +119,12 @@ class FunctionSecretArgumentsFinder { findMongoDBSecretArguments(); } + else if (function->name() == "iceberg") + { + findIcebergFunctionSecretArguments(); + } else if ((function->name() == "s3") || (function->name() == "cosn") || (function->name() == "oss") || - (function->name() == "deltaLake") || (function->name() == "hudi") || (function->name() == "iceberg") || + (function->name() == "deltaLake") || (function->name() == "hudi") || (function->name() == "gcs") || (function->name() == "icebergS3")) { /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) @@ -242,6 +265,12 @@ class FunctionSecretArgumentsFinder findSecretNamedArgument("secret_access_key", 1); return; } + if (is_cluster_function && isNamedCollectionName(1)) + { + /// s3Cluster(cluster, named_collection, ..., secret_access_key = 'secret_access_key', ...) 
+ findSecretNamedArgument("secret_access_key", 2); + return; + } findSecretNamedArgument("secret_access_key", url_arg_idx); @@ -249,6 +278,7 @@ class FunctionSecretArgumentsFinder /// s3('url', NOSIGN, 'format' [, 'compression'] [, extra_credentials(..)] [, headers(..)]) /// s3('url', 'format', 'structure' [, 'compression'] [, extra_credentials(..)] [, headers(..)]) size_t count = excludeS3OrURLNestedMaps(); + if ((url_arg_idx + 3 <= count) && (count <= url_arg_idx + 4)) { String second_arg; @@ -313,6 +343,48 @@ class FunctionSecretArgumentsFinder markSecretArgument(url_arg_idx + 4); } + std::string findIcebergStorageType() + { + std::string storage_type = "s3"; + + size_t count = function->arguments->size(); + if (!count) + return storage_type; + + auto storage_type_idx = findNamedArgument(&storage_type, "storage_type"); + if (storage_type_idx != -1) + { + storage_type = Poco::toLower(storage_type); + function->arguments->skipArgument(storage_type_idx); + } + else if (isNamedCollectionName(0)) + { + std::string collection_name; + if (function->arguments->at(0)->tryGetString(&collection_name, true)) + { + NamedCollectionPtr collection = NamedCollectionFactory::instance().tryGet(collection_name); + if (collection && collection->has("storage_type")) + { + storage_type = Poco::toLower(collection->get("storage_type")); + } + } + } + + return storage_type; + } + + void findIcebergFunctionSecretArguments() + { + auto storage_type = findIcebergStorageType(); + + if (storage_type == "s3") + findS3FunctionSecretArguments(false); + else if (storage_type == "azure") + findAzureBlobStorageFunctionSecretArguments(false); + + function->arguments->unskipArguments(); + } + bool maskAzureConnectionString(ssize_t url_arg_idx, bool argument_is_named = false, size_t start = 0) { String url_arg; @@ -336,7 +408,7 @@ class FunctionSecretArgumentsFinder if (RE2::Replace(&url_arg, account_key_pattern, "AccountKey=[HIDDEN]\\1")) { chassert(result.count == 0); /// We shouldn't use replacement with masking other arguments - result.start = url_arg_idx; + result.start = function->arguments->getRealIndex(url_arg_idx); result.are_named = argument_is_named; result.count = 1; result.replacement = url_arg; @@ -495,6 +567,7 @@ class FunctionSecretArgumentsFinder void findTableEngineSecretArguments() { const String & engine_name = function->name(); + if (engine_name == "ExternalDistributed") { /// ExternalDistributed('engine', 'host:port', 'database', 'table', 'user', 'password') @@ -512,10 +585,13 @@ class FunctionSecretArgumentsFinder { findMongoDBSecretArguments(); } + else if (engine_name == "Iceberg") + { + findIcebergTableEngineSecretArguments(); + } else if ((engine_name == "S3") || (engine_name == "COSN") || (engine_name == "OSS") || (engine_name == "DeltaLake") || (engine_name == "Hudi") - || (engine_name == "Iceberg") || (engine_name == "IcebergS3") - || (engine_name == "S3Queue")) + || (engine_name == "IcebergS3") || (engine_name == "S3Queue")) { /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...) 
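The skipArgument/getRealIndex bookkeeping used by the Iceberg handling above is a sorted-set index remap: a logical position among the remaining arguments is shifted past every skipped real position at or below it. A small standalone sketch of the same mapping (realIndex is an illustrative name):

```cpp
#include <cassert>
#include <set>

// Map a logical index (counted over the non-skipped arguments) to the real
// index in the full argument list, stepping over every index in `skipped`.
size_t realIndex(size_t n, const std::set<size_t> & skipped)
{
    for (size_t idx : skipped)
    {
        if (n < idx)
            break;   // all remaining skipped indexes are beyond n
        ++n;         // shift past this skipped position
    }
    return n;
}

int main()
{
    // Arguments 0..4 with positions 1 and 3 skipped: logical 0,1,2 -> real 0,2,4.
    std::set<size_t> skipped{1, 3};
    assert(realIndex(0, skipped) == 0);
    assert(realIndex(1, skipped) == 2);
    assert(realIndex(2, skipped) == 4);
}
```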
findS3TableEngineSecretArguments(); @@ -524,7 +600,7 @@ class FunctionSecretArgumentsFinder { findURLSecretArguments(); } - else if (engine_name == "AzureBlobStorage" || engine_name == "AzureQueue") + else if (engine_name == "AzureBlobStorage" || engine_name == "AzureQueue" || engine_name == "IcebergAzure") { findAzureBlobStorageTableEngineSecretArguments(); } @@ -632,6 +708,18 @@ class FunctionSecretArgumentsFinder markSecretArgument(2); } + void findIcebergTableEngineSecretArguments() + { + auto storage_type = findIcebergStorageType(); + + if (storage_type == "s3") + findS3TableEngineSecretArguments(); + else if (storage_type == "azure") + findAzureBlobStorageTableEngineSecretArguments(); + + function->arguments->unskipArguments(); + } + void findDatabaseEngineSecretArguments() { const String & engine_name = function->name(); @@ -648,7 +736,7 @@ class FunctionSecretArgumentsFinder /// S3('url', 'access_key_id', 'secret_access_key') findS3DatabaseSecretArguments(); } - else if (engine_name == "DataLakeCatalog") + else if (engine_name == "DataLakeCatalog" || engine_name == "Iceberg") { findDataLakeCatalogSecretArguments(); } diff --git a/src/Parsers/FunctionSecretArgumentsFinderAST.h b/src/Parsers/FunctionSecretArgumentsFinderAST.h index a260c0d58da6..3624d7a7e87b 100644 --- a/src/Parsers/FunctionSecretArgumentsFinderAST.h +++ b/src/Parsers/FunctionSecretArgumentsFinderAST.h @@ -54,10 +54,13 @@ class FunctionAST : public AbstractFunction { public: explicit ArgumentsAST(const ASTs * arguments_) : arguments(arguments_) {} - size_t size() const override { return arguments ? arguments->size() : 0; } + size_t size() const override + { /// size without skipped indexes + return arguments ? arguments->size() - skippedSize() : 0; + } std::unique_ptr at(size_t n) const override - { - return std::make_unique(arguments->at(n).get()); + { /// n is a relative index; some arguments can be skipped + return std::make_unique(arguments->at(getRealIndex(n)).get()); + } private: const ASTs * arguments = nullptr; diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 4bb76c0d2e4b..eff14253b97f 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -82,6 +82,8 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_forget_partition(Keyword::FORGET_PARTITION); ParserKeyword s_move_partition(Keyword::MOVE_PARTITION); ParserKeyword s_move_part(Keyword::MOVE_PART); + ParserKeyword s_export_part(Keyword::EXPORT_PART); + ParserKeyword s_export_partition(Keyword::EXPORT_PARTITION); ParserKeyword s_drop_detached_partition(Keyword::DROP_DETACHED_PARTITION); ParserKeyword s_drop_detached_part(Keyword::DROP_DETACHED_PART); ParserKeyword s_fetch_partition(Keyword::FETCH_PARTITION); @@ -535,6 +537,39 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected command->move_destination_name = ast_space_name->as().value.safeGet(); } + else if (s_export_part.ignore(pos, expected)) + { + if (!parser_string_and_substituion.parse(pos, command_partition, expected)) + return false; + + command->type = ASTAlterCommand::EXPORT_PART; + command->part = true; + + if (!s_to_table.ignore(pos, expected)) + { + return false; + } + + if (!parseDatabaseAndTableName(pos, expected, command->to_database, command->to_table)) + return false; + command->move_destination_type = DataDestinationType::TABLE; + } + else if (s_export_partition.ignore(pos, expected)) + { + if (!parser_partition.parse(pos, command_partition, expected)) +
return false; + + command->type = ASTAlterCommand::EXPORT_PARTITION; + + if (!s_to_table.ignore(pos, expected)) + { + return false; + } + + if (!parseDatabaseAndTableName(pos, expected, command->to_database, command->to_table)) + return false; + command->move_destination_type = DataDestinationType::TABLE; + } else if (s_move_partition.ignore(pos, expected)) { if (!parser_partition.parse(pos, command_partition, expected)) diff --git a/src/Parsers/ParserKillQueryQuery.cpp b/src/Parsers/ParserKillQueryQuery.cpp index 55bd5100009e..7e06ae8d30b7 100644 --- a/src/Parsers/ParserKillQueryQuery.cpp +++ b/src/Parsers/ParserKillQueryQuery.cpp @@ -17,6 +17,7 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect ParserKeyword p_kill{Keyword::KILL}; ParserKeyword p_query{Keyword::QUERY}; ParserKeyword p_mutation{Keyword::MUTATION}; + ParserKeyword p_export_partition{Keyword::EXPORT_PARTITION}; ParserKeyword p_part_move_to_shard{Keyword::PART_MOVE_TO_SHARD}; ParserKeyword p_transaction{Keyword::TRANSACTION}; ParserKeyword p_on{Keyword::ON}; @@ -33,6 +34,8 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect query->type = ASTKillQueryQuery::Type::Query; else if (p_mutation.ignore(pos, expected)) query->type = ASTKillQueryQuery::Type::Mutation; + else if (p_export_partition.ignore(pos, expected)) + query->type = ASTKillQueryQuery::Type::ExportPartition; else if (p_part_move_to_shard.ignore(pos, expected)) query->type = ASTKillQueryQuery::Type::PartMoveToShard; else if (p_transaction.ignore(pos, expected)) diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index 2bdb558646ee..dd5785f1e6d2 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include @@ -144,6 +146,7 @@ namespace Setting extern const SettingsUInt64 max_rows_to_transfer; extern const SettingsOverflowMode transfer_overflow_mode; extern const SettingsBool enable_parallel_blocks_marshalling; + extern const SettingsBool use_hive_partitioning; } namespace ServerSetting @@ -473,6 +476,19 @@ void addFilterStep( query_plan.addStep(std::move(where_step)); } +void addObjectFilterStep(QueryPlan & query_plan, + FilterAnalysisResult & filter_analysis_result, + const std::string & step_description) +{ + auto actions = std::move(filter_analysis_result.filter_actions->dag); + + auto where_step = std::make_unique(query_plan.getCurrentHeader(), + std::move(actions), + filter_analysis_result.filter_column_name); + where_step->setStepDescription(step_description); + query_plan.addStep(std::move(where_step)); +} + Aggregator::Params getAggregatorParams(const PlannerContextPtr & planner_context, const AggregationAnalysisResult & aggregation_analysis_result, const QueryAnalysisResult & query_analysis_result, @@ -1794,6 +1810,16 @@ void Planner::buildPlanForQueryNode() if (query_processing_info.isSecondStage() || query_processing_info.isFromAggregationState()) { + if (settings[Setting::use_hive_partitioning] + && !query_processing_info.isFirstStage() + && expression_analysis_result.hasWhere()) + { + if (typeid_cast(query_plan.getRootNode()->step.get())) + { + addObjectFilterStep(query_plan, expression_analysis_result.getWhere(), "WHERE"); + } + } + if (query_processing_info.isFromAggregationState()) { /// Aggregation was performed on remote shards diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp index 
3527f275d6ee..57482c45a714 100644 --- a/src/Planner/PlannerActionsVisitor.cpp +++ b/src/Planner/PlannerActionsVisitor.cpp @@ -348,7 +348,7 @@ class ActionNodeNameHelper } default: { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {}", node->formatASTForErrorMessage()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid action query tree node {} (node_type: {})", node->formatASTForErrorMessage(), static_cast(node_type)); } } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index dff3ebfb14c2..568865c881cf 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -1370,7 +1370,9 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres /// Overall, IStorage::read -> FetchColumns returns normal column names (except Distributed, which is inconsistent) /// Interpreter::getQueryPlan -> FetchColumns returns identifiers (why?) and this the reason for the bug ^ in Distributed /// Hopefully there is no other case when we read from Distributed up to FetchColumns. - if (table_node && table_node->getStorage()->isRemote() && select_query_options.to_stage == QueryProcessingStage::FetchColumns) + if (table_node && table_node->getStorage()->isRemote()) + updated_actions_dag_outputs.push_back(output_node); + else if (table_function_node && table_function_node->getStorage()->isRemote()) updated_actions_dag_outputs.push_back(output_node); } else diff --git a/src/Processors/Chunk.cpp b/src/Processors/Chunk.cpp index e27762f53dc4..34402cd58249 100644 --- a/src/Processors/Chunk.cpp +++ b/src/Processors/Chunk.cpp @@ -108,7 +108,12 @@ void Chunk::addColumn(ColumnPtr column) void Chunk::addColumn(size_t position, ColumnPtr column) { - if (position >= columns.size()) + if (position == columns.size()) + { + addColumn(column); + return; + } + if (position > columns.size()) throw Exception(ErrorCodes::POSITION_OUT_OF_BOUND, "Position {} out of bound in Chunk::addColumn(), max position = {}", position, !columns.empty() ? columns.size() - 1 : 0); diff --git a/src/Processors/Formats/IInputFormat.cpp b/src/Processors/Formats/IInputFormat.cpp index f7449b2c8728..4311e8bcaf97 100644 --- a/src/Processors/Formats/IInputFormat.cpp +++ b/src/Processors/Formats/IInputFormat.cpp @@ -6,6 +6,17 @@ namespace DB { +ChunkInfoRowNumbers::ChunkInfoRowNumbers(size_t row_num_offset_, std::optional applied_filter_) + : row_num_offset(row_num_offset_), applied_filter(std::move(applied_filter_)) { } + +ChunkInfoRowNumbers::Ptr ChunkInfoRowNumbers::clone() const +{ + auto res = std::make_shared(row_num_offset); + if (applied_filter.has_value()) + res->applied_filter.emplace(applied_filter->begin(), applied_filter->end()); + return res; +} + IInputFormat::IInputFormat(SharedHeader header, ReadBuffer * in_) : ISource(std::move(header)), in(in_) { column_mapping = std::make_shared(); diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 31a7e816f24a..cb5feea18646 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include namespace DB @@ -13,13 +15,35 @@ namespace DB struct SelectQueryInfo; using ColumnMappingPtr = std::shared_ptr; - -struct ChunkInfoRowNumOffset : public ChunkInfoCloneable +using IColumnFilter = PaddedPODArray; + +/// Most (all?) file formats have a natural order of rows within the file. 
+/// But our format readers and query pipeline may reorder or filter rows. This struct is used to +/// propagate the original row numbers, e.g. for _row_number virtual column or for iceberg +/// positional deletes. +/// +/// Warning: we currently don't correctly update this info in most transforms. E.g. things like +/// FilterTransform and SortingTransform logically should remove this ChunkInfo, but don't; we don't +/// have a mechanism to systematically find all code sites that would need to do that or to detect +/// if one was missed. +/// So this is only used in a few specific situations, and the builder of query pipeline must be +/// careful to never put a step that uses this info after a step that breaks it. +/// +/// If row numbers in a chunk are consecutive, this contains just the first row number. +/// If row numbers are not consecutive as a result of filtering, this additionally contains the mask +/// that was used for filtering, from which row numbers can be recovered. +struct ChunkInfoRowNumbers : public ChunkInfo { - ChunkInfoRowNumOffset(const ChunkInfoRowNumOffset & other) = default; - explicit ChunkInfoRowNumOffset(size_t row_num_offset_) : row_num_offset(row_num_offset_) { } + explicit ChunkInfoRowNumbers(size_t row_num_offset_, std::optional applied_filter_ = std::nullopt); + + Ptr clone() const override; const size_t row_num_offset; + /// If nullopt, row numbers are consecutive. + /// If not empty, the number of '1' elements is equal to the number of rows in the chunk; + /// row i in the chunk has row number: + /// row_num_offset + {index of the i-th '1' element in applied_filter}. + std::optional applied_filter; }; /** Input format is a source, that reads data from ReadBuffer. @@ -79,6 +103,9 @@ class IInputFormat : public ISource void needOnlyCount() { need_only_count = true; } + /// Set additional info/key/id related to underlying storage of the ReadBuffer + virtual void setStorageRelatedUniqueKey(const Settings & /*settings*/, const String & /*key*/) {} + protected: ReadBuffer & getReadBuffer() const { chassert(in); return *in; } diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index 4dae97e0d58b..3e73df4ebf3d 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -1556,7 +1556,18 @@ Chunk ArrowColumnToCHColumn::arrowTableToCHChunk( auto arrow_field = table->schema()->GetFieldByName(column_name); if (parquet_columns_to_clickhouse) - column_name = parquet_columns_to_clickhouse->at(column_name); + { + auto column_name_it = parquet_columns_to_clickhouse->find(column_name); + if (column_name_it == parquet_columns_to_clickhouse->end()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Column '{}' is not present in input data. 
Column name mapping has {} columns", + column_name, + parquet_columns_to_clickhouse->size()); + } + column_name = column_name_it->second; + } if (case_insensitive_matching) boost::to_lower(column_name); diff --git a/src/Processors/Formats/Impl/Parquet/Decoding.cpp b/src/Processors/Formats/Impl/Parquet/Decoding.cpp index a3cfb47ee8f9..af8475afdcf9 100644 --- a/src/Processors/Formats/Impl/Parquet/Decoding.cpp +++ b/src/Processors/Formats/Impl/Parquet/Decoding.cpp @@ -745,7 +745,7 @@ struct ByteStreamSplitDecoder : public PageDecoder bool PageDecoderInfo::canReadDirectlyIntoColumn(parq::Encoding::type encoding, size_t num_values, IColumn & col, std::span & out) const { - if (encoding == parq::Encoding::PLAIN && fixed_size_converter && fixed_size_converter->isTrivial()) + if (encoding == parq::Encoding::PLAIN && fixed_size_converter && physical_type != parq::Type::BOOLEAN && fixed_size_converter->isTrivial()) { chassert(col.sizeOfValueIfFixed() == fixed_size_converter->input_size); out = col.insertRawUninitialized(num_values); @@ -1417,7 +1417,7 @@ void GeoConverter::convertColumn(std::span chars, const UInt64 * off { col.reserve(col.size() + num_values); chassert(chars.size() >= offsets[num_values - 1]); - for (size_t i = 0; i < num_values; ++i) + for (ssize_t i = 0; i < ssize_t(num_values); ++i) { char * ptr = const_cast(chars.data() + offsets[i - 1]); size_t length = offsets[i] - offsets[i - 1] - separator_bytes; diff --git a/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp b/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp index 0ba59cbe9a02..3e05029060fa 100644 --- a/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp +++ b/src/Processors/Formats/Impl/Parquet/Prefetcher.cpp @@ -15,6 +15,11 @@ namespace DB::ErrorCodes extern const int CANNOT_READ_ALL_DATA; } +namespace ProfileEvents +{ + extern const Event ParquetFetchWaitTimeMicroseconds; +} + namespace DB::Parquet { @@ -409,16 +414,24 @@ std::span Prefetcher::getRangeData(const PrefetchHandle & request) const RequestState * req = request.request; chassert(req->state == RequestState::State::HasTask); Task * task = req->task; - auto s = task->state.load(std::memory_order_acquire); - if (s == Task::State::Scheduled) + Task::State s = task->state.load(std::memory_order_acquire); + if (s == Task::State::Scheduled || s == Task::State::Running) { - s = runTask(task); - chassert(s != Task::State::Scheduled); - } - if (s == Task::State::Running) - { - task->completion.wait(); - s = task->state.load(); + Stopwatch wait_time; + + if (s == Task::State::Scheduled) + { + s = runTask(task); + chassert(s != Task::State::Scheduled); + } + + if (s == Task::State::Running) // (not `else`, the runTask above may return Running) + { + task->completion.wait(); + s = task->state.load(); + } + + ProfileEvents::increment(ProfileEvents::ParquetFetchWaitTimeMicroseconds, wait_time.elapsedMicroseconds()); } if (s == Task::State::Exception) rethrowException(task); diff --git a/src/Processors/Formats/Impl/Parquet/ReadManager.cpp b/src/Processors/Formats/Impl/Parquet/ReadManager.cpp index 0c83f9a03c0e..0cea3ce8e407 100644 --- a/src/Processors/Formats/Impl/Parquet/ReadManager.cpp +++ b/src/Processors/Formats/Impl/Parquet/ReadManager.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -17,6 +18,8 @@ namespace ProfileEvents { extern const Event ParquetDecodingTasks; extern const Event ParquetDecodingTaskBatches; + extern const Event ParquetReadRowGroups; + extern const Event ParquetPrunedRowGroups; } namespace DB::Parquet @@ -45,6 +48,9 @@ void 
ReadManager::init(FormatParserSharedResourcesPtr parser_shared_resources_) reader.prefilterAndInitRowGroups(); reader.preparePrewhere(); + ProfileEvents::increment(ProfileEvents::ParquetReadRowGroups, reader.row_groups.size()); + ProfileEvents::increment(ProfileEvents::ParquetPrunedRowGroups, reader.file_metadata.row_groups.size() - reader.row_groups.size()); + size_t num_row_groups = reader.row_groups.size(); for (size_t i = size_t(ReadStage::NotStarted) + 1; i < size_t(ReadStage::Deliver); ++i) { @@ -202,9 +208,10 @@ void ReadManager::finishRowGroupStage(size_t row_group_idx, ReadStage stage, Mem { diff.scheduleAllStages(); - if (i + 1 == reader.row_groups.size()) + /// Notify read() if everything is done or if it's relying on + /// first_incomplete_row_group to deliver chunks in order. + if (i + 1 == reader.row_groups.size() || reader.options.format.parquet.preserve_order) { - /// Notify read() that everything is done. { /// Lock and unlock to avoid race condition on condition variable. /// (Otherwise the notify_all() may happen after read() saw the old @@ -349,14 +356,20 @@ void ReadManager::finishRowSubgroupStage(size_t row_group_idx, size_t row_subgro } case ReadStage::MainData: { - size_t prev = row_group.read_ptr.exchange(row_subgroup_idx + 1); - chassert(prev == row_subgroup_idx); - advanced_read_ptr = prev + 1; row_subgroup.stage.store(ReadStage::Deliver, std::memory_order::relaxed); + + /// Must add to delivery_queue before advancing read_ptr to deliver subgroups in order. + /// (If we advanced read_ptr first, another thread could start and finish reading the + /// next subgroup before we add this one to delivery_queue, and ReadManager::read could + /// pick up the later subgroup before we add this one.) { std::lock_guard lock(delivery_mutex); delivery_queue.push(Task {.stage = ReadStage::Deliver, .row_group_idx = row_group_idx, .row_subgroup_idx = row_subgroup_idx}); } + + size_t prev = row_group.read_ptr.exchange(row_subgroup_idx + 1); + chassert(prev == row_subgroup_idx); + advanced_read_ptr = prev + 1; delivery_cv.notify_one(); break; // proceed to advancing read_ptr } @@ -816,13 +829,13 @@ void ReadManager::clearColumnChunk(ColumnChunk & column, MemoryUsageDiff & diff) void ReadManager::clearRowSubgroup(RowSubgroup & row_subgroup, MemoryUsageDiff & diff) { - row_subgroup.filter.memory.reset(&diff); + row_subgroup.filter.clear(&diff); row_subgroup.output.clear(); for (ColumnSubchunk & col : row_subgroup.columns) col.column_and_offsets_memory.reset(&diff); } -std::tuple ReadManager::read() +ReadManager::ReadResult ReadManager::read() { Task task; { @@ -835,9 +848,15 @@ std::tuple ReadManager::read() if (exception) std::rethrow_exception(exception); - if (!delivery_queue.empty()) + /// If `preserve_order`, only deliver chunks from `first_incomplete_row_group`. + /// This ensures that row groups are delivered in order. Within a row group, row + /// subgroups are read and added to `delivery_queue` in order. + if (!delivery_queue.empty() && + (!reader.options.format.parquet.preserve_order || + delivery_queue.top().row_group_idx == + first_incomplete_row_group.load(std::memory_order_relaxed))) { - task = delivery_queue.front(); + task = delivery_queue.top(); delivery_queue.pop(); break; } @@ -846,7 +865,10 @@ std::tuple ReadManager::read() { /// All done. Check for memory accounting leaks. /// First join the threads because they might still be decrementing memory_usage. 
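The preserve_order gating above follows a common pattern: subgroups may finish decoding out of order, so completions are buffered in a priority queue and only released while the smallest queued index matches the next index the consumer expects. A self-contained sketch of that gating, not the real ReadManager (which keys the queue by row group and subgroup and gates on first_incomplete_row_group):

```cpp
#include <functional>
#include <iostream>
#include <queue>
#include <vector>

// Deliver items in index order even though they finish out of order:
// buffer completions in a min-heap and only release the next expected index.
int main()
{
    std::priority_queue<size_t, std::vector<size_t>, std::greater<size_t>> completed;
    size_t next_expected = 0;

    for (size_t finished : {2, 0, 3, 1})   // completion order
    {
        completed.push(finished);
        while (!completed.empty() && completed.top() == next_expected)
        {
            std::cout << "deliver " << completed.top() << '\n';
            completed.pop();
            ++next_expected;
        }
    }
    // prints: deliver 0, deliver 1, deliver 2, deliver 3
}
```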
+ lock.unlock(); shutdown->shutdown(); + lock.lock(); + for (const RowGroup & row_group : reader.row_groups) { chassert(row_group.stage.load(std::memory_order_relaxed) == ReadStage::Deallocated); @@ -892,7 +914,8 @@ std::tuple ReadManager::read() } } - auto & row_subgroup = reader.row_groups.at(task.row_group_idx).subgroups.at(task.row_subgroup_idx); + RowGroup & row_group = reader.row_groups.at(task.row_group_idx); + RowSubgroup & row_subgroup = row_group.subgroups.at(task.row_subgroup_idx); chassert(row_subgroup.stage == ReadStage::Deliver); size_t num_final_columns = reader.sample_block->columns(); for (size_t i = 0; i < reader.output_columns.size(); ++i) @@ -912,12 +935,44 @@ std::tuple ReadManager::read() Chunk chunk(std::move(row_subgroup.output), row_subgroup.filter.rows_pass); BlockMissingValues block_missing_values = std::move(row_subgroup.block_missing_values); + auto row_numbers_info = std::make_shared( + row_subgroup.start_row_idx + row_group.start_global_row_idx); + if (row_subgroup.filter.rows_pass != row_subgroup.filter.rows_total) + { + chassert(row_subgroup.filter.rows_pass > 0); + chassert(!row_subgroup.filter.filter.empty()); + chassert(std::accumulate(row_subgroup.filter.filter.begin(), row_subgroup.filter.filter.end(), size_t(0)) == chunk.getNumRows()); + + row_numbers_info->applied_filter = std::move(row_subgroup.filter.filter); + } + chunk.getChunkInfos().add(std::move(row_numbers_info)); + + /// This is a terrible hack to make progress indication kind of work. + /// + /// TODO: Fix progress bar in many ways: + /// 1. use number of rows instead of bytes; + /// don't lie about number of bytes read (getApproxBytesReadForChunk()), + /// 2. estimate total rows to read after filtering row groups; + /// for rows filtered out by PREWHERE, either report them as read or reduce the + /// estimate of number of rows to read (make it signed), + /// 3. report uncompressed deserialized IColumn bytes instead of file bytes, for + /// consistency with MergeTree reads, + /// 4. correctly extrapolate progress when reading many files in sequence, e.g. + /// file('part{1..1000}.parquet'), + /// 5. correctly merge progress info when a query reads both from MergeTree and files, or + /// parquet and text files. + /// Probably get rid of getApproxBytesReadForChunk() and use the existing + /// ISource::progress()/addTotalRowsApprox instead. + /// For (4) and (5), either add things to struct Progress or make progress bar use + /// ProfileEvents instead of Progress. + size_t virtual_bytes_read = size_t(row_group.meta->total_compressed_size) * row_subgroup.filter.rows_total / std::max(size_t(1), size_t(row_group.meta->num_rows)); + /// This updates `memory_usage` of previous stages, which may allow more tasks to be scheduled. MemoryUsageDiff diff(ReadStage::Deliver); finishRowSubgroupStage(task.row_group_idx, task.row_subgroup_idx, ReadStage::Deliver, diff); flushMemoryUsageDiff(std::move(diff)); - return {std::move(chunk), std::move(block_missing_values)}; + return {std::move(chunk), std::move(block_missing_values), virtual_bytes_read}; } } diff --git a/src/Processors/Formats/Impl/Parquet/ReadManager.h b/src/Processors/Formats/Impl/Parquet/ReadManager.h index 43b55873265e..291f81a2cb4a 100644 --- a/src/Processors/Formats/Impl/Parquet/ReadManager.h +++ b/src/Processors/Formats/Impl/Parquet/ReadManager.h @@ -44,8 +44,15 @@ class ReadManager ~ReadManager(); + struct ReadResult + { + Chunk chunk; + BlockMissingValues block_missing_values; + size_t virtual_bytes_read = 0; + }; + /// Not thread safe. 
- std::tuple read(); + ReadResult read(); void cancel() noexcept; @@ -63,6 +70,14 @@ class ReadManager size_t row_subgroup_idx = UINT64_MAX; size_t column_idx = UINT64_MAX; size_t cost_estimate_bytes = 0; + + struct Comparator + { + bool operator()(const Task & x, const Task & y) const + { + return std::make_tuple(x.row_group_idx, x.row_subgroup_idx) > std::make_tuple(y.row_group_idx, y.row_subgroup_idx); + } + }; }; struct Stage @@ -91,7 +106,7 @@ class ReadManager std::atomic first_incomplete_row_group {0}; std::mutex delivery_mutex; - std::queue delivery_queue; + std::priority_queue, Task::Comparator> delivery_queue; std::condition_variable delivery_cv; std::exception_ptr exception; diff --git a/src/Processors/Formats/Impl/Parquet/Reader.cpp b/src/Processors/Formats/Impl/Parquet/Reader.cpp index 924c337f0aa5..fdd0a433d789 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.cpp +++ b/src/Processors/Formats/Impl/Parquet/Reader.cpp @@ -274,6 +274,7 @@ void Reader::prefilterAndInitRowGroups() SchemaConverter schemer(file_metadata, options, &extended_sample_block); if (prewhere_info && !prewhere_info->remove_prewhere_column) schemer.external_columns.push_back(prewhere_info->prewhere_column_name); + schemer.column_mapper = format_filter_info->column_mapper.get(); schemer.prepareForReading(); primitive_columns = std::move(schemer.primitive_columns); total_primitive_columns_in_file = schemer.primitive_column_idx; @@ -281,30 +282,33 @@ void Reader::prefilterAndInitRowGroups() /// Precalculate some column index mappings. - sample_block_to_output_columns_idx.resize(extended_sample_block.columns(), UINT64_MAX); + sample_block_to_output_columns_idx.resize(extended_sample_block.columns()); for (size_t i = 0; i < output_columns.size(); ++i) { const auto & idx = output_columns[i].idx_in_output_block; if (idx.has_value()) { - chassert(sample_block_to_output_columns_idx.at(*idx) == UINT64_MAX); + chassert(!sample_block_to_output_columns_idx.at(*idx).has_value()); sample_block_to_output_columns_idx.at(*idx) = i; } } - chassert(std::all_of(sample_block_to_output_columns_idx.begin(), sample_block_to_output_columns_idx.end(), [](size_t x) { return x != UINT64_MAX; })); if (format_filter_info->key_condition) { for (size_t idx_in_output_block : format_filter_info->key_condition->getUsedColumns()) { - size_t output_idx = sample_block_to_output_columns_idx.at(idx_in_output_block); - const OutputColumnInfo & output_info = output_columns[output_idx]; + const auto & output_idx = sample_block_to_output_columns_idx.at(idx_in_output_block); + if (!output_idx.has_value()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "KeyCondition uses PREWHERE output"); + const OutputColumnInfo & output_info = output_columns[output_idx.value()]; + if (output_info.is_primitive) primitive_columns[output_info.primitive_start].used_by_key_condition = idx_in_output_block; } } /// Populate row_groups. Skip row groups based on column chunk min/max statistics. 
+ size_t total_rows = 0; for (size_t row_group_idx = 0; row_group_idx < file_metadata.row_groups.size(); ++row_group_idx) { const auto * meta = &file_metadata.row_groups[row_group_idx]; @@ -313,6 +317,8 @@ void Reader::prefilterAndInitRowGroups() if (meta->columns.size() != total_primitive_columns_in_file) throw Exception(ErrorCodes::INCORRECT_DATA, "Row group {} has unexpected number of columns: {} != {}", row_group_idx, meta->columns.size(), total_primitive_columns_in_file); + total_rows += size_t(meta->num_rows); // before potentially skipping the row group + Hyperrectangle hyperrectangle(extended_sample_block.columns(), Range::createWholeUniverse()); if (options.format.parquet.filter_push_down && format_filter_info->key_condition) { @@ -325,6 +331,7 @@ void Reader::prefilterAndInitRowGroups() RowGroup & row_group = row_groups.emplace_back(); row_group.meta = meta; row_group.row_group_idx = row_group_idx; + row_group.start_global_row_idx = total_rows - size_t(meta->num_rows); row_group.columns.resize(primitive_columns.size()); row_group.hyperrectangle = std::move(hyperrectangle); @@ -359,7 +366,11 @@ void Reader::prefilterAndInitRowGroups() const auto & column_conditions = static_cast(format_filter_info->opaque.get())->column_conditions; for (const auto & [idx_in_output_block, key_condition] : column_conditions) { - const OutputColumnInfo & output_info = output_columns[sample_block_to_output_columns_idx.at(idx_in_output_block)]; + const auto & output_idx = sample_block_to_output_columns_idx.at(idx_in_output_block); + if (!output_idx.has_value()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column condition uses PREWHERE output"); + const OutputColumnInfo & output_info = output_columns[output_idx.value()]; + if (!output_info.is_primitive) continue; primitive_columns[output_info.primitive_start].column_index_condition = key_condition.get(); @@ -598,44 +609,47 @@ void Reader::initializePrefetches() void Reader::preparePrewhere() { PrewhereInfoPtr prewhere_info = format_filter_info->prewhere_info; - if (!prewhere_info) - return; + if (prewhere_info) + { + /// TODO [parquet]: We currently run prewhere after reading all prewhere columns of the row + /// subgroup, in one thread per row group. Instead, we could extract single-column conditions + /// and run them after decoding the corresponding columns, in parallel. + /// (Still run multi-column conditions, like `col1 = 42 or col2 = 'yes'`, after reading all columns.) + /// Probably reuse tryBuildPrewhereSteps from MergeTree for splitting the expression. - /// TODO [parquet]: We currently run prewhere after reading all prewhere columns of the row - /// subgroup, in one thread per row group. Instead, we could extract single-column conditions - /// and run them after decoding the corresponding columns, in parallel. - /// (Still run multi-column conditions, like `col1 = 42 or col2 = 'yes'`, after reading all columns.) - /// Probably reuse tryBuildPrewhereSteps from MergeTree for splitting the expression. - /// Convert ActionsDAG to ExpressionActions. - ExpressionActionsSettings actions_settings; - if (prewhere_info->row_level_filter.has_value()) - { - ExpressionActions actions(prewhere_info->row_level_filter->clone(), actions_settings); + /// Convert ActionsDAG to ExpressionActions. 
+ ExpressionActionsSettings actions_settings; + if (prewhere_info->row_level_filter.has_value()) + { + ExpressionActions actions(prewhere_info->row_level_filter->clone(), actions_settings); + prewhere_steps.push_back(PrewhereStep + { + .actions = std::move(actions), + .result_column_name = prewhere_info->row_level_column_name, + }); + } + ExpressionActions actions(prewhere_info->prewhere_actions.clone(), actions_settings); prewhere_steps.push_back(PrewhereStep { .actions = std::move(actions), - .result_column_name = prewhere_info->row_level_column_name + .result_column_name = prewhere_info->prewhere_column_name, + .need_filter = prewhere_info->need_filter, }); + if (!prewhere_info->remove_prewhere_column) + prewhere_steps.back().idx_in_output_block = sample_block->getPositionByName(prewhere_info->prewhere_column_name); } - ExpressionActions actions(prewhere_info->prewhere_actions.clone(), actions_settings); - prewhere_steps.push_back(PrewhereStep - { - .actions = std::move(actions), - .result_column_name = prewhere_info->prewhere_column_name, - .need_filter = prewhere_info->need_filter, - }); - if (!prewhere_info->remove_prewhere_column) - prewhere_steps.back().idx_in_output_block = sample_block->getPositionByName(prewhere_info->prewhere_column_name); - /// Look up expression inputs in extended_sample_block. for (PrewhereStep & step : prewhere_steps) { for (const auto & col : step.actions.getRequiredColumnsWithTypes()) { size_t idx_in_output_block = extended_sample_block.getPositionByName(col.name, /* case_insensitive= */ false); - size_t output_idx = sample_block_to_output_columns_idx.at(idx_in_output_block); - OutputColumnInfo & output_info = output_columns[output_idx]; + const auto & output_idx = sample_block_to_output_columns_idx.at(idx_in_output_block); + if (!output_idx.has_value()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "PREWHERE appears to use its own output as input"); + OutputColumnInfo & output_info = output_columns[output_idx.value()]; + output_info.use_prewhere = true; bool only_for_prewhere = idx_in_output_block >= sample_block->columns(); @@ -645,7 +659,21 @@ void Reader::preparePrewhere() primitive_columns[primitive_idx].only_for_prewhere = only_for_prewhere; } - step.input_column_idxs.push_back(output_idx); + step.input_column_idxs.push_back(output_idx.value()); + } + } + + /// Assert that sample_block_to_output_columns_idx is valid. + for (size_t i = 0; i < sample_block_to_output_columns_idx.size(); ++i) + { + /// (`prewhere_steps` has at most two elements) + size_t is_prewhere_output = std::count_if(prewhere_steps.begin(), prewhere_steps.end(), + [&](const PrewhereStep & step) { return step.idx_in_output_block == i; }); + if (is_prewhere_output > 1 || + /// Column must appear in exactly one of {output_columns, prewhere output}. 
+ sample_block_to_output_columns_idx[i].has_value() != !is_prewhere_output) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column in sample block: {}", extended_sample_block.getByPosition(i).name); } } } @@ -970,7 +998,8 @@ void Reader::intersectColumnIndexResultsAndInitSubgroups(RowGroup & row_group) bytes_per_row += estimateColumnMemoryBytesPerRow(row_group.columns.at(i), row_group, primitive_columns.at(i)); size_t n = size_t(options.format.parquet.prefer_block_bytes / std::max(bytes_per_row, 1.)); - rows_per_subgroup = std::min(rows_per_subgroup, std::max(n, 1ul)); + n = std::max(n, size_t(128)); // avoid super tiny blocks if something is wrong with stats + rows_per_subgroup = std::min(rows_per_subgroup, n); } chassert(rows_per_subgroup > 0); diff --git a/src/Processors/Formats/Impl/Parquet/Reader.h b/src/Processors/Formats/Impl/Parquet/Reader.h index 43dd76857696..899d4371b0c4 100644 --- a/src/Processors/Formats/Impl/Parquet/Reader.h +++ b/src/Processors/Formats/Impl/Parquet/Reader.h @@ -24,13 +24,10 @@ namespace DB::Parquet { // TODO [parquet]: -// * column_mapper -// * find a way to make this compatible at all with our implementation of iceberg positioned deletes: https://github.com/ClickHouse/ClickHouse/pull/83094 (prewhere causes nonconsecutive row idxs in chunk) -// * allow_geoparquet_parser +// * either multistage PREWHERE or make query optimizer selectively move parts of the condition to prewhere instead of the whole condition // * test on files from https://github.com/apache/parquet-testing // * check fields for false sharing, add cacheline padding as needed // * make sure userspace page cache read buffer supports readBigAt -// * assert that memory usage is zero at the end, the reset()s are easy to miss // * support newer parquet versions: https://github.com/apache/parquet-format/blob/master/CHANGES.md // * make writer write DataPageV2 // * make writer write PageEncodingStats @@ -38,8 +35,6 @@ namespace DB::Parquet // * try adding [[unlikely]] to all ifs // * try adding __restrict to pointers on hot paths // * support or deprecate the preserve-order setting -// * stats (reuse the ones from the other PR?) -// - number of row groups that were split into chunks // * add comments everywhere // * progress indication and estimating bytes to read; allow negative total_bytes_to_read? // * cache FileMetaData in something like SchemaCache @@ -157,7 +152,7 @@ struct Reader size_t column_idx; /// Index in parquet `schema` (in FileMetaData). size_t schema_idx; - String name; + String name; // possibly mapped by ColumnMapper (e.g. using iceberg metadata) PageDecoderInfo decoder; DataTypePtr raw_decoded_type; // not Nullable @@ -193,7 +188,7 @@ struct Reader struct OutputColumnInfo { - String name; + String name; // possibly mapped by ColumnMapper /// Range in primitive_columns. size_t primitive_start = 0; size_t primitive_end = 0; @@ -390,6 +385,7 @@ struct Reader const parq::RowGroup * meta; size_t row_group_idx; // in parquet file + size_t start_global_row_idx = 0; // total number of rows in preceding row groups in the file /// Parallel to Reader::primitive_columns. /// NOT parallel to `meta.columns` (it's a subset of parquet columns). @@ -455,8 +451,10 @@ struct Reader size_t total_primitive_columns_in_file = 0; std::vector output_columns; /// Maps idx_in_output_block to index in output_columns. 
I.e.: - /// sample_block_to_output_columns_idx[output_columns[i].idx_in_output_block] = i - std::vector sample_block_to_output_columns_idx; + /// sample_block_to_output_columns_idx[output_columns[i].idx_in_output_block] = i + /// nullopt if the column is produced by PREWHERE expression: + /// prewhere_steps[?].idx_in_output_block == i + std::vector> sample_block_to_output_columns_idx; /// sample_block with maybe some columns added at the end. /// The added columns are used as inputs to prewhere expression, then discarded. diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp index 2232adc7d42e..714ec0f1c61d 100644 --- a/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp +++ b/src/Processors/Formats/Impl/Parquet/SchemaConverter.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -25,6 +27,7 @@ namespace DB::ErrorCodes extern const int TOO_DEEP_RECURSION; extern const int NOT_IMPLEMENTED; extern const int THERE_IS_NO_COLUMN; + extern const int ICEBERG_SPECIFICATION_VIOLATION; } namespace DB::Parquet @@ -65,7 +68,10 @@ void SchemaConverter::prepareForReading() /// DFS the schema tree. size_t top_level_columns = size_t(file_metadata.schema.at(0).num_children); for (size_t i = 0; i < top_level_columns; ++i) - processSubtree("", /*requested*/ false, /*type_hint*/ nullptr, SchemaContext::None); + { + TraversalNode node; + processSubtree(node); + } /// Check that all requested columns were found. std::vector found_columns(sample_block->columns()); @@ -109,33 +115,52 @@ NamesAndTypesList SchemaConverter::inferSchema() NamesAndTypesList res; for (size_t i = 0; i < top_level_columns; ++i) { - std::optional idx = processSubtree("", /*requested*/ true, /*type_hint*/ nullptr, SchemaContext::None); - if (idx.has_value()) + TraversalNode node; + node.requested = true; + processSubtree(node); + if (node.output_idx.has_value()) { - const OutputColumnInfo & col = output_columns.at(idx.value()); + const OutputColumnInfo & col = output_columns.at(node.output_idx.value()); res.emplace_back(col.name, col.type); } } return res; } -std::optional SchemaConverter::processSubtree(String name, bool requested, DataTypePtr type_hint, SchemaContext schema_context) +std::string_view SchemaConverter::useColumnMapperIfNeeded(const parq::SchemaElement & element) const +{ + if (!column_mapper) + return element.name; + const auto & map = column_mapper->getFieldIdToClickHouseName(); + if (!element.__isset.field_id) + { + /// Does iceberg require that parquet files have field ids? + /// Our iceberg writer currently doesn't write them. + //throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Missing field_id for column {}", element.name); + return element.name; + } + auto it = map.find(element.field_id); + if (it == map.end()) + throw Exception(ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "Parquet file has column {} with field_id {} that is not in datalake metadata", element.name, element.field_id); + auto split = Nested::splitName(std::string_view(it->second), /*reverse=*/ true); + return split.second.empty() ? 
split.first : split.second; +} + +void SchemaConverter::processSubtree(TraversalNode & node) { - if (type_hint) - chassert(requested); + if (node.type_hint) + chassert(node.requested); if (schema_idx >= file_metadata.schema.size()) throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parquet schema tree"); - const parq::SchemaElement & element = file_metadata.schema.at(schema_idx); + node.element = &file_metadata.schema.at(schema_idx); schema_idx += 1; std::optional idx_in_output_block; size_t wrap_in_arrays = 0; - if (schema_context == SchemaContext::None) + if (node.schema_context == SchemaContext::None) { - if (!name.empty()) - name += "."; - name += element.name; + node.appendNameComponent(node.element->name, useColumnMapperIfNeeded(*node.element)); if (sample_block) { @@ -143,24 +168,24 @@ std::optional SchemaConverter::processSubtree(String name, bool requeste /// E.g.: /// insert into function file('t.parquet') select [(10,20,30)] as x; /// select * from file('t.parquet', Parquet, '`x.2` Array(UInt8)'); -- outputs [20] - std::optional pos = sample_block->findPositionByName(name, options.format.parquet.case_insensitive_column_matching); + std::optional pos = sample_block->findPositionByName(node.name, options.format.parquet.case_insensitive_column_matching); if (pos.has_value()) { - if (requested) - throw Exception(ErrorCodes::COLUMN_QUERIED_MORE_THAN_ONCE, "Requested column {} is part of another requested column", name); + if (node.requested) + throw Exception(ErrorCodes::COLUMN_QUERIED_MORE_THAN_ONCE, "Requested column {} is part of another requested column", node.getNameForLogging()); - requested = true; - name = sample_block->getByPosition(pos.value()).name; // match case - type_hint = sample_block->getByPosition(pos.value()).type; + node.requested = true; + node.name = sample_block->getByPosition(pos.value()).name; // match case + node.type_hint = sample_block->getByPosition(pos.value()).type; for (size_t i = 1; i < levels.size(); ++i) { if (levels[i].is_array) { - const DataTypeArray * array = typeid_cast(type_hint.get()); + const DataTypeArray * array = typeid_cast(node.type_hint.get()); if (!array) - throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of nested column {} doesn't match parquet schema: parquet type is Array, requested type is {}", name, type_hint->getName()); - type_hint = array->getNestedType(); + throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of nested column {} doesn't match parquet schema: parquet type is Array, requested type is {}", node.getNameForLogging(), node.type_hint->getName()); + node.type_hint = array->getNestedType(); wrap_in_arrays += 1; } } @@ -178,62 +203,60 @@ std::optional SchemaConverter::processSubtree(String name, bool requeste levels.resize(prev_levels_size); }); - if (element.repetition_type != parq::FieldRepetitionType::REQUIRED) + if (node.element->repetition_type != parq::FieldRepetitionType::REQUIRED) { LevelInfo prev = levels.back(); if (prev.def == UINT8_MAX) - throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Parquet column {} has extremely deeply nested (>255 levels) arrays or nullables", name); + throw Exception(ErrorCodes::TOO_DEEP_RECURSION, "Parquet column {} has extremely deeply nested (>255 levels) arrays or nullables", node.getNameForLogging()); auto level = LevelInfo {.def = UInt8(prev.def + 1), .rep = prev.rep}; - if (element.repetition_type == parq::FieldRepetitionType::REPEATED) + if (node.element->repetition_type == parq::FieldRepetitionType::REPEATED) { level.rep += 1; // no overflow, rep <= def 
level.is_array = true; /// We'll first process schema for array element type, then wrap it in Array type. - if (type_hint) + if (node.type_hint) { - const DataTypeArray * array_type = typeid_cast(type_hint.get()); + const DataTypeArray * array_type = typeid_cast(node.type_hint.get()); if (!array_type) - throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Array, requested type is {}", name, type_hint->getName()); - type_hint = array_type->getNestedType(); + throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Array, requested type is {}", node.getNameForLogging(), node.type_hint->getName()); + node.type_hint = array_type->getNestedType(); } } chassert(level.def == levels.size()); levels.push_back(level); } - std::optional output_idx; // index in output_columns - /// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md - if (!processSubtreePrimitive(name, requested, type_hint, schema_context, element, output_idx) && - !processSubtreeMap(name, requested, type_hint, schema_context, element, output_idx) && - !processSubtreeArrayOuter(name, requested, type_hint, schema_context, element, output_idx) && - !processSubtreeArrayInner(name, requested, type_hint, schema_context, element, output_idx)) + if (!processSubtreePrimitive(node) && + !processSubtreeMap(node) && + !processSubtreeArrayOuter(node) && + !processSubtreeArrayInner(node)) { - processSubtreeTuple(name, requested, type_hint, schema_context, element, output_idx); + processSubtreeTuple(node); } - if (!output_idx.has_value()) - return std::nullopt; - if (!requested) - return std::nullopt; // we just needed to recurse to children, not interested in output_idx + if (!node.output_idx.has_value()) + return; + if (!node.requested) + return; // we just needed to recurse to children, not interested in output_idx auto make_array = [&](UInt8 rep) { size_t array_idx = output_columns.size(); OutputColumnInfo & array = output_columns.emplace_back(); - const OutputColumnInfo & array_element = output_columns.at(*output_idx); - array.name = name; + const OutputColumnInfo & array_element = output_columns.at(node.output_idx.value()); + array.name = node.name; array.primitive_start = array_element.primitive_start; array.primitive_end = primitive_columns.size(); array.type = std::make_shared(array_element.type); - array.nested_columns = {*output_idx}; + array.nested_columns = {*node.output_idx}; array.rep = rep; - output_idx = array_idx; + node.output_idx = array_idx; }; - if (element.repetition_type == parq::FieldRepetitionType::REPEATED) + if (node.element->repetition_type == parq::FieldRepetitionType::REPEATED) { /// Array of some kind. Can be a child of List or Map, or a standalone repeated field. /// We dispatch all 3 cases to this one code path to minimize probability of bugs. 
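
The wrapping driven by `make_array` above is easier to see in isolation: every enclosing REPEATED level adds one `Array(...)` layer around the element's type, outermost wrapper last. A minimal standalone sketch of that idea (hypothetical names, plain standard C++ rather than the ClickHouse type system):

```cpp
#include <cstdint>
#include <iostream>
#include <string>

/// Wrap a leaf type name in one Array(...) layer per enclosing REPEATED level,
/// mirroring the repeated make_array calls above (one call per level).
std::string wrapInArrays(std::string type_name, uint8_t repeated_levels)
{
    for (uint8_t i = 0; i < repeated_levels; ++i)
        type_name = "Array(" + type_name + ")";
    return type_name;
}

int main()
{
    /// repeated group xs { repeated int32 x }  ->  Array(Array(Int32))
    std::cout << wrapInArrays("Int32", 2) << '\n';
}
```
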
@@ -248,30 +271,28 @@ std::optional SchemaConverter::processSubtree(String name, bool requeste for (size_t i = 0; i < wrap_in_arrays; ++i) make_array(levels[prev_levels_size - 1].rep - i); - output_columns[*output_idx].idx_in_output_block = idx_in_output_block; + output_columns[node.output_idx.value()].idx_in_output_block = idx_in_output_block; } - - return output_idx; } -bool SchemaConverter::processSubtreePrimitive(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx) +bool SchemaConverter::processSubtreePrimitive(TraversalNode & node) { /// `parquet.thrift` says "[num_children] is not set when the element is a primitive type". /// If it's set but has value 0, logically it would make sense to interpret it as empty tuple/struct. /// But in practice some writers are sloppy about it and set this field to 0 (rather than unset) /// for primitive columns. E.g. /// tests/queries/0_stateless/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet - bool is_primitive = !element.__isset.num_children || (element.num_children == 0 && element.__isset.type); + bool is_primitive = !node.element->__isset.num_children || (node.element->num_children == 0 && node.element->__isset.type); if (!is_primitive) return false; primitive_column_idx += 1; - if (!requested) + if (!node.requested) return true; - if (!element.__isset.type) - throw Exception(ErrorCodes::INCORRECT_DATA, "Parquet metadata is missing physical type for column {}", element.name); + if (!node.element->__isset.type) + throw Exception(ErrorCodes::INCORRECT_DATA, "Parquet metadata is missing physical type for column {}", node.getNameForLogging()); - DataTypePtr primitive_type_hint = type_hint; + DataTypePtr primitive_type_hint = node.type_hint; bool output_nullable = false; bool output_nullable_if_not_json = false; if (primitive_type_hint) @@ -287,7 +308,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste } } /// Force map key to be non-nullable because clickhouse Map doesn't support nullable map key. - else if (!options.schema_inference_force_not_nullable && schema_context != SchemaContext::MapKey) + else if (!options.schema_inference_force_not_nullable && node.schema_context != SchemaContext::MapKey) { if (levels.back().is_array == false) { @@ -306,7 +327,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste } } - auto geo_it = geo_columns.find(name); + auto geo_it = geo_columns.find(node.getParquetName()); auto geo_metadata = geo_it == geo_columns.end() ? 
std::nullopt : std::optional(geo_it->second); DataTypePtr inferred_type; @@ -314,7 +335,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste PageDecoderInfo decoder; try { - processPrimitiveColumn(element, primitive_type_hint, decoder, raw_decoded_type, inferred_type, geo_metadata); + processPrimitiveColumn(*node.element, primitive_type_hint, decoder, raw_decoded_type, inferred_type, geo_metadata); } catch (Exception & e) { @@ -325,7 +346,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste } else { - e.addMessage("column '" + name + "'"); + e.addMessage("column '" + node.getNameForLogging() + "'"); throw; } } @@ -341,7 +362,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste PrimitiveColumnInfo & primitive = primitive_columns.emplace_back(); primitive.column_idx = primitive_column_idx - 1; primitive.schema_idx = schema_idx - 1; - primitive.name = name; + primitive.name = node.name; primitive.levels = levels; primitive.output_nullable = output_nullable || (output_nullable_if_not_json && !typeid_cast(inferred_type.get())); primitive.decoder = std::move(decoder); @@ -350,9 +371,9 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste if (level.is_array) primitive.max_array_def = level.def; - output_idx = output_columns.size(); + node.output_idx = output_columns.size(); OutputColumnInfo & output = output_columns.emplace_back(); - output.name = name; + output.name = node.name; output.primitive_start = primitive_idx; output.primitive_end = primitive_idx + 1; output.is_primitive = true; @@ -367,7 +388,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste inferred_type = std::make_shared(inferred_type); } - primitive.final_type = type_hint ? type_hint : inferred_type; + primitive.final_type = node.type_hint ? node.type_hint : inferred_type; primitive.needs_cast = !primitive.final_type->equals(*primitive.intermediate_type); output.type = primitive.final_type; @@ -375,7 +396,7 @@ bool SchemaConverter::processSubtreePrimitive(const String & name, bool requeste return true; } -bool SchemaConverter::processSubtreeMap(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx) +bool SchemaConverter::processSubtreeMap(TraversalNode & node) { /// Map, aka Array(Tuple(2)). /// required group `name` (MAP or MAP_KEY_VALUE): @@ -383,18 +404,18 @@ bool SchemaConverter::processSubtreeMap(const String & name, bool requested, Dat /// reqiured "key" /// "value" - if (element.converted_type != parq::ConvertedType::MAP && element.converted_type != parq::ConvertedType::MAP_KEY_VALUE && !element.logicalType.__isset.MAP) + if (node.element->converted_type != parq::ConvertedType::MAP && node.element->converted_type != parq::ConvertedType::MAP_KEY_VALUE && !node.element->logicalType.__isset.MAP) return false; /// If an element is declared as MAP, but doesn't have the expected structure of children /// and grandchildren, we fall back to interpreting it as array of tuples, as if there were /// no MAP annotation on it. Also fall back if Tuple type was requested /// (presumably `Tuple(Array(Tuple(key, value))` - a literal interpretation of the schema tree) /// (not to be confused with the case when `Array(Tuple(key, value))` was requested). 
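
One part of that fallback, the explicit `Array(Tuple(...))` request skipping the Map wrapper, can be sketched on its own; a rough illustration with hypothetical names (the structural checks on children and grandchildren are omitted, and this is not the actual SchemaConverter API):

```cpp
#include <iostream>
#include <optional>
#include <string>

enum class MapReadMode { AsMap, AsArrayOfTuples };

/// Sketch of the fallback rule described above: a MAP-annotated group is normally
/// read as Map, but an explicit Array(Tuple(...)) request skips the Map wrapper.
MapReadMode chooseMapReadMode(const std::optional<std::string> & requested_type)
{
    if (requested_type && requested_type->rfind("Array(", 0) == 0)
        return MapReadMode::AsArrayOfTuples;
    return MapReadMode::AsMap;
}

int main()
{
    std::cout << (chooseMapReadMode(std::nullopt) == MapReadMode::AsMap) << '\n';
    std::cout << (chooseMapReadMode(std::string{"Array(Tuple(String, Int64))"})
                  == MapReadMode::AsArrayOfTuples) << '\n';
}
```
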
- if (schema_context != SchemaContext::None && schema_context != SchemaContext::ListElement) + if (node.schema_context != SchemaContext::None && node.schema_context != SchemaContext::ListElement) return false; - if (typeid_cast(type_hint.get())) + if (typeid_cast(node.type_hint.get())) return false; - if (element.num_children != 1) + if (node.element->num_children != 1) return false; const parq::SchemaElement & child = file_metadata.schema.at(schema_idx); if (child.repetition_type != parq::FieldRepetitionType::REPEATED || child.num_children != 2) @@ -402,52 +423,54 @@ bool SchemaConverter::processSubtreeMap(const String & name, bool requested, Dat DataTypePtr array_type_hint; bool no_map = false; // return plain Array(Tuple) instead of Map - if (type_hint) + if (node.type_hint) { - if (const DataTypeMap * map_type = typeid_cast(type_hint.get())) + if (const DataTypeMap * map_type = typeid_cast(node.type_hint.get())) { array_type_hint = map_type->getNestedType(); } - else if (typeid_cast(type_hint.get())) + else if (typeid_cast(node.type_hint.get())) { - array_type_hint = type_hint; + array_type_hint = node.type_hint; no_map = true; } else { - throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Map, requested type is {}", name, type_hint->getName()); + throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Map, requested type is {}", node.getNameForLogging(), node.type_hint->getName()); } } /// (MapTupleAsPlainTuple is needed to skip a level in the column name: it changes /// `my_map.key_value.key` to `my_map.key`. - auto array_idx = processSubtree(name, requested, array_type_hint, no_map ? SchemaContext::MapTupleAsPlainTuple : SchemaContext::MapTuple); + TraversalNode subnode = node.prepareToRecurse(no_map ? SchemaContext::MapTupleAsPlainTuple : SchemaContext::MapTuple, array_type_hint); + processSubtree(subnode); - if (!requested || !array_idx.has_value()) + if (!node.requested || !subnode.output_idx.has_value()) return true; + size_t array_idx = subnode.output_idx.value(); /// Support explicitly requesting Array(Tuple) type for map columns. Useful e.g. if the map /// key type is something that's not allowed as Map key in clickhouse. if (no_map) { - output_idx = array_idx; + node.output_idx = array_idx; } else { - output_idx = output_columns.size(); + node.output_idx = output_columns.size(); OutputColumnInfo & output = output_columns.emplace_back(); - const OutputColumnInfo & array = output_columns.at(array_idx.value()); + const OutputColumnInfo & array = output_columns.at(array_idx); - output.name = name; + output.name = node.name; output.primitive_start = array.primitive_start; output.primitive_end = array.primitive_end; output.type = std::make_shared(array.type); - output.nested_columns = {array_idx.value()}; + output.nested_columns = {array_idx}; } return true; } -bool SchemaConverter::processSubtreeArrayOuter(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx) +bool SchemaConverter::processSubtreeArrayOuter(TraversalNode & node) { /// Array: /// required group `name` (List): @@ -458,46 +481,48 @@ bool SchemaConverter::processSubtreeArrayOuter(const String & name, bool request /// across two levels of recursion: processSubtreeArrayOuter for the outer wrapper, /// processSubtreeArrayInner for the inner wrapper. 
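
For reference, the canonical LIST shape being detected here (an outer LIST-annotated group, one REPEATED middle group, one element child) can be checked on a toy schema node; a simplified sketch with hypothetical structs, not the parquet thrift types used above:

```cpp
#include <iostream>
#include <vector>

/// Toy schema node with just enough fields to express the 3-level LIST shape.
struct Node
{
    bool list_annotated = false;  // LIST logical/converted type on the outer group
    bool repeated = false;        // FieldRepetitionType::REPEATED
    std::vector<Node> children;
};

/// Outer + inner checks in one place: a LIST group must have exactly one REPEATED
/// child, which in turn has exactly one child (the element).
bool looksLikeCanonicalList(const Node & outer)
{
    return outer.list_annotated
        && outer.children.size() == 1
        && outer.children[0].repeated
        && outer.children[0].children.size() == 1;
}

int main()
{
    Node element;
    Node middle;
    middle.repeated = true;
    middle.children.push_back(element);
    Node outer;
    outer.list_annotated = true;
    outer.children.push_back(middle);
    std::cout << std::boolalpha << looksLikeCanonicalList(outer) << '\n';  // true
}
```
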
- if (element.converted_type != parq::ConvertedType::LIST && !element.logicalType.__isset.LIST) + if (node.element->converted_type != parq::ConvertedType::LIST && !node.element->logicalType.__isset.LIST) return false; - if (schema_context != SchemaContext::None && schema_context != SchemaContext::ListElement) + if (node.schema_context != SchemaContext::None && node.schema_context != SchemaContext::ListElement) return false; - if (element.num_children != 1) + if (node.element->num_children != 1) return false; const parq::SchemaElement & child = file_metadata.schema.at(schema_idx); if (child.repetition_type != parq::FieldRepetitionType::REPEATED || child.num_children != 1) return false; - auto array_idx = processSubtree(name, requested, type_hint, SchemaContext::ListTuple); + TraversalNode subnode = node.prepareToRecurse(SchemaContext::ListTuple, node.type_hint); + processSubtree(subnode); - if (!requested || !array_idx.has_value()) + if (!node.requested || !subnode.output_idx.has_value()) return true; - output_idx = array_idx; + node.output_idx = subnode.output_idx; return true; } -bool SchemaConverter::processSubtreeArrayInner(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx) +bool SchemaConverter::processSubtreeArrayInner(TraversalNode & node) { - if (schema_context != SchemaContext::ListTuple) + if (node.schema_context != SchemaContext::ListTuple) return false; /// Array (middle schema element). - chassert(element.repetition_type == parq::FieldRepetitionType::REPEATED && - element.num_children == 1); // caller checked this + chassert(node.element->repetition_type == parq::FieldRepetitionType::REPEATED && + node.element->num_children == 1); // caller checked this /// (type_hint is already unwrapped to be element type, because of REPEATED) - auto element_idx = processSubtree(name, requested, type_hint, SchemaContext::ListElement); + TraversalNode subnode = node.prepareToRecurse(SchemaContext::ListElement, node.type_hint); + processSubtree(subnode); - if (!requested || !element_idx.has_value()) + if (!node.requested || !subnode.output_idx.has_value()) return true; - output_idx = element_idx; + node.output_idx = subnode.output_idx; return true; } -void SchemaConverter::processSubtreeTuple(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx) +void SchemaConverter::processSubtreeTuple(TraversalNode & node) { /// Tuple (possibly a Map key_value tuple): /// (required|optional) group `name`: @@ -505,9 +530,9 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D /// `name2` /// ... 
- const DataTypeTuple * tuple_type_hint = typeid_cast(type_hint.get()); - if (type_hint && !tuple_type_hint) - throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Tuple, requested type is {}", name, type_hint->getName()); + const DataTypeTuple * tuple_type_hint = typeid_cast(node.type_hint.get()); + if (node.type_hint && !tuple_type_hint) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Tuple, requested type is {}", node.getNameForLogging(), node.type_hint->getName()); /// 3 modes: /// * If type_hint has element names, we match elements from parquet to elements from type @@ -521,10 +546,10 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D bool lookup_by_name = false; std::vector elements; - if (type_hint) + if (node.type_hint) { if (tuple_type_hint->hasExplicitNames() && !tuple_type_hint->getElements().empty() && - schema_context != SchemaContext::MapTuple) + node.schema_context != SchemaContext::MapTuple) { /// Allow reading a subset of tuple elements, matched by name, possibly reordered. lookup_by_name = true; @@ -532,16 +557,16 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D } else { - if (tuple_type_hint->getElements().size() != size_t(element.num_children)) - throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Tuple with {} elements, requested type is Tuple with {} elements", name, element.num_children, tuple_type_hint->getElements().size()); + if (tuple_type_hint->getElements().size() != size_t(node.element->num_children)) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Requested type of column {} doesn't match parquet schema: parquet type is Tuple with {} elements, requested type is Tuple with {} elements", node.getNameForLogging(), node.element->num_children, tuple_type_hint->getElements().size()); } } - if (!lookup_by_name && requested) - elements.resize(size_t(element.num_children), UINT64_MAX); + if (!lookup_by_name && node.requested) + elements.resize(size_t(node.element->num_children), UINT64_MAX); Strings names; DataTypes types; - if (!type_hint && requested) + if (!node.type_hint && node.requested) { names.resize(elements.size()); types.resize(elements.size()); @@ -551,36 +576,37 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D size_t output_start = output_columns.size(); size_t skipped_unsupported_columns = 0; std::vector element_names_in_file; - for (size_t i = 0; i < size_t(element.num_children); ++i) + for (size_t i = 0; i < size_t(node.element->num_children); ++i) { - const String & element_name = file_metadata.schema.at(schema_idx).name; - element_names_in_file.push_back(element_name); + const String & element_name = element_names_in_file.emplace_back(useColumnMapperIfNeeded(file_metadata.schema.at(schema_idx))); std::optional idx_in_output_tuple = i - skipped_unsupported_columns; if (lookup_by_name) { idx_in_output_tuple = tuple_type_hint->tryGetPositionByName(element_name, options.format.parquet.case_insensitive_column_matching); if (idx_in_output_tuple.has_value() && elements.at(idx_in_output_tuple.value()) != UINT64_MAX) - throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Parquet tuple {} has multiple elements with name `{}`", name, element_name); + throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Parquet tuple {} has multiple elements with name `{}`", 
node.getNameForLogging(), element_name); } DataTypePtr element_type_hint; - if (type_hint && idx_in_output_tuple.has_value()) + if (node.type_hint && idx_in_output_tuple.has_value()) element_type_hint = tuple_type_hint->getElement(idx_in_output_tuple.value()); - bool element_requested = requested && idx_in_output_tuple.has_value(); + const bool element_requested = node.requested && idx_in_output_tuple.has_value(); - SchemaContext child_context = SchemaContext::None; - if (schema_context == SchemaContext::MapTuple && idx_in_output_tuple == 0) - child_context = SchemaContext::MapKey; + TraversalNode subnode = node.prepareToRecurse(SchemaContext::None, element_type_hint); + subnode.requested = element_requested; + if (node.schema_context == SchemaContext::MapTuple && idx_in_output_tuple == 0) + subnode.schema_context = SchemaContext::MapKey; - auto element_idx = processSubtree(name, element_requested, element_type_hint, child_context); + processSubtree(subnode); + auto element_idx = subnode.output_idx; if (element_requested) { if (!element_idx.has_value()) { - if (type_hint || schema_context == SchemaContext::MapTuple) + if (node.type_hint || node.schema_context == SchemaContext::MapTuple) { /// If one of the elements is skipped, skip the whole tuple. /// Remove previous elements. @@ -601,7 +627,7 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D elements.at(idx_in_output_tuple.value()) = element_idx.value(); const auto & type = output_columns.at(element_idx.value()).type; - if (type_hint) + if (node.type_hint) { chassert(type->equals(*element_type_hint)); } @@ -613,16 +639,16 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D } } - if (!requested) + if (!node.requested) return; /// Map tuple in parquet has elements: {"key" , "value" }, /// but DataTypeMap requires: {"keys", "values"}. - if (schema_context == SchemaContext::MapTuple) + if (node.schema_context == SchemaContext::MapTuple) names = {"keys", "values"}; DataTypePtr output_type; - if (type_hint) + if (node.type_hint) { chassert(elements.size() == tuple_type_hint->getElements().size()); for (size_t i = 0; i < elements.size(); ++i) @@ -630,24 +656,24 @@ void SchemaConverter::processSubtreeTuple(const String & name, bool requested, D if (elements[i] != UINT64_MAX) continue; if (!options.format.parquet.allow_missing_columns) - throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Requested tuple element {} of column {} was not found in parquet schema ({})", tuple_type_hint->getNameByPosition(i + 1), name, element_names_in_file); + throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "Requested tuple element {} of column {} was not found in parquet schema ({})", tuple_type_hint->getNameByPosition(i + 1), node.getNameForLogging(), element_names_in_file); elements[i] = output_columns.size(); OutputColumnInfo & missing_output = output_columns.emplace_back(); - missing_output.name = name + "." + (tuple_type_hint->hasExplicitNames() ? tuple_type_hint->getNameByPosition(i + 1) : std::to_string(i + 1)); + missing_output.name = node.name + "." + (tuple_type_hint->hasExplicitNames() ? 
tuple_type_hint->getNameByPosition(i + 1) : std::to_string(i + 1)); missing_output.type = tuple_type_hint->getElement(i); missing_output.is_missing_column = true; } - output_type = type_hint; + output_type = node.type_hint; } else { output_type = std::make_shared(types, names); } - output_idx = output_columns.size(); + node.output_idx = output_columns.size(); OutputColumnInfo & output = output_columns.emplace_back(); - output.name = name; + output.name = node.name; output.primitive_start = primitive_start; output.primitive_end = primitive_columns.size(); output.type = std::move(output_type); @@ -790,7 +816,9 @@ void SchemaConverter::processPrimitiveColumn( /// GeoParquet. /// Spec says "Geometry columns MUST be at the root of the schema", but we allow them to be - /// nested in tuples etc, why not. + /// nested in tuples etc, why not. (Though nesting in arrays/maps probably currently wouldn't + /// work because our names omit the wrapper SchemaElement-s. That would be easy to fix by + /// including them in parquet_name.) /// If type hint is String, ignore geoparquet and return raw bytes. if (geo_metadata.has_value() && (!type_hint || !typeid_cast(type_hint.get()))) { @@ -1045,7 +1073,7 @@ void SchemaConverter::processPrimitiveColumn( if (precision > max_precision) throw Exception(ErrorCodes::INCORRECT_DATA, "Parquet decimal type precision or scale is too big ({} digits) for physical type {}", precision, thriftToString(type)); - out_inferred_type = createDecimal(max_precision, scale); + out_inferred_type = createDecimal(precision, scale); size_t output_size = out_inferred_type->getSizeOfValueInMemory(); out_decoder.allow_stats = is_output_type_decimal(output_size, scale); diff --git a/src/Processors/Formats/Impl/Parquet/SchemaConverter.h b/src/Processors/Formats/Impl/Parquet/SchemaConverter.h index eb8bf807b350..e89e7f75d6f0 100644 --- a/src/Processors/Formats/Impl/Parquet/SchemaConverter.h +++ b/src/Processors/Formats/Impl/Parquet/SchemaConverter.h @@ -2,6 +2,13 @@ #include +namespace DB +{ + +class ColumnMapper; + +} + namespace DB::Parquet { @@ -16,6 +23,7 @@ struct SchemaConverter const parq::FileMetaData & file_metadata; const ReadOptions & options; const Block * sample_block; + const ColumnMapper * column_mapper = nullptr; std::vector external_columns; std::vector primitive_columns; @@ -25,6 +33,7 @@ struct SchemaConverter size_t primitive_column_idx = 0; std::vector levels; + /// The key is the parquet column name, without ColumnMapper. std::unordered_map geo_columns; SchemaConverter(const parq::FileMetaData &, const ReadOptions &, const Block *); @@ -49,23 +58,87 @@ struct SchemaConverter ListElement, }; + /// Parameters of a recursive call that traverses a subtree, corresponding to a parquet SchemaElement. + struct TraversalNode + { + /// Assigned by the caller. + SchemaContext schema_context = SchemaContext::None; + + /// These fields are assigned by the caller, then updated by the callee. + /// E.g. name is initially the parent element's name, then the callee appends a path + /// component to it. + /// + /// If there's ColumnMapper, `name` is the mapped name (clickhouse column name), while + /// `parquet_name` is the name according to the parquet schema. + /// If `parquet_name` is nullopt, the clickhouse and parquet names are equal. + String name; + std::optional parquet_name; + DataTypePtr type_hint; + bool requested = false; + + /// These are assigned by the callee. 
+ const parq::SchemaElement * element = nullptr; + std::optional output_idx; // index in output_columns + + const String & getParquetName() const + { + return parquet_name.has_value() ? *parquet_name : name; + } + + String getNameForLogging() const + { + if (parquet_name.has_value() && *parquet_name != name) + return fmt::format("{} (mapped to {})", *parquet_name, name); + return name; + } + + void appendNameComponent(const String & parquet_field_name, std::string_view mapped_field_name) + { + if (!name.empty()) + name += "."; + name += mapped_field_name; + if (parquet_name.has_value() || mapped_field_name != parquet_field_name) + { + if (parquet_name.has_value()) + *parquet_name += "."; + else + parquet_name.emplace(); + *parquet_name += parquet_field_name; + } + } + + TraversalNode prepareToRecurse(SchemaContext schema_context_, DataTypePtr type_hint_) + { + TraversalNode res = *this; + res.schema_context = schema_context_; + res.type_hint = std::move(type_hint_); + res.element = nullptr; + res.output_idx.reset(); + return res; + } + }; + void checkHasColumns(); - std::optional processSubtree(String name, bool requested, DataTypePtr type_hint, SchemaContext); + void processSubtree(TraversalNode & node); /// These functions are used by processSubtree for different kinds of SchemaElement. /// Return true if the schema element was recognized as the corresponding kind, /// even if no output column needs to be produced. - bool processSubtreePrimitive(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx); - bool processSubtreeMap(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx); - bool processSubtreeArrayOuter(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx); - bool processSubtreeArrayInner(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx); - void processSubtreeTuple(const String & name, bool requested, DataTypePtr type_hint, SchemaContext schema_context, const parq::SchemaElement & element, std::optional & output_idx); + bool processSubtreePrimitive(TraversalNode & node); + bool processSubtreeMap(TraversalNode & node); + bool processSubtreeArrayOuter(TraversalNode & node); + bool processSubtreeArrayInner(TraversalNode & node); + void processSubtreeTuple(TraversalNode & node); void processPrimitiveColumn( const parq::SchemaElement & element, DataTypePtr type_hint, PageDecoderInfo & out_decoder, DataTypePtr & out_decoded_type, DataTypePtr & out_inferred_type, std::optional geo_metadata) const; + + /// Returns element.name or a corresponding name from ColumnMapper. + /// For tuple elements, that's just the element name like `x`, not the whole path like `t.x`. 
+ std::string_view useColumnMapperIfNeeded(const parq::SchemaElement & element) const; }; } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 06b69ca71e3f..e8a4c8fdce91 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -3,6 +3,9 @@ #if USE_PARQUET +#include +#include +#include #include #include #include @@ -30,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +45,8 @@ namespace ProfileEvents extern const Event ParquetFetchWaitTimeMicroseconds; extern const Event ParquetReadRowGroups; extern const Event ParquetPrunedRowGroups; + extern const Event ParquetMetaDataCacheHits; + extern const Event ParquetMetaDataCacheMisses; } namespace CurrentMetrics @@ -57,6 +63,16 @@ namespace CurrentMetrics namespace DB { +namespace Setting +{ + extern const SettingsBool input_format_parquet_use_metadata_cache; +} + +namespace ServerSetting +{ + extern const ServerSettingsUInt64 input_format_parquet_metadata_cache_max_size; +} + namespace ErrorCodes { extern const int BAD_ARGUMENTS; @@ -543,6 +559,49 @@ static std::vector getHyperrectangleForRowGroup(const parquet::FileMetaDa return hyperrectangle; } +std::shared_ptr ParquetBlockInputFormat::readMetadataFromFile() +{ + createArrowFileIfNotCreated(); + return parquet::ReadMetaData(arrow_file); +} + +std::shared_ptr ParquetBlockInputFormat::getFileMetaData() +{ + // in-memory cache is not implemented for local file operations, only for remote files + // there is a chance the user sets `input_format_parquet_use_metadata_cache=1` for a local file operation + // and the cache_key won't be set. Therefore, we also need to check for metadata_cache.key + if (!metadata_cache.use_cache || metadata_cache.key.empty()) + { + return readMetadataFromFile(); + } + + auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance()->getOrSet( + metadata_cache.key, + [&]() + { + return readMetadataFromFile(); + } + ); + if (loaded) + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); + else + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheHits); + return parquet_file_metadata; +} + +void ParquetBlockInputFormat::createArrowFileIfNotCreated() +{ + if (arrow_file) + { + return; + } + + // Create arrow file adapter. + // TODO: Make the adapter do prefetching on IO threads, based on the full set of ranges that + // we'll need to read (which we know in advance). Use max_download_threads for that. 
+ arrow_file = asArrowFile(*in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); +} + std::unordered_set getBloomFilterFilteringColumnKeys(const KeyCondition::RPN & rpn) { std::unordered_set column_keys; @@ -656,7 +715,7 @@ void ParquetBlockInputFormat::initializeIfNeeded() if (is_stopped) return; - metadata = parquet::ReadMetaData(arrow_file); + metadata = getFileMetaData(); const bool prefetch_group = io_pool != nullptr; std::shared_ptr schema; @@ -762,6 +821,8 @@ void ParquetBlockInputFormat::initializeIfNeeded() } } + bool has_row_groups_to_read = false; + auto skip_row_group_based_on_filters = [&](int row_group) { if (!format_settings.parquet.filter_push_down && !format_settings.parquet.bloom_filter_push_down) @@ -820,7 +881,20 @@ void ParquetBlockInputFormat::initializeIfNeeded() row_group_batches.back().total_bytes_compressed += row_group_size; auto rows = adaptive_chunk_size(row_group); row_group_batches.back().adaptive_chunk_size = rows ? rows : format_settings.parquet.max_block_size; + + has_row_groups_to_read = true; } + + if (has_row_groups_to_read) + { + createArrowFileIfNotCreated(); + } +} + +void ParquetBlockInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) +{ + metadata_cache.key = key_; + metadata_cache.use_cache = settings[Setting::input_format_parquet_use_metadata_cache]; } void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_batch_idx) @@ -1158,7 +1232,7 @@ Chunk ParquetBlockInputFormat::read() [](size_t sum, const RowGroupBatchState & batch) { return sum + batch.total_rows; }); row_group_batches_completed++; - chunk.getChunkInfos().add(std::make_shared(total_rows_before)); + chunk.getChunkInfos().add(std::make_shared(total_rows_before)); return chunk; } @@ -1208,7 +1282,7 @@ Chunk ParquetBlockInputFormat::read() + std::accumulate(row_group.chunk_sizes.begin(), row_group.chunk_sizes.begin() + chunk.chunk_idx, 0ull); - chunk.chunk.getChunkInfos().add(std::make_shared(total_rows_before)); + chunk.chunk.getChunkInfos().add(std::make_shared(total_rows_before)); return std::move(chunk.chunk); } diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index cf30aa6e36b1..1038e1d35037 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -72,6 +72,8 @@ class ParquetBlockInputFormat : public IInputFormat size_t getApproxBytesReadForChunk() const override { return previous_approx_bytes_read_for_chunk; } + void setStorageRelatedUniqueKey(const Settings & settings, const String & key_) override; + private: Chunk read() override; @@ -90,6 +92,13 @@ class ParquetBlockInputFormat : public IInputFormat void threadFunction(size_t row_group_batch_idx); + void createArrowFileIfNotCreated(); + std::shared_ptr readMetadataFromFile(); + + std::shared_ptr getFileMetaData(); + + inline bool supportPrefetch() const; + // Data layout in the file: // // row group 0 @@ -340,6 +349,13 @@ class ParquetBlockInputFormat : public IInputFormat bool is_initialized = false; std::optional> parquet_names_to_clickhouse; std::optional> clickhouse_names_to_parquet; + struct Cache + { + String key; + bool use_cache = false; + }; + + Cache metadata_cache; }; class ArrowParquetSchemaReader : public ISchemaReader diff --git a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp index 0c86689e8bdc..fc94cedfc00d 
100644 --- a/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp @@ -306,13 +306,17 @@ void ParquetBlockOutputFormat::writeRowGroup(std::vector chunks) else { Chunk concatenated; - while (!chunks.empty()) + for (auto & chunk : chunks) { if (concatenated.empty()) - concatenated.swap(chunks.back()); + { + concatenated.swap(chunk); + } else - concatenated.append(chunks.back()); - chunks.pop_back(); + { + concatenated.append(chunk); + chunk.clear(); // free chunk's buffers so memory is released earlier + } } writeRowGroupInOneThread(std::move(concatenated)); } diff --git a/src/Processors/Formats/Impl/ParquetFileMetaDataCache.cpp b/src/Processors/Formats/Impl/ParquetFileMetaDataCache.cpp new file mode 100644 index 000000000000..da8ad825f505 --- /dev/null +++ b/src/Processors/Formats/Impl/ParquetFileMetaDataCache.cpp @@ -0,0 +1,20 @@ +#include + +#if USE_PARQUET + +namespace DB +{ + +ParquetFileMetaDataCache::ParquetFileMetaDataCache() + : CacheBase(CurrentMetrics::end(), CurrentMetrics::end(), 0) +{} + +ParquetFileMetaDataCache * ParquetFileMetaDataCache::instance() +{ + static ParquetFileMetaDataCache instance; + return &instance; +} + +} + +#endif diff --git a/src/Processors/Formats/Impl/ParquetFileMetaDataCache.h b/src/Processors/Formats/Impl/ParquetFileMetaDataCache.h new file mode 100644 index 000000000000..fb5fc1bb0217 --- /dev/null +++ b/src/Processors/Formats/Impl/ParquetFileMetaDataCache.h @@ -0,0 +1,30 @@ +#pragma once + +#include "config.h" + +#if USE_PARQUET + +namespace parquet +{ + +class FileMetaData; + +} + +#include + +namespace DB +{ + +class ParquetFileMetaDataCache : public CacheBase +{ +public: + static ParquetFileMetaDataCache * instance(); + +private: + ParquetFileMetaDataCache(); +}; + +} + +#endif diff --git a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp index 4e8973c2bc61..1a9ac2cb0379 100644 --- a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp @@ -22,8 +22,17 @@ #include #include #include +#include +#include +#include +namespace ProfileEvents +{ +extern const Event ParquetMetaDataCacheHits; +extern const Event ParquetMetaDataCacheMisses; +} + namespace DB { @@ -32,6 +41,11 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; } +namespace Setting +{ +extern const SettingsBool input_format_parquet_use_metadata_cache; +} + static NamesAndTypesList getHeaderForParquetMetadata() { NamesAndTypesList names_and_types{ @@ -130,10 +144,35 @@ void checkHeader(const Block & header) static std::shared_ptr getFileMetadata( ReadBuffer & in, const FormatSettings & format_settings, - std::atomic & is_stopped) + std::atomic & is_stopped, + ParquetMetadataInputFormat::Cache metadata_cache) { - auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); - return parquet::ReadMetaData(arrow_file); + // in-memory cache is not implemented for local file operations, only for remote files + // there is a chance the user sets `input_format_parquet_use_metadata_cache=1` for a local file operation + // and the cache_key won't be set.
Therefore, we also need to check for metadata_cache.key + if (!metadata_cache.use_cache || metadata_cache.key.empty()) + { + auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); + return parquet::ReadMetaData(arrow_file); + } + + auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance()->getOrSet( + metadata_cache.key, + [&]() + { + auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); + return parquet::ReadMetaData(arrow_file); + } + ); + + if (loaded) + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); + else + ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheHits); + + return parquet_file_metadata; + + } ParquetMetadataInputFormat::ParquetMetadataInputFormat(ReadBuffer & in_, SharedHeader header_, const FormatSettings & format_settings_) @@ -148,7 +187,7 @@ Chunk ParquetMetadataInputFormat::read() if (done) return res; - auto metadata = getFileMetadata(*in, format_settings, is_stopped); + auto metadata = getFileMetadata(*in, format_settings, is_stopped, metadata_cache); const auto & header = getPort().getHeader(); auto names_and_types = getHeaderForParquetMetadata(); @@ -489,6 +528,12 @@ void ParquetMetadataInputFormat::resetParser() done = false; } +void ParquetMetadataInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) +{ + metadata_cache.key = key_; + metadata_cache.use_cache = settings[Setting::input_format_parquet_use_metadata_cache]; +} + ParquetMetadataSchemaReader::ParquetMetadataSchemaReader(ReadBuffer & in_) : ISchemaReader(in_) { diff --git a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h index 81cf7890ee7e..6b667dcc5b1e 100644 --- a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.h @@ -62,6 +62,14 @@ class ParquetMetadataInputFormat : public IInputFormat void resetParser() override; + void setStorageRelatedUniqueKey(const Settings & settings, const String & key_) override; + + struct Cache + { + String key; + bool use_cache = false; + }; + private: Chunk read() override; @@ -78,6 +86,8 @@ class ParquetMetadataInputFormat : public IInputFormat const FormatSettings format_settings; bool done = false; std::atomic is_stopped{0}; + + Cache metadata_cache; }; class ParquetMetadataSchemaReader : public ISchemaReader diff --git a/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.cpp index 5debd8249190..fca6fc38b89d 100644 --- a/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.cpp @@ -37,6 +37,9 @@ ParquetV3BlockInputFormat::ParquetV3BlockInputFormat( { read_options.min_bytes_for_seek = min_bytes_for_seek; read_options.bytes_per_read_task = min_bytes_for_seek * 4; + + if (!format_filter_info) + format_filter_info = std::make_shared(); } void ParquetV3BlockInputFormat::initializeIfNeeded() @@ -65,7 +68,7 @@ void ParquetV3BlockInputFormat::initializeIfNeeded() /// as a signal to disable thread pool altogether, sacrificing the ability to /// use thread pool with 1 thread. We could subtract 1 instead, but then /// by default the thread pool would use `num_cores - 1` threads, also bad. 
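/// Only max_parsing_threads <= 1 now falls back to the manual runner below; preserve_order alone no longer disables the parsing thread pool.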
- if (parser_shared_resources->max_parsing_threads <= 1 || format_settings.parquet.preserve_order) + if (parser_shared_resources->max_parsing_threads <= 1) parser_shared_resources->parsing_runner.initManual(); else parser_shared_resources->parsing_runner.initThreadPool( @@ -98,16 +101,17 @@ Chunk ParquetV3BlockInputFormat::read() auto file_metadata = Parquet::Reader::readFileMetaData(temp_prefetcher); auto chunk = getChunkForCount(size_t(file_metadata.num_rows)); - chunk.getChunkInfos().add(std::make_shared(0)); + chunk.getChunkInfos().add(std::make_shared(0)); reported_count = true; return chunk; } initializeIfNeeded(); - Chunk chunk; - std::tie(chunk, previous_block_missing_values) = reader->read(); - return chunk; + auto res = reader->read(); + previous_block_missing_values = res.block_missing_values; + previous_approx_bytes_read_for_chunk = res.virtual_bytes_read; + return std::move(res.chunk); } const BlockMissingValues * ParquetV3BlockInputFormat::getMissingValues() const diff --git a/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.h b/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.h index e93f9c456882..122aaae174ac 100644 --- a/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetV3BlockInputFormat.h @@ -29,8 +29,7 @@ class ParquetV3BlockInputFormat : public IInputFormat size_t getApproxBytesReadForChunk() const override { - /// TODO [parquet]: - return 0; + return previous_approx_bytes_read_for_chunk; } private: @@ -47,6 +46,7 @@ class ParquetV3BlockInputFormat : public IInputFormat bool reported_count = false; // if need_only_count BlockMissingValues previous_block_missing_values; + size_t previous_approx_bytes_read_for_chunk = 0; void initializeIfNeeded(); }; diff --git a/src/Processors/QueryPlan/ObjectFilterStep.cpp b/src/Processors/QueryPlan/ObjectFilterStep.cpp new file mode 100644 index 000000000000..a635aee729c7 --- /dev/null +++ b/src/Processors/QueryPlan/ObjectFilterStep.cpp @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + +ObjectFilterStep::ObjectFilterStep( + SharedHeader input_header_, + ActionsDAG actions_dag_, + String filter_column_name_) + : actions_dag(std::move(actions_dag_)) + , filter_column_name(std::move(filter_column_name_)) +{ + input_headers.emplace_back(input_header_); + output_header = input_headers.front(); +} + +QueryPipelineBuilderPtr ObjectFilterStep::updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & /* settings */) +{ + return std::move(pipelines.front()); +} + +void ObjectFilterStep::updateOutputHeader() +{ + output_header = input_headers.front(); +} + +void ObjectFilterStep::serialize(Serialization & ctx) const +{ + writeStringBinary(filter_column_name, ctx.out); + + actions_dag.serialize(ctx.out, ctx.registry); +} + +std::unique_ptr ObjectFilterStep::deserialize(Deserialization & ctx) +{ + if (ctx.input_headers.size() != 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "ObjectFilterStep must have one input stream"); + + String filter_column_name; + readStringBinary(filter_column_name, ctx.in); + + ActionsDAG actions_dag = ActionsDAG::deserialize(ctx.in, ctx.registry, ctx.context); + + return std::make_unique(ctx.input_headers.front(), std::move(actions_dag), std::move(filter_column_name)); +} + +void registerObjectFilterStep(QueryPlanStepRegistry & registry) +{ + registry.registerStep("ObjectFilter", ObjectFilterStep::deserialize); +} + +} diff 
--git a/src/Processors/QueryPlan/ObjectFilterStep.h b/src/Processors/QueryPlan/ObjectFilterStep.h new file mode 100644 index 000000000000..ef35d20068ba --- /dev/null +++ b/src/Processors/QueryPlan/ObjectFilterStep.h @@ -0,0 +1,35 @@ +#pragma once +#include +#include + +namespace DB +{ + +/// Implements WHERE operation. +class ObjectFilterStep : public IQueryPlanStep +{ +public: + ObjectFilterStep( + SharedHeader input_header_, + ActionsDAG actions_dag_, + String filter_column_name_); + + String getName() const override { return "ObjectFilter"; } + QueryPipelineBuilderPtr updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings & settings) override; + + const ActionsDAG & getExpression() const { return actions_dag; } + ActionsDAG & getExpression() { return actions_dag; } + const String & getFilterColumnName() const { return filter_column_name; } + + void serialize(Serialization & ctx) const override; + + static std::unique_ptr deserialize(Deserialization & ctx); + +private: + void updateOutputHeader() override; + + ActionsDAG actions_dag; + String filter_column_name; +}; + +} diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyConditionAndLimit.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyConditionAndLimit.cpp index ce36c7bddb43..33408e02df87 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyConditionAndLimit.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrimaryKeyConditionAndLimit.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB::QueryPlanOptimizations { @@ -41,6 +42,10 @@ void optimizePrimaryKeyConditionAndLimit(const Stack & stack) /// So this is likely not needed. continue; } + else if (auto * object_filter_step = typeid_cast(iter->node->step.get())) + { + source_step_with_filter->addFilter(object_filter_step->getExpression().clone(), object_filter_step->getFilterColumnName()); + } else { break; diff --git a/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp b/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp index 517f46fbfc96..283eece53c00 100644 --- a/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp +++ b/src/Processors/QueryPlan/QueryPlanStepRegistry.cpp @@ -50,6 +50,7 @@ void registerFilterStep(QueryPlanStepRegistry & registry); void registerTotalsHavingStep(QueryPlanStepRegistry & registry); void registerExtremesStep(QueryPlanStepRegistry & registry); void registerJoinStep(QueryPlanStepRegistry & registry); +void registerObjectFilterStep(QueryPlanStepRegistry & registry); void registerReadFromTableStep(QueryPlanStepRegistry & registry); void registerReadFromTableFunctionStep(QueryPlanStepRegistry & registry); @@ -75,6 +76,7 @@ void QueryPlanStepRegistry::registerPlanSteps() registerReadFromTableStep(registry); registerReadFromTableFunctionStep(registry); + registerObjectFilterStep(registry); } } diff --git a/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp b/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp index a182b2f84034..34914c39fc2b 100644 --- a/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp +++ b/src/Processors/QueryPlan/ReadFromObjectStorageStep.cpp @@ -97,6 +97,7 @@ void ReadFromObjectStorageStep::initializePipeline(QueryPipelineBuilder & pipeli num_streams = 1; } +// const size_t max_parsing_threads = (distributed_processing || num_streams >= max_threads) ? 
1 : (max_threads / std::max(num_streams, 1ul)); auto parser_shared_resources = std::make_shared(context->getSettingsRef(), num_streams); auto format_filter_info diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 48a412ef781f..70f3b78b8245 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -508,7 +508,8 @@ void ReadFromRemote::addLazyPipe( my_stage = stage, my_storage = storage, add_agg_info, add_totals, add_extremes, async_read, async_query_sending, query_tree = shard.query_tree, planner_context = shard.planner_context, - pushed_down_filters, parallel_marshalling_threads]() mutable + pushed_down_filters, parallel_marshalling_threads, + my_is_remote_function = is_remote_function]() mutable -> QueryPipelineBuilder { auto current_settings = my_context->getSettingsRef(); @@ -603,6 +604,8 @@ void ReadFromRemote::addLazyPipe( {DataTypeUInt32().createColumnConst(1, my_shard.shard_info.shard_num), std::make_shared(), "_shard_num"}}; auto remote_query_executor = std::make_shared( std::move(connections), query_string, header, my_context, my_throttler, my_scalars, my_external_tables, stage_to_use, my_shard.query_plan); + remote_query_executor->setRemoteFunction(my_is_remote_function); + remote_query_executor->setShardCount(my_shard_count); auto pipe = createRemoteSourcePipe( remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending, parallel_marshalling_threads); @@ -693,6 +696,8 @@ void ReadFromRemote::addPipe( priority_func); remote_query_executor->setLogger(log); remote_query_executor->setPoolMode(PoolMode::GET_ONE); + remote_query_executor->setRemoteFunction(is_remote_function); + remote_query_executor->setShardCount(shard_count); if (!table_func_ptr) remote_query_executor->setMainTable(shard.main_table ? 
shard.main_table : main_table); @@ -713,6 +718,8 @@ void ReadFromRemote::addPipe( auto remote_query_executor = std::make_shared( shard.shard_info.pool, query_string, shard.header, context, throttler, scalars, external_tables, stage_to_use, shard.query_plan); remote_query_executor->setLogger(log); + remote_query_executor->setRemoteFunction(is_remote_function); + remote_query_executor->setShardCount(shard_count); if (context->canUseTaskBasedParallelReplicas() || parallel_replicas_disabled) { diff --git a/src/Processors/QueryPlan/ReadFromRemote.h b/src/Processors/QueryPlan/ReadFromRemote.h index fec1549430e1..02e1536e05d1 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.h +++ b/src/Processors/QueryPlan/ReadFromRemote.h @@ -45,6 +45,7 @@ class ReadFromRemote final : public SourceStepWithFilterBase void enableMemoryBoundMerging(); void enforceAggregationInOrder(const SortDescription & sort_description); + void setIsRemoteFunction(bool is_remote_function_ = true) { is_remote_function = is_remote_function_; } bool hasSerializedPlan() const; @@ -62,6 +63,7 @@ class ReadFromRemote final : public SourceStepWithFilterBase UInt32 shard_count; const String cluster_name; std::optional priority_func_factory; + bool is_remote_function = false; Pipes addPipes(const ClusterProxy::SelectStreamFactory::Shards & used_shards, const SharedHeader & out_header); diff --git a/src/Processors/Sources/ConstChunkGenerator.h b/src/Processors/Sources/ConstChunkGenerator.h index 8232bd5f0dff..ca120fc23d23 100644 --- a/src/Processors/Sources/ConstChunkGenerator.h +++ b/src/Processors/Sources/ConstChunkGenerator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -13,7 +14,7 @@ class ConstChunkGenerator : public ISource public: ConstChunkGenerator(SharedHeader header, size_t total_num_rows, size_t max_block_size_) : ISource(std::move(header)) - , remaining_rows(total_num_rows), max_block_size(max_block_size_) + , generated_rows(0), remaining_rows(total_num_rows), max_block_size(max_block_size_) { } @@ -27,10 +28,14 @@ class ConstChunkGenerator : public ISource size_t num_rows = std::min(max_block_size, remaining_rows); remaining_rows -= num_rows; - return cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + auto chunk = cloneConstWithDefault(Chunk{getPort().getHeader().getColumns(), 0}, num_rows); + chunk.getChunkInfos().add(std::make_shared(generated_rows)); + generated_rows += num_rows; + return chunk; } private: + size_t generated_rows; size_t remaining_rows; size_t max_block_size; }; diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index fa4794555ee4..ec705f2b4504 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -52,6 +52,7 @@ namespace Setting extern const SettingsBool use_hedged_requests; extern const SettingsBool push_external_roles_in_interserver_queries; extern const SettingsMilliseconds parallel_replicas_connect_timeout_ms; + extern const SettingsBool allow_retries_in_cluster_requests; } namespace ErrorCodes @@ -82,6 +83,7 @@ RemoteQueryExecutor::RemoteQueryExecutor( , extension(extension_) , priority_func(priority_func_) , read_packet_type_separately(context->canUseParallelReplicasOnInitiator() && !context->getSettingsRef()[Setting::use_hedged_requests]) + , allow_retries_in_cluster_requests(context->getSettingsRef()[Setting::allow_retries_in_cluster_requests]) { if (stage == QueryProcessingStage::QueryPlan && !query_plan) throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Query plan is not passed for QueryPlan processing stage"); @@ -406,7 +408,16 @@ void RemoteQueryExecutor::sendQueryUnlocked(ClientInfo::QueryKind query_kind, As auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings); ClientInfo modified_client_info = context->getClientInfo(); - modified_client_info.query_kind = query_kind; + + /// Multi-shard address patterns such as "remote('1.1.1.{1,2}')" are not supported yet. + if (is_remote_function && (shard_count == 1)) + { + modified_client_info.setInitialQuery(); + modified_client_info.client_name = "ClickHouse server"; + modified_client_info.interface = ClientInfo::Interface::TCP; + } + else + modified_client_info.query_kind = query_kind; if (extension) modified_client_info.collaborate_with_initiator = true; @@ -458,7 +469,8 @@ int RemoteQueryExecutor::sendQueryAsync() read_context = std::make_unique( *this, /*suspend_when_query_sent*/ true, - read_packet_type_separately); + read_packet_type_separately, + allow_retries_in_cluster_requests); /// If query already sent, do nothing. Note that we cannot use sent_query flag here, /// because we can still be in process of sending scalars or external tables. @@ -531,7 +543,8 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::readAsync() read_context = std::make_unique( *this, /*suspend_when_query_sent*/ false, - read_packet_type_separately); + read_packet_type_separately, + allow_retries_in_cluster_requests); recreate_read_context = false; } @@ -655,7 +668,11 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet /// We can actually return it, and the first call to RemoteQueryExecutor::read /// will return earlier. We should consider doing it. if (!packet.block.empty() && (packet.block.rows() > 0)) + { + if (extension && extension->replica_info) + replica_has_processed_data.insert(extension->replica_info->number_of_current_replica); return ReadResult(adaptBlockStructure(packet.block, *header)); + } break; /// If the block is empty - we will receive other packets before EndOfStream.
case Protocol::Server::Exception: @@ -717,6 +734,22 @@ RemoteQueryExecutor::ReadResult RemoteQueryExecutor::processPacket(Packet packet case Protocol::Server::TimezoneUpdate: break; + case Protocol::Server::ConnectionLost: + if (allow_retries_in_cluster_requests) + { + if (extension && extension->task_iterator && extension->task_iterator->supportRerunTask() && extension->replica_info) + { + if (!replica_has_processed_data.contains(extension->replica_info->number_of_current_replica)) + { + finished = true; + extension->task_iterator->rescheduleTasksFromReplica(extension->replica_info->number_of_current_replica); + return ReadResult(Block{}); + } + } + } + packet.exception->rethrow(); + break; + default: got_unknown_packet_from_replica = true; throw Exception( @@ -984,6 +1017,11 @@ void RemoteQueryExecutor::setProfileInfoCallback(ProfileInfoCallback callback) profile_info_callback = std::move(callback); } +bool RemoteQueryExecutor::skipUnavailableShards() const +{ + return context->getSettingsRef()[Setting::skip_unavailable_shards]; +} + bool RemoteQueryExecutor::needToSkipUnavailableShard() const { return context->getSettingsRef()[Setting::skip_unavailable_shards] && (0 == connections->size()); diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index e3fb64bb0f7b..7ef8be9e27cc 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -31,8 +31,22 @@ class RemoteQueryExecutorReadContext; class ParallelReplicasReadingCoordinator; -/// This is the same type as StorageS3Source::IteratorWrapper -using TaskIterator = std::function; +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +}; + +class TaskIterator +{ +public: + virtual ~TaskIterator() = default; + virtual bool supportRerunTask() const { return false; } + virtual void rescheduleTasksFromReplica(size_t /* number_of_current_replica */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method rescheduleTasksFromReplica is not implemented"); + } + virtual ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const = 0; +}; /// This class allows one to launch queries on remote replicas of one shard and get results class RemoteQueryExecutor @@ -210,11 +224,17 @@ class RemoteQueryExecutor void setLogger(LoggerPtr logger) { log = logger; } + void setRemoteFunction(bool is_remote_function_ = true) { is_remote_function = is_remote_function_; } + + void setShardCount(UInt32 shard_count_) { shard_count = shard_count_; } + const Block & getHeader() const { return *header; } const SharedHeader & getSharedHeader() const { return header; } IConnections & getConnections() { return *connections; } + bool skipUnavailableShards() const; + bool needToSkipUnavailableShard() const; bool isReplicaUnavailable() const { return extension && extension->parallel_reading_coordinator && connections->size() == 0; } @@ -304,6 +324,9 @@ class RemoteQueryExecutor bool packet_in_progress = false; #endif + bool is_remote_function = false; + UInt32 shard_count = 0; + /// Parts uuids, collected from remote replicas std::vector duplicated_part_uuids; @@ -316,6 +339,10 @@ class RemoteQueryExecutor const bool read_packet_type_separately = false; + const bool allow_retries_in_cluster_requests = false; + + std::unordered_set replica_has_processed_data; + /// Send all scalars to remote servers void sendScalars(); diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp index 
9090d045daae..bd9c0f4966e4 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.cpp @@ -16,14 +16,19 @@ namespace ErrorCodes { extern const int CANNOT_READ_FROM_SOCKET; extern const int CANNOT_OPEN_FILE; extern const int SOCKET_TIMEOUT; + extern const int ATTEMPT_TO_READ_AFTER_EOF; } RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_) + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_, + bool allow_retries_in_cluster_requests_) : AsyncTaskExecutor(std::make_unique(*this)) , executor(executor_) , suspend_when_query_sent(suspend_when_query_sent_) , read_packet_type_separately(read_packet_type_separately_) + , allow_retries_in_cluster_requests(allow_retries_in_cluster_requests_) { if (-1 == pipe2(pipe_fd, O_NONBLOCK)) throw ErrnoException(ErrorCodes::CANNOT_OPEN_FILE, "Cannot create pipe"); @@ -54,17 +59,48 @@ void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, Sus if (read_context.executor.needToSkipUnavailableShard()) return; - while (true) + try { - read_context.has_read_packet_part = PacketPart::None; - - if (read_context.read_packet_type_separately) + while (true) { - read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); - read_context.has_read_packet_part = PacketPart::Type; + try + { + read_context.has_read_packet_part = PacketPart::None; + + if (read_context.read_packet_type_separately) + { + read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Type; + suspend_callback(); + } + read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); + read_context.has_read_packet_part = PacketPart::Body; + if (read_context.packet.type == Protocol::Server::Data) + read_context.has_data_packets = true; + } + catch (const Exception & e) + { + /// If a cluster node unexpectedly shuts down (kill/segfault/power off/etc.), the socket just closes. + /// If the initiator did not process any data packets from it before, the error can be ignored. + /// Unprocessed tasks will be executed on other nodes.
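+ /// This recovery applies only if no data packet from this replica has been consumed yet and skip_unavailable_shards is enabled; otherwise the exception is rethrown. + /// Any other failure is turned by the outer catch into a synthetic ConnectionLost packet when allow_retries_in_cluster_requests is set, so the executor can reschedule this replica's tasks.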
+ if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + && !read_context.has_data_packets.load() && read_context.executor.skipUnavailableShards()) + { + read_context.has_read_packet_part = PacketPart::None; + } + else + throw; + } + suspend_callback(); } - read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); + } + catch (const Exception &) + { + if (!read_context.allow_retries_in_cluster_requests) + throw; + read_context.packet.type = Protocol::Server::ConnectionLost; + read_context.packet.exception = std::make_unique(getCurrentExceptionMessageAndPattern(true), getCurrentExceptionCode()); read_context.has_read_packet_part = PacketPart::Body; suspend_callback(); } diff --git a/src/QueryPipeline/RemoteQueryExecutorReadContext.h b/src/QueryPipeline/RemoteQueryExecutorReadContext.h index abde6cb93ef3..82bb28f81264 100644 --- a/src/QueryPipeline/RemoteQueryExecutorReadContext.h +++ b/src/QueryPipeline/RemoteQueryExecutorReadContext.h @@ -26,7 +26,10 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor { public: explicit RemoteQueryExecutorReadContext( - RemoteQueryExecutor & executor_, bool suspend_when_query_sent_, bool read_packet_type_separately_); + RemoteQueryExecutor & executor_, + bool suspend_when_query_sent_, + bool read_packet_type_separately_, + bool allow_retries_in_cluster_requests_); ~RemoteQueryExecutorReadContext() override; @@ -85,6 +88,7 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor /// None -> Type -> Body -> None /// None -> Body -> None std::atomic has_read_packet_part = PacketPart::None; + std::atomic_bool has_data_packets = false; Packet packet; RemoteQueryExecutor & executor; @@ -108,6 +112,7 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor bool suspend_when_query_sent = false; bool is_query_sent = false; const bool read_packet_type_separately = false; + const bool allow_retries_in_cluster_requests = false; }; } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index eb5f058927a4..3f1de155e0c9 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Storages/Cache/ObjectStorageListObjectsCache.cpp b/src/Storages/Cache/ObjectStorageListObjectsCache.cpp new file mode 100644 index 000000000000..a7aec57d9161 --- /dev/null +++ b/src/Storages/Cache/ObjectStorageListObjectsCache.cpp @@ -0,0 +1,210 @@ +#include +#include +#include +#include + +namespace ProfileEvents +{ +extern const Event ObjectStorageListObjectsCacheHits; +extern const Event ObjectStorageListObjectsCacheMisses; +extern const Event ObjectStorageListObjectsCacheExactMatchHits; +extern const Event ObjectStorageListObjectsCachePrefixMatchHits; +} + +namespace DB +{ + +template +class ObjectStorageListObjectsCachePolicy : public TTLCachePolicy +{ +public: + using BasePolicy = TTLCachePolicy; + using typename BasePolicy::MappedPtr; + using typename BasePolicy::KeyMapped; + using BasePolicy::cache; + + ObjectStorageListObjectsCachePolicy() + : BasePolicy(CurrentMetrics::end(), CurrentMetrics::end(), std::make_unique()) + { + } + + std::optional getWithKey(const Key & key) override + { + if (const auto it = cache.find(key); it != cache.end()) + { + if (!IsStaleFunction()(it->first)) + { + return std::make_optional({it->first, it->second}); + } + // found a stale entry, remove it but don't return. 
We still want to perform the prefix matching search + BasePolicy::remove(it->first); + } + + if (const auto it = findBestMatchingPrefixAndRemoveExpiredEntries(key); it != cache.end()) + { + return std::make_optional({it->first, it->second}); + } + + return std::nullopt; + } + +private: + auto findBestMatchingPrefixAndRemoveExpiredEntries(Key key) + { + while (!key.prefix.empty()) + { + if (const auto it = cache.find(key); it != cache.end()) + { + if (IsStaleFunction()(it->first)) + { + BasePolicy::remove(it->first); + } + else + { + return it; + } + } + + key.prefix.pop_back(); + } + + return cache.end(); + } +}; + +ObjectStorageListObjectsCache::Key::Key( + const String & storage_description_, + const String & bucket_, + const String & prefix_, + const std::chrono::steady_clock::time_point & expires_at_, + std::optional user_id_) + : storage_description(storage_description_), bucket(bucket_), prefix(prefix_), expires_at(expires_at_), user_id(user_id_) {} + +bool ObjectStorageListObjectsCache::Key::operator==(const Key & other) const +{ + return storage_description == other.storage_description && bucket == other.bucket && prefix == other.prefix; +} + +size_t ObjectStorageListObjectsCache::KeyHasher::operator()(const Key & key) const +{ + std::size_t seed = 0; + + boost::hash_combine(seed, key.storage_description); + boost::hash_combine(seed, key.bucket); + boost::hash_combine(seed, key.prefix); + + return seed; +} + +bool ObjectStorageListObjectsCache::IsStale::operator()(const Key & key) const +{ + return key.expires_at < std::chrono::steady_clock::now(); +} + +size_t ObjectStorageListObjectsCache::WeightFunction::operator()(const Value & value) const +{ + std::size_t weight = 0; + + for (const auto & object : value) + { + const auto object_metadata = object->metadata; + weight += object->relative_path.capacity() + sizeof(object_metadata); + + // variable size + if (object_metadata) + { + weight += object_metadata->etag.capacity(); + weight += object_metadata->attributes.size() * (sizeof(std::string) * 2); + + for (const auto & [k, v] : object_metadata->attributes) + { + weight += k.capacity() + v.capacity(); + } + } + } + + return weight; +} + +ObjectStorageListObjectsCache::ObjectStorageListObjectsCache() + : cache(std::make_unique>()) +{ +} + +void ObjectStorageListObjectsCache::set( + const Key & key, + const std::shared_ptr & value) +{ + auto key_with_ttl = key; + key_with_ttl.expires_at = std::chrono::steady_clock::now() + std::chrono::seconds(ttl_in_seconds); + + cache.set(key_with_ttl, value); +} + +void ObjectStorageListObjectsCache::clear() +{ + cache.clear(); +} + +std::optional ObjectStorageListObjectsCache::get(const Key & key, bool filter_by_prefix) +{ + const auto pair = cache.getWithKey(key); + + if (!pair) + { + ProfileEvents::increment(ProfileEvents::ObjectStorageListObjectsCacheMisses); + return {}; + } + + ProfileEvents::increment(ProfileEvents::ObjectStorageListObjectsCacheHits); + + if (pair->key == key) + { + ProfileEvents::increment(ProfileEvents::ObjectStorageListObjectsCacheExactMatchHits); + return *pair->mapped; + } + + ProfileEvents::increment(ProfileEvents::ObjectStorageListObjectsCachePrefixMatchHits); + + if (!filter_by_prefix) + { + return *pair->mapped; + } + + Value filtered_objects; + + filtered_objects.reserve(pair->mapped->size()); + + for (const auto & object : *pair->mapped) + { + if (object->relative_path.starts_with(key.prefix)) + { + filtered_objects.push_back(object); + } + } + + return filtered_objects; +} + +void 
ObjectStorageListObjectsCache::setMaxSizeInBytes(std::size_t size_in_bytes_) +{ + cache.setMaxSizeInBytes(size_in_bytes_); +} + +void ObjectStorageListObjectsCache::setMaxCount(std::size_t count) +{ + cache.setMaxCount(count); +} + +void ObjectStorageListObjectsCache::setTTL(std::size_t ttl_in_seconds_) +{ + ttl_in_seconds = ttl_in_seconds_; +} + +ObjectStorageListObjectsCache & ObjectStorageListObjectsCache::instance() +{ + static ObjectStorageListObjectsCache instance; + return instance; +} + +} diff --git a/src/Storages/Cache/ObjectStorageListObjectsCache.h b/src/Storages/Cache/ObjectStorageListObjectsCache.h new file mode 100644 index 000000000000..6cb6c3694d93 --- /dev/null +++ b/src/Storages/Cache/ObjectStorageListObjectsCache.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class ObjectStorageListObjectsCache +{ + friend class ObjectStorageListObjectsCacheTest; +public: + ObjectStorageListObjectsCache(const ObjectStorageListObjectsCache &) = delete; + ObjectStorageListObjectsCache(ObjectStorageListObjectsCache &&) noexcept = delete; + + ObjectStorageListObjectsCache& operator=(const ObjectStorageListObjectsCache &) = delete; + ObjectStorageListObjectsCache& operator=(ObjectStorageListObjectsCache &&) noexcept = delete; + + static ObjectStorageListObjectsCache & instance(); + + struct Key + { + Key( + const String & storage_description_, + const String & bucket_, + const String & prefix_, + const std::chrono::steady_clock::time_point & expires_at_ = std::chrono::steady_clock::now(), + std::optional user_id_ = std::nullopt); + + std::string storage_description; + std::string bucket; + std::string prefix; + std::chrono::steady_clock::time_point expires_at; + std::optional user_id; + + bool operator==(const Key & other) const; + }; + + using Value = StorageObjectStorage::ObjectInfos; + struct KeyHasher + { + size_t operator()(const Key & key) const; + }; + + struct IsStale + { + bool operator()(const Key & key) const; + }; + + struct WeightFunction + { + size_t operator()(const Value & value) const; + }; + + using Cache = CacheBase; + + void set( + const Key & key, + const std::shared_ptr & value); + + std::optional get(const Key & key, bool filter_by_prefix = true); + + void clear(); + + void setMaxSizeInBytes(std::size_t size_in_bytes_); + void setMaxCount(std::size_t count); + void setTTL(std::size_t ttl_in_seconds_); + +private: + ObjectStorageListObjectsCache(); + + Cache cache; + size_t ttl_in_seconds {0}; +}; + +} diff --git a/src/Storages/Cache/tests/gtest_object_storage_list_objects_cache.cpp b/src/Storages/Cache/tests/gtest_object_storage_list_objects_cache.cpp new file mode 100644 index 000000000000..3b719d4df3e3 --- /dev/null +++ b/src/Storages/Cache/tests/gtest_object_storage_list_objects_cache.cpp @@ -0,0 +1,160 @@ +#include +#include +#include +#include + +namespace DB +{ + +class ObjectStorageListObjectsCacheTest : public ::testing::Test +{ +protected: + void SetUp() override + { + cache = std::unique_ptr(new ObjectStorageListObjectsCache()); + cache->setTTL(3); + cache->setMaxCount(100); + cache->setMaxSizeInBytes(1000000); + } + + std::unique_ptr cache; + static ObjectStorageListObjectsCache::Key default_key; + + static std::shared_ptr createTestValue(const std::vector& paths) + { + auto value = std::make_shared(); + for (const auto & path : paths) + { + value->push_back(std::make_shared(path)); + } + return value; + } +}; + +ObjectStorageListObjectsCache::Key ObjectStorageListObjectsCacheTest::default_key {"default", 
"test-bucket", "test-prefix/"}; + +TEST_F(ObjectStorageListObjectsCacheTest, BasicSetAndGet) +{ + cache->clear(); + auto value = createTestValue({"test-prefix/file1.txt", "test-prefix/file2.txt"}); + + cache->set(default_key, value); + + auto result = cache->get(default_key).value(); + + ASSERT_EQ(result.size(), 2); + EXPECT_EQ(result[0]->getPath(), "test-prefix/file1.txt"); + EXPECT_EQ(result[1]->getPath(), "test-prefix/file2.txt"); +} + +TEST_F(ObjectStorageListObjectsCacheTest, CacheMiss) +{ + cache->clear(); + + EXPECT_FALSE(cache->get(default_key)); +} + +TEST_F(ObjectStorageListObjectsCacheTest, ClearCache) +{ + cache->clear(); + auto value = createTestValue({"test-prefix/file1.txt", "test-prefix/file2.txt"}); + + cache->set(default_key, value); + cache->clear(); + + EXPECT_FALSE(cache->get(default_key)); +} + +TEST_F(ObjectStorageListObjectsCacheTest, PrefixMatching) +{ + cache->clear(); + + auto short_prefix_key = default_key; + short_prefix_key.prefix = "parent/"; + + auto mid_prefix_key = default_key; + mid_prefix_key.prefix = "parent/child/"; + + auto long_prefix_key = default_key; + long_prefix_key.prefix = "parent/child/grandchild/"; + + auto value = createTestValue( + { + "parent/child/grandchild/file1.txt", + "parent/child/grandchild/file2.txt"}); + + cache->set(mid_prefix_key, value); + + auto result1 = cache->get(mid_prefix_key).value(); + EXPECT_EQ(result1.size(), 2); + + auto result2 = cache->get(long_prefix_key).value(); + EXPECT_EQ(result2.size(), 2); + + EXPECT_FALSE(cache->get(short_prefix_key)); +} + +TEST_F(ObjectStorageListObjectsCacheTest, PrefixFiltering) +{ + cache->clear(); + + auto key_with_short_prefix = default_key; + key_with_short_prefix.prefix = "parent/"; + + auto key_with_mid_prefix = default_key; + key_with_mid_prefix.prefix = "parent/child1/"; + + auto value = createTestValue({ + "parent/file1.txt", + "parent/child1/file2.txt", + "parent/child2/file3.txt" + }); + + cache->set(key_with_short_prefix, value); + + auto result = cache->get(key_with_mid_prefix, true).value(); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->getPath(), "parent/child1/file2.txt"); +} + +TEST_F(ObjectStorageListObjectsCacheTest, TTLExpiration) +{ + cache->clear(); + auto value = createTestValue({"test-prefix/file1.txt"}); + + cache->set(default_key, value); + + // Verify we can get it immediately + auto result1 = cache->get(default_key).value(); + EXPECT_EQ(result1.size(), 1); + + std::this_thread::sleep_for(std::chrono::seconds(4)); + + EXPECT_FALSE(cache->get(default_key)); +} + +TEST_F(ObjectStorageListObjectsCacheTest, BestPrefixMatch) +{ + cache->clear(); + + auto short_prefix_key = default_key; + short_prefix_key.prefix = "a/b/"; + + auto mid_prefix_key = default_key; + mid_prefix_key.prefix = "a/b/c/"; + + auto long_prefix_key = default_key; + long_prefix_key.prefix = "a/b/c/d/"; + + auto short_prefix = createTestValue({"a/b/c/d/file1.txt", "a/b/c/file1.txt", "a/b/file2.txt"}); + auto mid_prefix = createTestValue({"a/b/c/d/file1.txt", "a/b/c/file1.txt"}); + + cache->set(short_prefix_key, short_prefix); + cache->set(mid_prefix_key, mid_prefix); + + // should pick mid_prefix, which has size 2. 
filter_by_prefix=false so we can assert by size + auto result = cache->get(long_prefix_key, false).value(); + EXPECT_EQ(result.size(), 2u); +} + +} diff --git a/src/Storages/ExportReplicatedMergeTreePartitionManifest.h b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h new file mode 100644 index 000000000000..81f61b5b9f12 --- /dev/null +++ b/src/Storages/ExportReplicatedMergeTreePartitionManifest.h @@ -0,0 +1,179 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +struct ExportReplicatedMergeTreePartitionProcessingPartEntry +{ + + enum class Status + { + PENDING, + COMPLETED, + FAILED + }; + + String part_name; + Status status; + size_t retry_count; + String finished_by; + + std::string toJsonString() const + { + Poco::JSON::Object json; + + json.set("part_name", part_name); + json.set("status", String(magic_enum::enum_name(status))); + json.set("retry_count", retry_count); + json.set("finished_by", finished_by); + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + + return oss.str(); + } + + static ExportReplicatedMergeTreePartitionProcessingPartEntry fromJsonString(const std::string & json_string) + { + Poco::JSON::Parser parser; + auto json = parser.parse(json_string).extract(); + chassert(json); + + ExportReplicatedMergeTreePartitionProcessingPartEntry entry; + + entry.part_name = json->getValue("part_name"); + entry.status = magic_enum::enum_cast(json->getValue("status")).value(); + entry.retry_count = json->getValue("retry_count"); + if (json->has("finished_by")) + { + entry.finished_by = json->getValue("finished_by"); + } + return entry; + } +}; + +struct ExportReplicatedMergeTreePartitionProcessedPartEntry +{ + String part_name; + String path_in_destination; + String finished_by; + + std::string toJsonString() const + { + Poco::JSON::Object json; + json.set("part_name", part_name); + json.set("path_in_destination", path_in_destination); + json.set("finished_by", finished_by); + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); + } + + static ExportReplicatedMergeTreePartitionProcessedPartEntry fromJsonString(const std::string & json_string) + { + Poco::JSON::Parser parser; + auto json = parser.parse(json_string).extract(); + chassert(json); + + ExportReplicatedMergeTreePartitionProcessedPartEntry entry; + + entry.part_name = json->getValue("part_name"); + entry.path_in_destination = json->getValue("path_in_destination"); + entry.finished_by = json->getValue("finished_by"); + + return entry; + } +}; + +struct ExportReplicatedMergeTreePartitionManifest +{ + String transaction_id; + String partition_id; + String destination_database; + String destination_table; + String source_replica; + size_t number_of_parts; + std::vector parts; + time_t create_time; + size_t max_retries; + size_t ttl_seconds; + size_t max_threads; + bool parallel_formatting; + bool parquet_parallel_encoding; + MergeTreePartExportManifest::FileAlreadyExistsPolicy file_already_exists_policy; + + std::string toJsonString() const + { + Poco::JSON::Object json; + json.set("transaction_id", transaction_id); + json.set("partition_id", partition_id); + json.set("destination_database", destination_database); + json.set("destination_table", destination_table); + json.set("source_replica", source_replica); + json.set("number_of_parts", number_of_parts); 
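+ /// Part names are written as a plain JSON array; fromJsonString() below reads them back in the same order.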
+ + Poco::JSON::Array::Ptr parts_array = new Poco::JSON::Array(); + for (const auto & part : parts) + parts_array->add(part); + json.set("parts", parts_array); + json.set("parallel_formatting", parallel_formatting); + json.set("max_threads", max_threads); + json.set("parquet_parallel_encoding", parquet_parallel_encoding); + json.set("file_already_exists_policy", String(magic_enum::enum_name(file_already_exists_policy))); + json.set("create_time", create_time); + json.set("max_retries", max_retries); + json.set("ttl_seconds", ttl_seconds); + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + oss.exceptions(std::ios::failbit); + Poco::JSON::Stringifier::stringify(json, oss); + return oss.str(); + } + + static ExportReplicatedMergeTreePartitionManifest fromJsonString(const std::string & json_string) + { + Poco::JSON::Parser parser; + auto json = parser.parse(json_string).extract(); + chassert(json); + + ExportReplicatedMergeTreePartitionManifest manifest; + manifest.transaction_id = json->getValue("transaction_id"); + manifest.partition_id = json->getValue("partition_id"); + manifest.destination_database = json->getValue("destination_database"); + manifest.destination_table = json->getValue("destination_table"); + manifest.source_replica = json->getValue("source_replica"); + manifest.number_of_parts = json->getValue("number_of_parts"); + manifest.max_retries = json->getValue("max_retries"); + auto parts_array = json->getArray("parts"); + for (size_t i = 0; i < parts_array->size(); ++i) + manifest.parts.push_back(parts_array->getElement(static_cast(i))); + + manifest.create_time = json->getValue("create_time"); + manifest.ttl_seconds = json->getValue("ttl_seconds"); + manifest.max_threads = json->getValue("max_threads"); + manifest.parallel_formatting = json->getValue("parallel_formatting"); + manifest.parquet_parallel_encoding = json->getValue("parquet_parallel_encoding"); + + if (json->has("file_already_exists_policy")) + { + const auto file_already_exists_policy = magic_enum::enum_cast(json->getValue("file_already_exists_policy")); + if (file_already_exists_policy) + { + manifest.file_already_exists_policy = file_already_exists_policy.value(); + } + + /// what to do if it's not a valid value? + } + + return manifest; + } +}; + +} diff --git a/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h new file mode 100644 index 000000000000..76674bfc4a92 --- /dev/null +++ b/src/Storages/ExportReplicatedMergeTreePartitionTaskEntry.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include +#include "Core/QualifiedTableName.h" +#include +#include +#include +#include + +namespace DB +{ +struct ExportReplicatedMergeTreePartitionTaskEntry +{ + using DataPartPtr = std::shared_ptr; + ExportReplicatedMergeTreePartitionManifest manifest; + + enum class Status + { + PENDING, + COMPLETED, + FAILED, + KILLED + }; + + /// Allows us to skip completed / failed entries during scheduling + mutable Status status; + + /// References to the parts that should be exported + /// This is used to prevent the parts from being deleted before finishing the export operation + /// It does not mean this replica will export all the parts + /// There is also a chance this replica does not contain a given part and it is totally ok. 
+ std::vector part_references; + + std::string getCompositeKey() const + { + const auto qualified_table_name = QualifiedTableName {manifest.destination_database, manifest.destination_table}; + return manifest.partition_id + "_" + qualified_table_name.getFullName(); + } + + std::string getTransactionId() const + { + return manifest.transaction_id; + } + + /// Get create_time for sorted iteration + time_t getCreateTime() const + { + return manifest.create_time; + } +}; + +struct ExportPartitionTaskEntryTagByCompositeKey {}; +struct ExportPartitionTaskEntryTagByCreateTime {}; +struct ExportPartitionTaskEntryTagByTransactionId {}; + +// Multi-index container for export partition task entries +// - Index 0 (TagByCompositeKey): hashed_unique on composite key for O(1) lookup +// - Index 1 (TagByCreateTime): ordered_non_unique on create_time for sorted iteration +using ExportPartitionTaskEntriesContainer = boost::multi_index_container< + ExportReplicatedMergeTreePartitionTaskEntry, + boost::multi_index::indexed_by< + boost::multi_index::hashed_unique< + boost::multi_index::tag, + boost::multi_index::const_mem_fun + >, + boost::multi_index::ordered_non_unique< + boost::multi_index::tag, + boost::multi_index::const_mem_fun + >, + boost::multi_index::hashed_unique< + boost::multi_index::tag, + boost::multi_index::const_mem_fun + > + > +>; + +} diff --git a/src/Storages/HivePartitioningUtils.cpp b/src/Storages/HivePartitioningUtils.cpp index 27a59aae044c..04fd801681d9 100644 --- a/src/Storages/HivePartitioningUtils.cpp +++ b/src/Storages/HivePartitioningUtils.cpp @@ -196,9 +196,9 @@ HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage( * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path. 
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns */ - if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE) + if (configuration->getPartitionStrategy() && configuration->getPartitionStrategyType() == PartitionStrategyFactory::StrategyType::HIVE) { - hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns(); + hive_partition_columns_to_read_from_file_path = configuration->getPartitionStrategy()->getPartitionColumns(); } else if (context->getSettingsRef()[Setting::use_hive_partitioning]) { @@ -213,7 +213,7 @@ HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage( sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns); - if (configuration->partition_columns_in_data_file) + if (configuration->getPartitionColumnsInDataFile()) { file_columns = columns.getAllPhysical(); } diff --git a/src/Storages/IPartitionStrategy.cpp b/src/Storages/IPartitionStrategy.cpp index 0be6d30f4e7c..0e2f897fb617 100644 --- a/src/Storages/IPartitionStrategy.cpp +++ b/src/Storages/IPartitionStrategy.cpp @@ -264,19 +264,6 @@ ColumnPtr WildcardPartitionStrategy::computePartitionKey(const Chunk & chunk) return block_with_partition_by_expr.getByName(actions_with_column_name.column_name).column; } -std::string WildcardPartitionStrategy::getPathForRead( - const std::string & prefix) -{ - return prefix; -} - -std::string WildcardPartitionStrategy::getPathForWrite( - const std::string & prefix, - const std::string & partition_key) -{ - return PartitionedSink::replaceWildcards(prefix, partition_key); -} - HiveStylePartitionStrategy::HiveStylePartitionStrategy( KeyDescription partition_key_description_, const Block & sample_block_, @@ -296,41 +283,6 @@ HiveStylePartitionStrategy::HiveStylePartitionStrategy( block_without_partition_columns = buildBlockWithoutPartitionColumns(sample_block, partition_columns_name_set); } -std::string HiveStylePartitionStrategy::getPathForRead(const std::string & prefix) -{ - return prefix + "**." + Poco::toLower(file_format); -} - -std::string HiveStylePartitionStrategy::getPathForWrite( - const std::string & prefix, - const std::string & partition_key) -{ - std::string path; - - if (!prefix.empty()) - { - path += prefix; - if (path.back() != '/') - { - path += '/'; - } - } - - /// Not adding '/' because buildExpressionHive() always adds a trailing '/' - path += partition_key; - - /* - * File extension is toLower(format) - * This isn't ideal, but I guess multiple formats can be specified and introduced. - * So I think it is simpler to keep it this way. - * - * Or perhaps implement something like `IInputFormat::getFileExtension()` - */ - path += std::to_string(generateSnowflakeID()) + "." 
+ Poco::toLower(file_format); - - return path; -} - ColumnPtr HiveStylePartitionStrategy::computePartitionKey(const Chunk & chunk) { Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); diff --git a/src/Storages/IPartitionStrategy.h b/src/Storages/IPartitionStrategy.h index bc90d7f03461..606122b4ae71 100644 --- a/src/Storages/IPartitionStrategy.h +++ b/src/Storages/IPartitionStrategy.h @@ -29,8 +29,12 @@ struct IPartitionStrategy virtual ColumnPtr computePartitionKey(const Chunk & chunk) = 0; - virtual std::string getPathForRead(const std::string & prefix) = 0; - virtual std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) = 0; + ColumnPtr computePartitionKey(Block & block) const + { + actions_with_column_name.actions->execute(block); + + return block.getByName(actions_with_column_name.column_name).column; + } virtual ColumnRawPtrs getFormatChunkColumns(const Chunk & chunk) { @@ -53,6 +57,7 @@ struct IPartitionStrategy const KeyDescription partition_key_description; const Block sample_block; ContextPtr context; + PartitionExpressionActionsAndColumnName actions_with_column_name; }; /* @@ -89,11 +94,6 @@ struct WildcardPartitionStrategy : IPartitionStrategy WildcardPartitionStrategy(KeyDescription partition_key_description_, const Block & sample_block_, ContextPtr context_); ColumnPtr computePartitionKey(const Chunk & chunk) override; - std::string getPathForRead(const std::string & prefix) override; - std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) override; - -private: - PartitionExpressionActionsAndColumnName actions_with_column_name; }; /* @@ -111,8 +111,6 @@ struct HiveStylePartitionStrategy : IPartitionStrategy bool partition_columns_in_data_file_); ColumnPtr computePartitionKey(const Chunk & chunk) override; - std::string getPathForRead(const std::string & prefix) override; - std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) override; ColumnRawPtrs getFormatChunkColumns(const Chunk & chunk) override; Block getFormatHeader() override; @@ -121,7 +119,6 @@ struct HiveStylePartitionStrategy : IPartitionStrategy const std::string file_format; const bool partition_columns_in_data_file; std::unordered_set partition_columns_name_set; - PartitionExpressionActionsAndColumnName actions_with_column_name; Block block_without_partition_columns; }; diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 215c832624c1..3766afafe6b9 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -321,6 +321,11 @@ CancellationCode IStorage::killPartMoveToShard(const UUID & /*task_uuid*/) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part moves between shards are not supported by storage {}", getName()); } +CancellationCode IStorage::killExportPartition(const String & /*transaction_id*/) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Export partition is not supported by storage {}", getName()); +} + StorageID IStorage::getStorageID() const { std::lock_guard lock(id_mutex); diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 0fe6668ff8a2..15938d9f3c22 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -68,6 +68,9 @@ class RestorerFromBackup; class ConditionSelectivityEstimator; +class IObjectStorage; +using ObjectStoragePtr = std::shared_ptr; + class ActionsDAG; /** Storage. Describes the table. 
Responsible for @@ -409,6 +412,7 @@ class IStorage : public std::enable_shared_from_this, public TypePromo size_t /*max_block_size*/, size_t /*num_streams*/); +public: /// Should we process blocks of data returned by the storage in parallel /// even when the storage returned only one stream of data for reading? /// It is beneficial, for example, when you read from a file quickly, @@ -419,7 +423,6 @@ class IStorage : public std::enable_shared_from_this, public TypePromo /// useless). virtual bool parallelizeOutputAfterReading(ContextPtr) const { return !isSystemStorage(); } -public: /// Other version of read which adds reading step to query plan. /// Default implementation creates ReadFromStorageStep and uses usual read. /// Can be called after `shutdown`, but not after `drop`. @@ -450,6 +453,37 @@ class IStorage : public std::enable_shared_from_this, public TypePromo ContextPtr /*context*/, bool /*async_insert*/); + virtual bool supportsImport() const + { + return false; + } + + /* +It is currently only implemented in StorageObjectStorage. + It is meant to be used to import merge tree data parts into object storage. It is similar to the write API, + but it won't re-partition the data and should allow the filename to be set by the caller. + */ + virtual SinkToStoragePtr import( + const std::string & /* file_name */, + Block & /* block_with_partition_values */, + std::string & /* destination_file_path */, + bool /* overwrite_if_exists */, + const std::optional & /* format_settings */, + ContextPtr /* context */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Import is not implemented for storage {}", getName()); + } + + virtual void commitExportPartitionTransaction( + const String & /* transaction_id */, + const String & /* partition_id */, + const Strings & /* exported_paths */, + ContextPtr /* local_context */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "commitExportPartitionTransaction is not implemented for storage type {}", getName()); + } + + /** Writes the data to a table in distributed manner. * It is supposed that implementation looks into SELECT part of the query and executes distributed * INSERT SELECT if it is possible with current storage as a receiver and query SELECT part as a producer. @@ -558,6 +592,9 @@ class IStorage : public std::enable_shared_from_this, public TypePromo virtual void setMutationCSN(const String & /*mutation_id*/, UInt64 /*csn*/); + /// Cancel a replicated partition export by transaction id. + virtual CancellationCode killExportPartition(const String & /*transaction_id*/); + /// Cancel a part move to shard. 
virtual CancellationCode killPartMoveToShard(const UUID & /*task_uuid*/); diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index 23b73f3d823f..b3a1f7653bd4 100644 --- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -1,5 +1,8 @@ #include +#include +#include + #include #include #include @@ -13,9 +16,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -23,6 +26,16 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -40,6 +53,15 @@ namespace Setting extern const SettingsBool parallel_replicas_local_plan; extern const SettingsString cluster_for_parallel_replicas; extern const SettingsNonZeroUInt64 max_parallel_replicas; + extern const SettingsObjectStorageClusterJoinMode object_storage_cluster_join_mode; + extern const SettingsUInt64 object_storage_max_nodes; + extern const SettingsBool object_storage_remote_initiator; +} + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; } namespace ErrorCodes @@ -57,51 +79,6 @@ IStorageCluster::IStorageCluster( { } -class ReadFromCluster : public SourceStepWithFilter -{ -public: - std::string getName() const override { return "ReadFromCluster"; } - void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; - void applyFilters(ActionDAGNodes added_filter_nodes) override; - - ReadFromCluster( - const Names & column_names_, - const SelectQueryInfo & query_info_, - const StorageSnapshotPtr & storage_snapshot_, - const ContextPtr & context_, - SharedHeader sample_block, - std::shared_ptr storage_, - ASTPtr query_to_send_, - QueryProcessingStage::Enum processed_stage_, - ClusterPtr cluster_, - LoggerPtr log_) - : SourceStepWithFilter( - std::move(sample_block), - column_names_, - query_info_, - storage_snapshot_, - context_) - , storage(std::move(storage_)) - , query_to_send(std::move(query_to_send_)) - , processed_stage(processed_stage_) - , cluster(std::move(cluster_)) - , log(log_) - { - } - -private: - std::shared_ptr storage; - ASTPtr query_to_send; - QueryProcessingStage::Enum processed_stage; - ClusterPtr cluster; - LoggerPtr log; - - std::optional extension; - - void createExtension(const ActionsDAG::Node * predicate); - ContextPtr updateSettings(const Settings & settings); -}; - void ReadFromCluster::applyFilters(ActionDAGNodes added_filter_nodes) { SourceStepWithFilter::applyFilters(std::move(added_filter_nodes)); @@ -125,6 +102,202 @@ void ReadFromCluster::createExtension(const ActionsDAG::Node * predicate) cluster); } +namespace +{ + +/* +Helping class to find in query tree first node of required type +*/ +class SearcherVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + explicit SearcherVisitor(std::unordered_set types_, ContextPtr context) : Base(context), types(types_) {} + + bool needChildVisit(QueryTreeNodePtr &, QueryTreeNodePtr & /*child*/) + { + return !passed_node; + } + + void enterImpl(QueryTreeNodePtr & node) + { + if (passed_node) + return; + + auto node_type = node->getNodeType(); + + if (types.contains(node_type)) + { + passed_node = node; + passed_type = node_type; + } + } + + QueryTreeNodePtr getNode() const { return passed_node; } + std::optional getType() const { return passed_type; } + +private: + std::unordered_set types; + QueryTreeNodePtr passed_node; + 
std::optional passed_type; +}; + +/* +Helping class to find all used columns with specific source +*/ +class CollectUsedColumnsForSourceVisitor : public InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + explicit CollectUsedColumnsForSourceVisitor( + QueryTreeNodePtr source_, + ContextPtr context, + bool collect_columns_from_other_sources_ = false) + : Base(context) + , source(source_) + , collect_columns_from_other_sources(collect_columns_from_other_sources_) + {} + + void enterImpl(QueryTreeNodePtr & node) + { + auto node_type = node->getNodeType(); + + if (node_type != QueryTreeNodeType::COLUMN) + return; + + auto & column_node = node->as(); + auto column_source = column_node.getColumnSourceOrNull(); + if (!column_source) + return; + + if ((column_source == source) != collect_columns_from_other_sources) + { + const auto & name = column_node.getColumnName(); + if (!names.count(name)) + { + columns.emplace_back(column_node.getColumn()); + names.insert(name); + } + } + } + + const NamesAndTypes & getColumns() const { return columns; } + +private: + std::unordered_set names; + QueryTreeNodePtr source; + NamesAndTypes columns; + bool collect_columns_from_other_sources; +}; + +}; + +/* +Try to make subquery to send on nodes +Converts + + SELECT s3.c1, s3.c2, t.c3 + FROM + s3Cluster(...) AS s3 + JOIN + localtable as t + ON s3.key == t.key + +to + + SELECT s3.c1, s3.c2, s3.key + FROM + s3Cluster(...) AS s3 +*/ +void IStorageCluster::updateQueryWithJoinToSendIfNeeded( + ASTPtr & query_to_send, + QueryTreeNodePtr query_tree, + const ContextPtr & context) +{ + auto object_storage_cluster_join_mode = context->getSettingsRef()[Setting::object_storage_cluster_join_mode]; + switch (object_storage_cluster_join_mode) + { + case ObjectStorageClusterJoinMode::LOCAL: + { + auto info = getQueryTreeInfo(query_tree, context); + + if (info.has_join || info.has_cross_join || info.has_local_columns_in_where) + { + auto modified_query_tree = query_tree->clone(); + + SearcherVisitor left_table_expression_searcher({QueryTreeNodeType::TABLE, QueryTreeNodeType::TABLE_FUNCTION}, context); + left_table_expression_searcher.visit(modified_query_tree); + auto table_function_node = left_table_expression_searcher.getNode(); + if (!table_function_node) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't find table function node"); + + QueryTreeNodePtr query_tree_distributed; + + auto & query_node = modified_query_tree->as(); + + if (info.has_join) + { + auto join_node = query_node.getJoinTree(); + query_tree_distributed = join_node->as()->getLeftTableExpression()->clone(); + } + else if (info.has_cross_join) + { + SearcherVisitor join_searcher({QueryTreeNodeType::CROSS_JOIN}, context); + join_searcher.visit(modified_query_tree); + auto cross_join_node = join_searcher.getNode(); + if (!cross_join_node) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't find CROSS JOIN node"); + // CrossJoinNode contains vector of nodes. 0 is left expression, always exists. 
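+                    // Only this leftmost expression (the object storage table function) is kept:
+                    // it later replaces the whole join tree, so cluster nodes receive a plain
+                    // SELECT over the table function without the locally joined tables.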
+ query_tree_distributed = cross_join_node->as()->getTableExpressions()[0]->clone(); + } + + // Find add used columns from table function to make proper projection list + // Need to do before changing WHERE condition + CollectUsedColumnsForSourceVisitor collector(table_function_node, context); + collector.visit(modified_query_tree); + const auto & columns = collector.getColumns(); + + query_node.resolveProjectionColumns(columns); + auto column_nodes_to_select = std::make_shared(); + column_nodes_to_select->getNodes().reserve(columns.size()); + for (auto & column : columns) + column_nodes_to_select->getNodes().emplace_back(std::make_shared(column, table_function_node)); + query_node.getProjectionNode() = column_nodes_to_select; + + if (info.has_local_columns_in_where) + { + if (query_node.getPrewhere()) + removeExpressionsThatDoNotDependOnTableIdentifiers(query_node.getPrewhere(), table_function_node, context); + if (query_node.getWhere()) + removeExpressionsThatDoNotDependOnTableIdentifiers(query_node.getWhere(), table_function_node, context); + } + + query_node.getOrderByNode() = std::make_shared(); + query_node.getGroupByNode() = std::make_shared(); + + if (query_tree_distributed) + { + // Left only table function to send on cluster nodes + modified_query_tree = modified_query_tree->cloneAndReplace(query_node.getJoinTree(), query_tree_distributed); + } + + query_to_send = queryNodeToDistributedSelectQuery(modified_query_tree); + } + + return; + } + case ObjectStorageClusterJoinMode::GLOBAL: + // TODO + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "`Global` mode for `object_storage_cluster_join_mode` setting is unimplemented for now"); + case ObjectStorageClusterJoinMode::ALLOW: // Do nothing special + return; + } +} + /// The code executes on initiator void IStorageCluster::read( QueryPlan & query_plan, @@ -133,34 +306,58 @@ void IStorageCluster::read( SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, - size_t /*max_block_size*/, - size_t /*num_streams*/) + size_t max_block_size, + size_t num_streams) { + auto cluster_name_from_settings = getClusterName(context); + + if (cluster_name_from_settings.empty()) + { + readFallBackToPure(query_plan, column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); + return; + } + + updateConfigurationIfNeeded(context); + storage_snapshot->check(column_names); - updateBeforeRead(context); - auto cluster = getCluster(context); + const auto & settings = context->getSettingsRef(); + + auto cluster = getClusterImpl(context, cluster_name_from_settings, settings[Setting::object_storage_max_nodes]); /// Calculate the header. 
This is significant, because some columns could be thrown away in some cases like query with count(*) SharedHeader sample_block; ASTPtr query_to_send = query_info.query; - if (context->getSettingsRef()[Setting::allow_experimental_analyzer]) + updateQueryWithJoinToSendIfNeeded(query_to_send, query_info.query_tree, context); + + if (settings[Setting::allow_experimental_analyzer]) { - sample_block = InterpreterSelectQueryAnalyzer::getSampleBlock(query_info.query, context, SelectQueryOptions(processed_stage)); + sample_block = InterpreterSelectQueryAnalyzer::getSampleBlock(query_to_send, context, SelectQueryOptions(processed_stage)); } else { - auto interpreter = InterpreterSelectQuery(query_info.query, context, SelectQueryOptions(processed_stage).analyze()); + auto interpreter = InterpreterSelectQuery(query_to_send, context, SelectQueryOptions(processed_stage).analyze()); sample_block = interpreter.getSampleBlock(); query_to_send = interpreter.getQueryInfo().query->clone(); } updateQueryToSendIfNeeded(query_to_send, storage_snapshot, context); + if (settings[Setting::object_storage_remote_initiator]) + { + auto storage_and_context = convertToRemote(cluster, context, cluster_name_from_settings, query_to_send); + auto src_distributed = std::dynamic_pointer_cast(storage_and_context.storage); + auto modified_query_info = query_info; + modified_query_info.cluster = src_distributed->getCluster(); + auto new_storage_snapshot = storage_and_context.storage->getStorageSnapshot(storage_snapshot->metadata, storage_and_context.context); + storage_and_context.storage->read(query_plan, column_names, new_storage_snapshot, modified_query_info, storage_and_context.context, processed_stage, max_block_size, num_streams); + return; + } + RestoreQualifiedNamesVisitor::Data data; - data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query_info.query->as(), 0)); + data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query_to_send->as(), 0)); data.remote_table.database = context->getCurrentDatabase(); data.remote_table.table = getName(); RestoreQualifiedNamesVisitor(data).visit(query_to_send); @@ -186,6 +383,76 @@ void IStorageCluster::read( query_plan.addStep(std::move(reading)); } +IStorageCluster::RemoteCallVariables IStorageCluster::convertToRemote( + ClusterPtr cluster, + ContextPtr context, + const std::string & cluster_name_from_settings, + ASTPtr query_to_send) +{ + auto host_addresses = cluster->getShardsAddresses(); + if (host_addresses.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty cluster {}", cluster_name_from_settings); + + static pcg64 rng(randomSeed()); + size_t shard_num = rng() % host_addresses.size(); + auto shard_addresses = host_addresses[shard_num]; + /// After getClusterImpl each shard must have exactly 1 replica + if (shard_addresses.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of shard {} in cluster {} is not equal 1", shard_num, cluster_name_from_settings); + auto host_name = shard_addresses[0].toString(); + + LOG_INFO(log, "Choose remote initiator '{}'", host_name); + + bool secure = shard_addresses[0].secure == Protocol::Secure::Enable; + std::string remote_function_name = secure ? 
"remoteSecure" : "remote"; + + /// Clean object_storage_remote_initiator setting to avoid infinite remote call + auto new_context = Context::createCopy(context); + new_context->setSetting("object_storage_remote_initiator", false); + + auto * select_query = query_to_send->as(); + if (!select_query) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query"); + + auto query_settings = select_query->settings(); + if (query_settings) + { + auto & settings_ast = query_settings->as(); + if (settings_ast.changes.removeSetting("object_storage_remote_initiator") && settings_ast.changes.empty()) + { + select_query->setExpression(ASTSelectQuery::Expression::SETTINGS, {}); + } + } + + ASTTableExpression * table_expression = extractTableExpressionASTPtrFromSelectQuery(query_to_send); + if (!table_expression) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't find table expression"); + + auto remote_query = makeASTFunction(remote_function_name, std::make_shared(host_name), table_expression->table_function); + + table_expression->table_function = remote_query; + + auto remote_function = TableFunctionFactory::instance().get(remote_query, new_context); + + auto storage = remote_function->execute(query_to_send, new_context, remote_function_name); + + return RemoteCallVariables{storage, new_context}; +} + +SinkToStoragePtr IStorageCluster::write( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + bool async_insert) +{ + auto cluster_name_from_settings = getClusterName(context); + + if (cluster_name_from_settings.empty()) + return writeFallBackToPure(query, metadata_snapshot, context, async_insert); + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is not supported by storage {}", getName()); +} + void ReadFromCluster::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { const Scalars & scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{}; @@ -254,9 +521,62 @@ void ReadFromCluster::initializePipeline(QueryPipelineBuilder & pipeline, const pipeline.init(std::move(pipe)); } +IStorageCluster::QueryTreeInfo IStorageCluster::getQueryTreeInfo(QueryTreeNodePtr query_tree, ContextPtr context) +{ + QueryTreeInfo info; + + SearcherVisitor join_searcher({QueryTreeNodeType::JOIN, QueryTreeNodeType::CROSS_JOIN}, context); + join_searcher.visit(query_tree); + + if (join_searcher.getNode()) + { + if (join_searcher.getType() == QueryTreeNodeType::JOIN) + info.has_join = true; + else + info.has_cross_join = true; + } + + SearcherVisitor left_table_expression_searcher({QueryTreeNodeType::TABLE, QueryTreeNodeType::TABLE_FUNCTION}, context); + left_table_expression_searcher.visit(query_tree); + auto table_function_node = left_table_expression_searcher.getNode(); + if (!table_function_node) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't find table or table function node"); + + auto & query_node = query_tree->as(); + if (query_node.hasWhere() || query_node.hasPrewhere()) + { + CollectUsedColumnsForSourceVisitor collector_where(table_function_node, context, true); + if (query_node.hasPrewhere()) + collector_where.visit(query_node.getPrewhere()); + if (query_node.hasWhere()) + collector_where.visit(query_node.getWhere()); + + // SELECT x FROM datalake.table WHERE x IN local.table. + // Need to modify 'WHERE' on remote node if it contains columns from other sources + // because remote node might not have those sources. 
+ if (!collector_where.getColumns().empty()) + info.has_local_columns_in_where = true; + } + + return info; +} + QueryProcessingStage::Enum IStorageCluster::getQueryProcessingStage( - ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, SelectQueryInfo &) const + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr &, SelectQueryInfo & query_info) const { + auto object_storage_cluster_join_mode = context->getSettingsRef()[Setting::object_storage_cluster_join_mode]; + + if (object_storage_cluster_join_mode != ObjectStorageClusterJoinMode::ALLOW) + { + if (!context->getSettingsRef()[Setting::allow_experimental_analyzer]) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "object_storage_cluster_join_mode!='allow' is not supported without allow_experimental_analyzer=true"); + + auto info = getQueryTreeInfo(query_info.query_tree, context); + if (info.has_join || info.has_cross_join /*|| info.has_local_columns_in_where*/) + return QueryProcessingStage::Enum::FetchColumns; + } + /// Initiator executes query on remote node. if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) if (to_stage >= QueryProcessingStage::Enum::WithMergeableState) @@ -278,9 +598,9 @@ ContextPtr ReadFromCluster::updateSettings(const Settings & settings) return new_context; } -ClusterPtr IStorageCluster::getCluster(ContextPtr context) const +ClusterPtr IStorageCluster::getClusterImpl(ContextPtr context, const String & cluster_name_, size_t max_hosts) { - return context->getCluster(cluster_name)->getClusterWithReplicasAsShards(context->getSettingsRef()); + return context->getCluster(cluster_name_)->getClusterWithReplicasAsShards(context->getSettingsRef(), /* max_replicas_from_shard */ 0, max_hosts); } } diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index a4b63adad48b..b9f739ccad34 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -30,10 +31,16 @@ class IStorageCluster : public IStorage SelectQueryInfo & query_info, ContextPtr context, QueryProcessingStage::Enum processed_stage, - size_t /*max_block_size*/, - size_t /*num_streams*/) override; + size_t max_block_size, + size_t num_streams) override; - ClusterPtr getCluster(ContextPtr context) const; + SinkToStoragePtr write( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + bool async_insert) override; + + ClusterPtr getCluster(ContextPtr context) const { return getClusterImpl(context, cluster_name); } /// Query is needed for pruning by virtual columns (_file, _path) virtual RemoteQueryExecutor::Extension getTaskIteratorExtension( @@ -49,14 +56,109 @@ class IStorageCluster : public IStorage bool supportsOptimizationToSubcolumns() const override { return false; } bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override { return true; } + const String & getOriginalClusterName() const { return cluster_name; } + virtual String getClusterName(ContextPtr /* context */) const { return getOriginalClusterName(); } + protected: - virtual void updateBeforeRead(const ContextPtr &) {} virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} + void updateQueryWithJoinToSendIfNeeded(ASTPtr & query_to_send, QueryTreeNodePtr query_tree, const ContextPtr & context); + + struct RemoteCallVariables + { + StoragePtr 
storage; + ContextPtr context; + }; + + RemoteCallVariables convertToRemote( + ClusterPtr cluster, + ContextPtr context, + const std::string & cluster_name_from_settings, + ASTPtr query_to_send); + + virtual void readFallBackToPure( + QueryPlan & /* query_plan */, + const Names & /* column_names */, + const StorageSnapshotPtr & /* storage_snapshot */, + SelectQueryInfo & /* query_info */, + ContextPtr /* context */, + QueryProcessingStage::Enum /* processed_stage */, + size_t /* max_block_size */, + size_t /* num_streams */) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method readFallBackToPure is not supported by storage {}", getName()); + } + + virtual SinkToStoragePtr writeFallBackToPure( + const ASTPtr & /*query*/, + const StorageMetadataPtr & /*metadata_snapshot*/, + ContextPtr /*context*/, + bool /*async_insert*/) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method writeFallBackToPure is not supported by storage {}", getName()); + } + + virtual void updateConfigurationIfNeeded(ContextPtr /* context */) {} private: + static ClusterPtr getClusterImpl(ContextPtr context, const String & cluster_name_, size_t max_hosts = 0); + LoggerPtr log; String cluster_name; + + struct QueryTreeInfo + { + bool has_join = false; + bool has_cross_join = false; + bool has_local_columns_in_where = false; + }; + + static QueryTreeInfo getQueryTreeInfo(QueryTreeNodePtr query_tree, ContextPtr context); }; +class ReadFromCluster : public SourceStepWithFilter +{ +public: + std::string getName() const override { return "ReadFromCluster"; } + void initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) override; + void applyFilters(ActionDAGNodes added_filter_nodes) override; + + ReadFromCluster( + const Names & column_names_, + const SelectQueryInfo & query_info_, + const StorageSnapshotPtr & storage_snapshot_, + const ContextPtr & context_, + SharedHeader sample_block, + std::shared_ptr storage_, + ASTPtr query_to_send_, + QueryProcessingStage::Enum processed_stage_, + ClusterPtr cluster_, + LoggerPtr log_) + : SourceStepWithFilter( + std::move(sample_block), + column_names_, + query_info_, + storage_snapshot_, + context_) + , storage(std::move(storage_)) + , query_to_send(std::move(query_to_send_)) + , processed_stage(processed_stage_) + , cluster(std::move(cluster_)) + , log(log_) + { + } + +private: + std::shared_ptr storage; + ASTPtr query_to_send; + QueryProcessingStage::Enum processed_stage; + ClusterPtr cluster; + LoggerPtr log; + + std::optional extension; + + void createExtension(const ActionsDAG::Node * predicate); + ContextPtr updateSettings(const Settings & settings); +}; + } diff --git a/src/Storages/MergeTree/ExportList.cpp b/src/Storages/MergeTree/ExportList.cpp new file mode 100644 index 000000000000..0239f841dc69 --- /dev/null +++ b/src/Storages/MergeTree/ExportList.cpp @@ -0,0 +1,66 @@ +#include + +namespace DB +{ + +ExportsListElement::ExportsListElement( + const StorageID & source_table_id_, + const StorageID & destination_table_id_, + UInt64 part_size_, + const String & part_name_, + const String & target_file_name_, + UInt64 total_rows_to_read_, + UInt64 total_size_bytes_compressed_, + UInt64 total_size_bytes_uncompressed_, + time_t create_time_, + const ContextPtr & context) +: source_table_id(source_table_id_) +, destination_table_id(destination_table_id_) +, part_size(part_size_) +, part_name(part_name_) +, destination_file_path(target_file_name_) +, total_rows_to_read(total_rows_to_read_) +, 
total_size_bytes_compressed(total_size_bytes_compressed_) +, total_size_bytes_uncompressed(total_size_bytes_uncompressed_) +, create_time(create_time_) +{ + thread_group = ThreadGroup::createForMergeMutate(context); +} + +ExportsListElement::~ExportsListElement() +{ + background_memory_tracker.adjustOnBackgroundTaskEnd(&thread_group->memory_tracker); +} + +ExportInfo ExportsListElement::getInfo() const +{ + ExportInfo res; + res.source_database = source_table_id.database_name; + res.source_table = source_table_id.table_name; + res.destination_database = destination_table_id.database_name; + res.destination_table = destination_table_id.table_name; + res.part_name = part_name; + res.destination_file_path = destination_file_path; + res.rows_read = rows_read; + res.total_rows_to_read = total_rows_to_read; + res.total_size_bytes_compressed = total_size_bytes_compressed; + res.total_size_bytes_uncompressed = total_size_bytes_uncompressed; + res.bytes_read_uncompressed = bytes_read_uncompressed; + res.memory_usage = getMemoryUsage(); + res.peak_memory_usage = getPeakMemoryUsage(); + res.create_time = create_time; + res.elapsed = elapsed; + return res; +} + +UInt64 ExportsListElement::getMemoryUsage() const +{ + return thread_group->memory_tracker.get(); +} + +UInt64 ExportsListElement::getPeakMemoryUsage() const +{ + return thread_group->memory_tracker.getPeak(); +} + +} diff --git a/src/Storages/MergeTree/ExportList.h b/src/Storages/MergeTree/ExportList.h new file mode 100644 index 000000000000..3c4daa07737b --- /dev/null +++ b/src/Storages/MergeTree/ExportList.h @@ -0,0 +1,90 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace CurrentMetrics +{ + extern const Metric Export; +} + +namespace DB +{ + +struct ExportInfo +{ + String source_database; + String source_table; + String destination_database; + String destination_table; + String part_name; + String destination_file_path; + UInt64 rows_read; + UInt64 total_rows_to_read; + UInt64 total_size_bytes_compressed; + UInt64 total_size_bytes_uncompressed; + UInt64 bytes_read_uncompressed; + UInt64 memory_usage; + UInt64 peak_memory_usage; + time_t create_time = 0; + Float64 elapsed; +}; + +struct ExportsListElement : private boost::noncopyable +{ + const StorageID source_table_id; + const StorageID destination_table_id; + const UInt64 part_size; + const String part_name; + String destination_file_path; + UInt64 rows_read {0}; + UInt64 total_rows_to_read {0}; + UInt64 total_size_bytes_compressed {0}; + UInt64 total_size_bytes_uncompressed {0}; + UInt64 bytes_read_uncompressed {0}; + time_t create_time {0}; + Float64 elapsed {0}; + + Stopwatch watch; + ThreadGroupPtr thread_group; + + ExportsListElement( + const StorageID & source_table_id_, + const StorageID & destination_table_id_, + UInt64 part_size_, + const String & part_name_, + const String & destination_file_path_, + UInt64 total_rows_to_read_, + UInt64 total_size_bytes_compressed_, + UInt64 total_size_bytes_uncompressed_, + time_t create_time_, + const ContextPtr & context); + + ~ExportsListElement(); + + ExportInfo getInfo() const; + + UInt64 getMemoryUsage() const; + UInt64 getPeakMemoryUsage() const; +}; + + +class ExportsList final : public BackgroundProcessList +{ +private: + using Parent = BackgroundProcessList; + +public: + ExportsList() + : Parent(CurrentMetrics::Export) + {} +}; + +using ExportsListEntry = BackgroundProcessListEntry; + +} diff --git a/src/Storages/MergeTree/ExportPartTask.cpp 
b/src/Storages/MergeTree/ExportPartTask.cpp new file mode 100644 index 000000000000..a43c45d0edaf --- /dev/null +++ b/src/Storages/MergeTree/ExportPartTask.cpp @@ -0,0 +1,290 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event PartsExportDuplicated; + extern const Event PartsExportFailures; + extern const Event PartsExports; + extern const Event PartsExportTotalMilliseconds; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_TABLE; + extern const int FILE_ALREADY_EXISTS; + extern const int LOGICAL_ERROR; + extern const int QUERY_WAS_CANCELLED; +} + +namespace Setting +{ + extern const SettingsUInt64 min_bytes_to_use_direct_io; +} + +ExportPartTask::ExportPartTask(MergeTreeData & storage_, const MergeTreePartExportManifest & manifest_, ContextPtr context_) + : storage(storage_), + manifest(manifest_), + local_context(context_) +{ +} + +bool ExportPartTask::executeStep() +{ + const auto & metadata_snapshot = manifest.metadata_snapshot; + + Names columns_to_read = metadata_snapshot->getColumns().getNamesOfPhysical(); + + MergeTreeSequentialSourceType read_type = MergeTreeSequentialSourceType::Export; + + Block block_with_partition_values; + if (metadata_snapshot->hasPartitionKey()) + { + /// todo arthur do I need to init minmax_idx? + block_with_partition_values = manifest.data_part->minmax_idx->getBlock(storage); + } + + auto destination_storage = DatabaseCatalog::instance().tryGetTable(manifest.destination_storage_id, local_context); + if (!destination_storage) + { + std::lock_guard inner_lock(storage.export_manifests_mutex); + + const auto destination_storage_id_name = manifest.destination_storage_id.getNameForLogs(); + storage.export_manifests.erase(manifest); + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Failed to reconstruct destination storage: {}", destination_storage_id_name); + } + + auto exports_list_entry = storage.getContext()->getExportsList().insert( + getStorageID(), + manifest.destination_storage_id, + manifest.data_part->getBytesOnDisk(), + manifest.data_part->name, + "not_computed_yet", + manifest.data_part->rows_count, + manifest.data_part->getBytesOnDisk(), + manifest.data_part->getBytesUncompressedOnDisk(), + manifest.create_time, + local_context); + + SinkToStoragePtr sink; + + try + { + sink = destination_storage->import( + manifest.data_part->name + "_" + manifest.data_part->checksums.getTotalChecksumHex(), + block_with_partition_values, + (*exports_list_entry)->destination_file_path, + manifest.file_already_exists_policy == MergeTreePartExportManifest::FileAlreadyExistsPolicy::overwrite, + manifest.format_settings, + local_context); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::FILE_ALREADY_EXISTS) + { + ProfileEvents::increment(ProfileEvents::PartsExportDuplicated); + + /// File already exists and the policy is NO_OP, treat it as success. 
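+                /// With the `skip` policy the duplicate is treated as success: the export is written
+                /// to the part log, the manifest entry is erased and the completion callback is
+                /// invoked with success. Any other policy falls through to the failure handling below.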
+ if (manifest.file_already_exists_policy == MergeTreePartExportManifest::FileAlreadyExistsPolicy::skip) + { + storage.writePartLog( + PartLogElement::Type::EXPORT_PART, + {}, + static_cast((*exports_list_entry)->elapsed * 1000000000), + manifest.data_part->name, + manifest.data_part, + {manifest.data_part}, + nullptr, + nullptr, + exports_list_entry.get()); + + std::lock_guard inner_lock(storage.export_manifests_mutex); + storage.export_manifests.erase(manifest); + + ProfileEvents::increment(ProfileEvents::PartsExports); + ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000)); + + if (manifest.completion_callback) + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createSuccess((*exports_list_entry)->destination_file_path)); + return false; + } + } + + tryLogCurrentException(__PRETTY_FUNCTION__); + + ProfileEvents::increment(ProfileEvents::PartsExportFailures); + + std::lock_guard inner_lock(storage.export_manifests_mutex); + storage.export_manifests.erase(manifest); + + if (manifest.completion_callback) + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e)); + return false; + } + + bool apply_deleted_mask = true; + bool read_with_direct_io = local_context->getSettingsRef()[Setting::min_bytes_to_use_direct_io] > manifest.data_part->getBytesOnDisk(); + bool prefetch = false; + + MergeTreeData::IMutationsSnapshot::Params mutations_snapshot_params + { + .metadata_version = metadata_snapshot->getMetadataVersion(), + .min_part_metadata_version = manifest.data_part->getMetadataVersion() + }; + + auto mutations_snapshot = storage.getMutationsSnapshot(mutations_snapshot_params); + auto alter_conversions = MergeTreeData::getAlterConversionsForPart( + manifest.data_part, + mutations_snapshot, + local_context); + + QueryPlan plan_for_part; + + createReadFromPartStep( + read_type, + plan_for_part, + storage, + storage.getStorageSnapshot(metadata_snapshot, local_context), + RangesInDataPart(manifest.data_part), + alter_conversions, + nullptr, + columns_to_read, + nullptr, + apply_deleted_mask, + std::nullopt, + read_with_direct_io, + prefetch, + local_context, + getLogger("ExportPartition")); + + + ThreadGroupSwitcher switcher((*exports_list_entry)->thread_group, ""); + + QueryPlanOptimizationSettings optimization_settings(local_context); + auto pipeline_settings = BuildQueryPipelineSettings(local_context); + auto builder = plan_for_part.buildQueryPipeline(optimization_settings, pipeline_settings); + + builder->setProgressCallback([&exports_list_entry](const Progress & progress) + { + (*exports_list_entry)->bytes_read_uncompressed += progress.read_bytes; + (*exports_list_entry)->rows_read += progress.read_rows; + (*exports_list_entry)->elapsed = (*exports_list_entry)->watch.elapsedSeconds(); + }); + + pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); + + pipeline.complete(sink); + + try + { + CompletedPipelineExecutor exec(pipeline); + + auto is_cancelled_callback = [this]() + { + return isCancelled(); + }; + + exec.setCancelCallback(is_cancelled_callback, 100); + + exec.execute(); + + if (isCancelled()) + { + throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Export part was cancelled"); + } + + std::lock_guard inner_lock(storage.export_manifests_mutex); + storage.writePartLog( + PartLogElement::Type::EXPORT_PART, + {}, + static_cast((*exports_list_entry)->elapsed * 1000000000), + manifest.data_part->name, + manifest.data_part, + 
{manifest.data_part}, + nullptr, + nullptr, + exports_list_entry.get()); + + storage.export_manifests.erase(manifest); + + ProfileEvents::increment(ProfileEvents::PartsExports); + ProfileEvents::increment(ProfileEvents::PartsExportTotalMilliseconds, static_cast((*exports_list_entry)->elapsed * 1000)); + + if (manifest.completion_callback) + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createSuccess((*exports_list_entry)->destination_file_path)); + } + catch (const Exception & e) + { + tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while exporting the part {}. User should retry.", manifest.data_part->name)); + + ProfileEvents::increment(ProfileEvents::PartsExportFailures); + + std::lock_guard inner_lock(storage.export_manifests_mutex); + storage.writePartLog( + PartLogElement::Type::EXPORT_PART, + ExecutionStatus::fromCurrentException("", true), + static_cast((*exports_list_entry)->elapsed * 1000000000), + manifest.data_part->name, + manifest.data_part, + {manifest.data_part}, + nullptr, + nullptr, + exports_list_entry.get()); + + storage.export_manifests.erase(manifest); + + if (manifest.completion_callback) + manifest.completion_callback(MergeTreePartExportManifest::CompletionCallbackResult::createFailure(e)); + + throw; + } + return false; +} + +void ExportPartTask::cancel() noexcept +{ + cancel_requested.store(true); + pipeline.cancel(); +} + +bool ExportPartTask::isCancelled() const +{ + return cancel_requested.load() || storage.parts_mover.moves_blocker.isCancelled(); +} + +void ExportPartTask::onCompleted() +{ +} + +StorageID ExportPartTask::getStorageID() const +{ + return storage.getStorageID(); +} + +Priority ExportPartTask::getPriority() const +{ + return Priority{}; +} + +String ExportPartTask::getQueryId() const +{ + return manifest.transaction_id; +} + +} diff --git a/src/Storages/MergeTree/ExportPartTask.h b/src/Storages/MergeTree/ExportPartTask.h new file mode 100644 index 000000000000..bcec68b2b737 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartTask.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class ExportPartTask : public IExecutableTask +{ +public: + explicit ExportPartTask( + MergeTreeData & storage_, + const MergeTreePartExportManifest & manifest_, + ContextPtr context_); + bool executeStep() override; + void onCompleted() override; + StorageID getStorageID() const override; + Priority getPriority() const override; + String getQueryId() const override; + + void cancel() noexcept override; + +private: + MergeTreeData & storage; + MergeTreePartExportManifest manifest; + ContextPtr local_context; + QueryPipeline pipeline; + std::atomic cancel_requested = false; + + bool isCancelled() const; +}; + +} diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp new file mode 100644 index 000000000000..79b92663b7bf --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.cpp @@ -0,0 +1,310 @@ +#include +#include +#include +#include "Storages/MergeTree/ExportPartitionUtils.h" +#include "Common/logger_useful.h" +#include +#include +#include + +namespace DB +{ +namespace +{ + /* + Remove expired entries and fix non-committed exports that have already exported all parts. + + Return values: + - true: the cleanup was successful, the entry is removed from the entries_by_key container and the function returns true. Proceed to the next entry. 
+ - false: the cleanup was not successful, the entry is not removed from the entries_by_key container and the function returns false. + */ + bool tryCleanup( + const zkutil::ZooKeeperPtr & zk, + const std::string & entry_path, + const LoggerPtr & log, + const ContextPtr & context, + const std::string & key, + const ExportReplicatedMergeTreePartitionManifest & metadata, + const time_t now, + const bool is_pending, + auto & entries_by_key + ) + { + bool has_expired = metadata.create_time < now - static_cast(metadata.ttl_seconds); + + if (has_expired && !is_pending) + { + zk->tryRemoveRecursive(fs::path(entry_path)); + auto it = entries_by_key.find(key); + if (it != entries_by_key.end()) + entries_by_key.erase(it); + LOG_INFO(log, "ExportPartition Manifest Updating Task: Removed {}: expired", key); + + return true; + } + else if (is_pending) + { + std::vector parts_in_processing_or_pending; + if (Coordination::Error::ZOK != zk->tryGetChildren(fs::path(entry_path) / "processing", parts_in_processing_or_pending)) + { + LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to get parts in processing or pending, skipping"); + return false; + } + + if (parts_in_processing_or_pending.empty()) + { + LOG_INFO(log, "ExportPartition Manifest Updating Task: Cleanup found PENDING for {} with all parts exported, try to fix it by committing the export", entry_path); + + const auto destination_storage_id = StorageID(QualifiedTableName {metadata.destination_database, metadata.destination_table}); + const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, context); + if (!destination_storage) + { + LOG_INFO(log, "ExportPartition Manifest Updating Task: Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); + return false; + } + + /// it sounds like a replica exported the last part, but was not able to commit the export. Try to fix it + ExportPartitionUtils::commit(metadata, destination_storage, zk, log, entry_path, context); + + return true; + } + } + + return false; + } +} + +ExportPartitionManifestUpdatingTask::ExportPartitionManifestUpdatingTask(StorageReplicatedMergeTree & storage_) + : storage(storage_) +{ +} + +void ExportPartitionManifestUpdatingTask::poll() +{ + std::lock_guard lock(storage.export_merge_tree_partition_mutex); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Polling for new entries for table {}. 
Current number of entries: {}", storage.getStorageID().getNameForLogs(), storage.export_merge_tree_partition_task_entries_by_key.size()); + + auto zk = storage.getZooKeeper(); + + const std::string exports_path = fs::path(storage.zookeeper_path) / "exports"; + const std::string cleanup_lock_path = fs::path(storage.zookeeper_path) / "exports_cleanup_lock"; + + auto cleanup_lock = zkutil::EphemeralNodeHolder::tryCreate(cleanup_lock_path, *zk, storage.replica_name); + if (cleanup_lock) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Cleanup lock acquired, will remove stale entries"); + } + + Coordination::Stat stat; + const auto children = zk->getChildrenWatch(exports_path, &stat, storage.export_merge_tree_partition_watch_callback); + const std::unordered_set zk_children(children.begin(), children.end()); + + const auto now = time(nullptr); + + auto & entries_by_key = storage.export_merge_tree_partition_task_entries_by_key; + + /// Load new entries + /// If we have the cleanup lock, also remove stale entries from zk and local + /// Upload dangling commit files if any + for (const auto & key : zk_children) + { + const std::string entry_path = fs::path(exports_path) / key; + + std::string metadata_json; + if (!zk->tryGet(fs::path(entry_path) / "metadata.json", metadata_json)) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: missing metadata.json", key); + continue; + } + + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + const auto local_entry = entries_by_key.find(key); + + /// If the zk entry has been replaced with export_merge_tree_partition_force_export, checking only for the export key is not enough + /// we need to make sure it is the same transaction id. If it is not, it needs to be replaced. + bool has_local_entry_and_is_up_to_date = local_entry != entries_by_key.end() + && local_entry->manifest.transaction_id == metadata.transaction_id; + + /// If the entry is up to date and we don't have the cleanup lock, early exit, nothing to be done. 
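+        /// With the cleanup lock we keep iterating even over up-to-date entries so that expired
+        /// or fully exported ones can be removed by tryCleanup() below.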
+ if (!cleanup_lock && has_local_entry_and_is_up_to_date) + continue; + + std::weak_ptr weak_manifest_updater = storage.export_merge_tree_partition_manifest_updater; + + auto status_watch_callback = std::make_shared([weak_manifest_updater, key](const Coordination::WatchResponse &) + { + /// If the table is dropped but the watch is not removed, we need to prevent use after free + /// below code assumes that if manifest updater is still alive, the status handling task is also alive + if (auto manifest_updater = weak_manifest_updater.lock()) + { + manifest_updater->addStatusChange(key); + manifest_updater->storage.export_merge_tree_partition_status_handling_task->schedule(); + } + }); + + std::string status; + if (!zk->tryGetWatch(fs::path(entry_path) / "status", status, nullptr, status_watch_callback)) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: missing status", key); + continue; + } + + bool is_pending = status == "PENDING"; + + /// if we have the cleanup lock, try to cleanup + /// if we successfully cleaned it up, early exit + if (cleanup_lock) + { + bool cleanup_successful = tryCleanup( + zk, + entry_path, + storage.log.load(), + storage.getContext(), + key, + metadata, + now, + is_pending, entries_by_key); + + if (cleanup_successful) + continue; + } + + if (!is_pending) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: status is not PENDING", key); + continue; + } + + if (has_local_entry_and_is_up_to_date) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Skipping {}: already exists", key); + continue; + } + + addTask(metadata, key, entries_by_key); + } + + /// Remove entries that were deleted by someone else + removeStaleEntries(zk_children, entries_by_key); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: finished polling for new entries. Number of entries: {}", entries_by_key.size()); + + storage.export_merge_tree_partition_select_task->schedule(); +} + +void ExportPartitionManifestUpdatingTask::addTask( + const ExportReplicatedMergeTreePartitionManifest & metadata, + const std::string & key, + auto & entries_by_key +) +{ + std::vector part_references; + + for (const auto & part_name : metadata.parts) + { + if (const auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated})) + { + part_references.push_back(part); + } + } + + /// Insert or update entry. The multi_index container automatically maintains both indexes. + auto entry = ExportReplicatedMergeTreePartitionTaskEntry {metadata, ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, std::move(part_references)}; + auto it = entries_by_key.find(key); + if (it != entries_by_key.end()) + entries_by_key.replace(it, entry); + else + entries_by_key.insert(entry); +} + +void ExportPartitionManifestUpdatingTask::removeStaleEntries( + const std::unordered_set & zk_children, + auto & entries_by_key +) +{ + for (auto it = entries_by_key.begin(); it != entries_by_key.end();) + { + const auto & key = it->getCompositeKey(); + if (zk_children.contains(key)) + { + ++it; + continue; + } + + const auto & transaction_id = it->manifest.transaction_id; + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Export task {} was deleted, calling killExportPartition for transaction {}", key, transaction_id); + + try + { + storage.killExportPart(transaction_id); + } + catch (...) 
+ { + tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); + } + + it = entries_by_key.erase(it); + } +} + +void ExportPartitionManifestUpdatingTask::addStatusChange(const std::string & key) +{ + std::lock_guard lock(status_changes_mutex); + status_changes.emplace(key); +} + +void ExportPartitionManifestUpdatingTask::handleStatusChanges() +{ + std::lock_guard lock(status_changes_mutex); + std::lock_guard task_entries_lock(storage.export_merge_tree_partition_mutex); + auto zk = storage.getZooKeeper(); + + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status changes. Number of status changes: {}", status_changes.size()); + + while (!status_changes.empty()) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: handling status change for task {}", status_changes.front()); + const auto key = status_changes.front(); + status_changes.pop(); + + auto it = storage.export_merge_tree_partition_task_entries_by_key.find(key); + if (it == storage.export_merge_tree_partition_task_entries_by_key.end()) + continue; + + /// get new status from zk + std::string new_status_string; + if (!zk->tryGet(fs::path(storage.zookeeper_path) / "exports" / key / "status", new_status_string)) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Failed to get new status for task {}, skipping", key); + continue; + } + + const auto new_status = magic_enum::enum_cast(new_status_string); + if (!new_status) + { + LOG_INFO(storage.log, "ExportPartition Manifest Updating Task: Invalid status {} for task {}, skipping", new_status_string, key); + continue; + } + + LOG_INFO(storage.log, "ExportPartition Manifest Updating task: status changed for task {}. New status: {}", key, magic_enum::enum_name(*new_status).data()); + + /// If status changed to KILLED, cancel local export operations + if (*new_status == ExportReplicatedMergeTreePartitionTaskEntry::Status::KILLED) + { + try + { + storage.killExportPart(it->manifest.transaction_id); + } + catch (...) 
+ { + tryLogCurrentException(storage.log, __PRETTY_FUNCTION__); + } + } + + it->status = *new_status; + } +} + +} diff --git a/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h new file mode 100644 index 000000000000..ea52f679d654 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionManifestUpdatingTask.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include +#include +namespace DB +{ + +class StorageReplicatedMergeTree; +struct ExportReplicatedMergeTreePartitionManifest; + +class ExportPartitionManifestUpdatingTask +{ +public: + ExportPartitionManifestUpdatingTask(StorageReplicatedMergeTree & storage); + + void poll(); + + void handleStatusChanges(); + + void addStatusChange(const std::string & key); + +private: + StorageReplicatedMergeTree & storage; + + void addTask( + const ExportReplicatedMergeTreePartitionManifest & metadata, + const std::string & key, + auto & entries_by_key + ); + + void removeStaleEntries( + const std::unordered_set & zk_children, + auto & entries_by_key + ); + + std::mutex status_changes_mutex; + std::queue status_changes; +}; + +} diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp new file mode 100644 index 000000000000..ab3a8ce361c7 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.cpp @@ -0,0 +1,389 @@ +#include +#include +#include +#include +#include +#include +#include "Storages/MergeTree/ExportPartitionUtils.h" +#include "Storages/MergeTree/MergeTreePartExportManifest.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int QUERY_WAS_CANCELLED; + extern const int LOGICAL_ERROR; +} + +namespace +{ + ContextPtr getContextCopyWithTaskSettings(const ContextPtr & context, const ExportReplicatedMergeTreePartitionManifest & manifest) + { + auto context_copy = Context::createCopy(context); + context_copy->makeQueryContextForExportPart(); + context_copy->setCurrentQueryId(manifest.transaction_id); + context_copy->setSetting("output_format_parallel_formatting", manifest.parallel_formatting); + context_copy->setSetting("output_format_parquet_parallel_encoding", manifest.parquet_parallel_encoding); + context_copy->setSetting("max_threads", manifest.max_threads); + context_copy->setSetting("export_merge_tree_part_file_already_exists_policy", String(magic_enum::enum_name(manifest.file_already_exists_policy))); + return context_copy; + } +} + +ExportPartitionTaskScheduler::ExportPartitionTaskScheduler(StorageReplicatedMergeTree & storage_) + : storage(storage_) +{ +} + +void ExportPartitionTaskScheduler::run() +{ + std::lock_guard lock(storage.export_merge_tree_partition_mutex); + + auto zk = storage.getZooKeeper(); + + // Iterate sorted by create_time + for (auto & entry : storage.export_merge_tree_partition_task_entries_by_create_time) + { + const auto & manifest = entry.manifest; + const auto key = entry.getCompositeKey(); + const auto database = storage.getContext()->resolveDatabase(manifest.destination_database); + const auto & table = manifest.destination_table; + + /// No need to query zk for status if the local one is not PENDING + if (entry.status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... 
Local status is {}", magic_enum::enum_name(entry.status).data()); + continue; + } + + const auto destination_storage_id = StorageID(QualifiedTableName {database, table}); + + const auto destination_storage = DatabaseCatalog::instance().tryGetTable(destination_storage_id, storage.getContext()); + + if (!destination_storage) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to reconstruct destination storage: {}, skipping", destination_storage_id.getNameForLogs()); + continue; + } + + std::string status_in_zk_string; + if (!zk->tryGet(fs::path(storage.zookeeper_path) / "exports" / key / "status", status_in_zk_string)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get status, skipping"); + continue; + } + + const auto status_in_zk = magic_enum::enum_cast(status_in_zk_string); + + if (!status_in_zk) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get status from zk, skipping"); + continue; + } + + if (status_in_zk.value() != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + entry.status = status_in_zk.value(); + LOG_INFO(storage.log, "ExportPartition scheduler task: Skipping... Status from zk is {}", entry.status); + continue; + } + + std::vector parts_in_processing_or_pending; + + if (Coordination::Error::ZOK != zk->tryGetChildren(fs::path(storage.zookeeper_path) / "exports" / key / "processing", parts_in_processing_or_pending)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get parts in processing or pending, skipping"); + continue; + } + + if (parts_in_processing_or_pending.empty()) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: No parts in processing or pending, skipping"); + continue; + } + + std::vector locked_parts; + + if (Coordination::Error::ZOK != zk->tryGetChildren(fs::path(storage.zookeeper_path) / "exports" / key / "locks", locked_parts)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get locked parts, skipping"); + continue; + } + + std::unordered_set locked_parts_set(locked_parts.begin(), locked_parts.end()); + + for (const auto & zk_part_name : parts_in_processing_or_pending) + { + if (locked_parts_set.contains(zk_part_name)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} is locked, skipping", zk_part_name); + continue; + } + + const auto part = storage.getPartIfExists(zk_part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); + if (!part) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} not found locally, skipping", zk_part_name); + continue; + } + + if (Coordination::Error::ZOK != zk->tryCreate(fs::path(storage.zookeeper_path) / "exports" / key / "locks" / zk_part_name, storage.replica_name, zkutil::CreateMode::Ephemeral)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to lock part {}, skipping", zk_part_name); + continue; + } + + try + { + storage.exportPartToTable( + part->name, + destination_storage_id, + manifest.transaction_id, + getContextCopyWithTaskSettings(storage.getContext(), manifest), + [this, key, zk_part_name, manifest, destination_storage] + (MergeTreePartExportManifest::CompletionCallbackResult result) + { + handlePartExportCompletion(key, zk_part_name, manifest, destination_storage, result); + }); + } + catch (const Exception &) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + zk->tryRemove(fs::path(storage.zookeeper_path) / "exports" / key / "locks" / zk_part_name); + /// we should not increment retry_count because the node 
might just be full + } + } + } + + /// maybe we failed to schedule or failed to export, need to retry eventually + storage.export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); +} + +void ExportPartitionTaskScheduler::handlePartExportCompletion( + const std::string & export_key, + const std::string & part_name, + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const MergeTreePartExportManifest::CompletionCallbackResult & result) +{ + const auto export_path = fs::path(storage.zookeeper_path) / "exports" / export_key; + const auto processing_parts_path = export_path / "processing"; + const auto processed_part_path = export_path / "processed" / part_name; + const auto zk = storage.getZooKeeper(); + + if (result.success) + { + handlePartExportSuccess(manifest, destination_storage, processing_parts_path, processed_part_path, part_name, export_path, zk, result.relative_path_in_destination_storage); + } + else + { + handlePartExportFailure(processing_parts_path, part_name, export_path, zk, result.exception, manifest.max_retries); + } +} + +void ExportPartitionTaskScheduler::handlePartExportSuccess( + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const std::filesystem::path & processing_parts_path, + const std::filesystem::path & processed_part_path, + const std::string & part_name, + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk, + const String & relative_path_in_destination_storage +) +{ + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} exported successfully", relative_path_in_destination_storage); + + if (!tryToMovePartToProcessed(export_path, processing_parts_path, processed_part_path, part_name, relative_path_in_destination_storage, zk)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to move part to processed, will not commit export partition"); + return; + } + + LOG_INFO(storage.log, "ExportPartition scheduler task: Marked part export {} as completed", part_name); + + if (!areAllPartsProcessed(export_path, zk)) + { + return; + } + + LOG_INFO(storage.log, "ExportPartition scheduler task: All parts are processed, will try to commit export partition"); + + ExportPartitionUtils::commit(manifest, destination_storage, zk, storage.log.load(), export_path, storage.getContext()); +} + +void ExportPartitionTaskScheduler::handlePartExportFailure( + const std::filesystem::path & processing_parts_path, + const std::string & part_name, + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk, + const std::optional & exception, + size_t max_retries +) +{ + if (!exception) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "ExportPartition scheduler task: No exception provided for error handling. 
Sounds like a bug"); + } + + /// Early exit if the query was cancelled - no need to increment error counts + if (exception->code() == ErrorCodes::QUERY_WAS_CANCELLED) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} export was cancelled, skipping error handling", part_name); + return; + } + + Coordination::Stat locked_by_stat; + std::string locked_by; + + if (!zk->tryGet(export_path / "locks" / part_name, locked_by, &locked_by_stat)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} is not locked by any replica, will not increment error counts", part_name); + return; + } + + if (locked_by != storage.replica_name) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} is locked by another replica, will not increment error counts", part_name); + return; + } + + Coordination::Requests ops; + + const auto processing_part_path = processing_parts_path / part_name; + + std::string processing_part_string; + + if (!zk->tryGet(processing_part_path, processing_part_string)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get processing part, will not increment error counts"); + return; + } + + /// todo arthur could this have been cached? + auto processing_part_entry = ExportReplicatedMergeTreePartitionProcessingPartEntry::fromJsonString(processing_part_string); + + processing_part_entry.retry_count++; + + if (processing_part_entry.retry_count) + { + ops.emplace_back(zkutil::makeRemoveRequest(export_path / "locks" / part_name, locked_by_stat.version)); + ops.emplace_back(zkutil::makeSetRequest(processing_part_path, processing_part_entry.toJsonString(), -1)); + + if (processing_part_entry.retry_count >= max_retries) + { + /// just set status in processing_part_path and finished_by + processing_part_entry.status = ExportReplicatedMergeTreePartitionProcessingPartEntry::Status::FAILED; + processing_part_entry.finished_by = storage.replica_name; + + ops.emplace_back(zkutil::makeSetRequest(export_path / "status", String(magic_enum::enum_name(ExportReplicatedMergeTreePartitionTaskEntry::Status::FAILED)).data(), -1)); + LOG_INFO(storage.log, "ExportPartition scheduler task: Retry count limit exceeded for part {}, will try to fail the entire task", part_name); + } + + std::size_t num_exceptions = 0; + + const auto exceptions_per_replica_path = export_path / "exceptions_per_replica" / storage.replica_name; + const auto count_path = exceptions_per_replica_path / "count"; + const auto last_exception_path = exceptions_per_replica_path / "last_exception"; + + if (zk->exists(exceptions_per_replica_path)) + { + std::string num_exceptions_string; + zk->tryGet(count_path, num_exceptions_string); + num_exceptions = std::stoull(num_exceptions_string.c_str()); + + ops.emplace_back(zkutil::makeSetRequest(last_exception_path / "part", part_name, -1)); + ops.emplace_back(zkutil::makeSetRequest(last_exception_path / "exception", exception->message(), -1)); + } + else + { + ops.emplace_back(zkutil::makeCreateRequest(exceptions_per_replica_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(count_path, "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path / "part", part_name, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(last_exception_path / "exception", exception->message(), zkutil::CreateMode::Persistent)); + } + + 
num_exceptions++; + ops.emplace_back(zkutil::makeSetRequest(count_path, std::to_string(num_exceptions), -1)); + + Coordination::Responses responses; + if (Coordination::Error::ZOK != zk->tryMulti(ops, responses)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: All failure mechanism failed, will not try to update it"); + return; + } + } +} + +bool ExportPartitionTaskScheduler::tryToMovePartToProcessed( + const std::filesystem::path & export_path, + const std::filesystem::path & processing_parts_path, + const std::filesystem::path & processed_part_path, + const std::string & part_name, + const String & relative_path_in_destination_storage, + const zkutil::ZooKeeperPtr & zk +) +{ + Coordination::Stat locked_by_stat; + std::string locked_by; + + if (!zk->tryGet(export_path / "locks" / part_name, locked_by, &locked_by_stat)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} is not locked by any replica, will not commit or set it as completed", part_name); + return false; + } + + /// Is this a good idea? what if the file we just pushed to s3 ends up triggering an exception in the replica that actually locks the part and it does not commit? + /// I guess we should not throw if file already exists for export partition, hard coded. + if (locked_by != storage.replica_name) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Part {} is locked by another replica, will not commit or set it as completed", part_name); + return false; + } + + Coordination::Requests requests; + + ExportReplicatedMergeTreePartitionProcessedPartEntry processed_part_entry; + processed_part_entry.part_name = part_name; + processed_part_entry.path_in_destination = relative_path_in_destination_storage; + processed_part_entry.finished_by = storage.replica_name; + + requests.emplace_back(zkutil::makeRemoveRequest(processing_parts_path / part_name, -1)); + requests.emplace_back(zkutil::makeCreateRequest(processed_part_path, processed_part_entry.toJsonString(), zkutil::CreateMode::Persistent)); + requests.emplace_back(zkutil::makeRemoveRequest(export_path / "locks" / part_name, locked_by_stat.version)); + + Coordination::Responses responses; + if (Coordination::Error::ZOK != zk->tryMulti(requests, responses)) + { + /// todo arthur remember what to do here + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to update export path, skipping"); + return false; + } + + return true; +} + +bool ExportPartitionTaskScheduler::areAllPartsProcessed( + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk) +{ + Strings parts_in_processing_or_pending; + if (Coordination::Error::ZOK != zk->tryGetChildren(export_path / "processing", parts_in_processing_or_pending)) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: Failed to get parts in processing or pending, will not try to commit export partition"); + return false; + } + + if (!parts_in_processing_or_pending.empty()) + { + LOG_INFO(storage.log, "ExportPartition scheduler task: There are still parts in processing or pending, will not try to commit export partition"); + return false; + } + + return true; +} + +} diff --git a/src/Storages/MergeTree/ExportPartitionTaskScheduler.h b/src/Storages/MergeTree/ExportPartitionTaskScheduler.h new file mode 100644 index 000000000000..0045019a4ec7 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionTaskScheduler.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class Exception; +class StorageReplicatedMergeTree; + +struct 
ExportReplicatedMergeTreePartitionManifest; + +/// todo arthur remember to add check(lock, version) when updating stuff because maybe if we believe we have the lock, we might not actually have it +class ExportPartitionTaskScheduler +{ +public: + ExportPartitionTaskScheduler(StorageReplicatedMergeTree & storage); + + void run(); +private: + StorageReplicatedMergeTree & storage; + + /// todo arthur maybe it is invalid to grab the manifst here + void handlePartExportCompletion( + const std::string & export_key, + const std::string & part_name, + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const MergeTreePartExportManifest::CompletionCallbackResult & result); + + void handlePartExportSuccess( + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const std::filesystem::path & processing_parts_path, + const std::filesystem::path & processed_part_path, + const std::string & part_name, + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk, + const String & relative_path_in_destination_storage + ); + + void handlePartExportFailure( + const std::filesystem::path & processing_parts_path, + const std::string & part_name, + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk, + const std::optional & exception, + size_t max_retries); + + bool tryToMovePartToProcessed( + const std::filesystem::path & export_path, + const std::filesystem::path & processing_parts_path, + const std::filesystem::path & processed_part_path, + const std::string & part_name, + const String & relative_path_in_destination_storage, + const zkutil::ZooKeeperPtr & zk + ); + + bool areAllPartsProcessed( + const std::filesystem::path & export_path, + const zkutil::ZooKeeperPtr & zk + ); +}; + +} diff --git a/src/Storages/MergeTree/ExportPartitionUtils.cpp b/src/Storages/MergeTree/ExportPartitionUtils.cpp new file mode 100644 index 000000000000..466eb79e8367 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionUtils.cpp @@ -0,0 +1,95 @@ +#include +#include +#include +#include "Storages/ExportReplicatedMergeTreePartitionManifest.h" +#include "Storages/ExportReplicatedMergeTreePartitionTaskEntry.h" +#include + +namespace DB +{ + +namespace fs = std::filesystem; + +namespace ExportPartitionUtils +{ + /// Collect all the exported paths from the processed parts + /// If multiRead is supported by the keeper implementation, it is done in a single request + /// Otherwise, multiple async requests are sent + std::vector getExportedPaths(const LoggerPtr & log, const zkutil::ZooKeeperPtr & zk, const std::string & export_path) + { + std::vector exported_paths; + + LOG_INFO(log, "ExportPartition: Getting exported paths for {}", export_path); + + const auto processed_parts_path = fs::path(export_path) / "processed"; + + std::vector processed_parts; + if (Coordination::Error::ZOK != zk->tryGetChildren(processed_parts_path, processed_parts)) + { + /// todo arthur do something here + LOG_INFO(log, "ExportPartition: Failed to get parts children, exiting"); + return {}; + } + + std::vector get_paths; + + for (const auto & processed_part : processed_parts) + { + get_paths.emplace_back(processed_parts_path / processed_part); + } + + auto responses = zk->tryGet(get_paths); + + responses.waitForResponses(); + + for (size_t i = 0; i < responses.size(); ++i) + { + if (responses[i].error != Coordination::Error::ZOK) + { + /// todo arthur what to do in this case? 
+ /// It could be that zk is corrupt, in that case we should fail the task + /// but it can also be some temporary network issue? not sure + LOG_INFO(log, "ExportPartition: Failed to get exported path, exiting"); + return {}; + } + + const auto processed_part_entry = ExportReplicatedMergeTreePartitionProcessedPartEntry::fromJsonString(responses[i].data); + + exported_paths.emplace_back(processed_part_entry.path_in_destination); + } + + return exported_paths; + } + + void commit( + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const zkutil::ZooKeeperPtr & zk, + const LoggerPtr & log, + const std::string & entry_path, + const ContextPtr & context) + { + const auto exported_paths = ExportPartitionUtils::getExportedPaths(log, zk, entry_path); + + if (exported_paths.size() != manifest.parts.size()) + { + LOG_INFO(log, "ExportPartition: Skipping {}: exported paths size does not match parts size, this is a BUG", entry_path); + return; + } + + LOG_INFO(log, "ExportPartition: Exported paths size matches parts size, commit the export"); + destination_storage->commitExportPartitionTransaction(manifest.transaction_id, manifest.partition_id, exported_paths, context); + + LOG_INFO(log, "ExportPartition: Committed export, mark as completed"); + if (Coordination::Error::ZOK == zk->trySet(fs::path(entry_path) / "status", String(magic_enum::enum_name(ExportReplicatedMergeTreePartitionTaskEntry::Status::COMPLETED)).data(), -1)) + { + LOG_INFO(log, "ExportPartition: Marked export as completed"); + } + else + { + LOG_INFO(log, "ExportPartition: Failed to mark export as completed, will not try to fix it"); + } + } +} + +} diff --git a/src/Storages/MergeTree/ExportPartitionUtils.h b/src/Storages/MergeTree/ExportPartitionUtils.h new file mode 100644 index 000000000000..40fe04a5bfd3 --- /dev/null +++ b/src/Storages/MergeTree/ExportPartitionUtils.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include +#include +#include "Storages/IStorage.h" + +namespace DB +{ + +struct ExportReplicatedMergeTreePartitionManifest; + +namespace ExportPartitionUtils +{ + std::vector getExportedPaths(const LoggerPtr & log, const zkutil::ZooKeeperPtr & zk, const std::string & export_path); + + void commit( + const ExportReplicatedMergeTreePartitionManifest & manifest, + const StoragePtr & destination_storage, + const zkutil::ZooKeeperPtr & zk, + const LoggerPtr & log, + const std::string & entry_path, + const ContextPtr & context + ); +} + +} diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 85e1e56d0241..a363ce6d16f1 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -279,6 +279,42 @@ String IMergeTreeDataPart::MinMaxIndex::getFileColumnName(const String & column_ return stream_name; } +Block IMergeTreeDataPart::MinMaxIndex::getBlock(const MergeTreeData & data) const +{ + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to get block from uninitialized MinMax index."); + + Block block; + + const auto metadata_snapshot = data.getInMemoryMetadataPtr(); + const auto & partition_key = metadata_snapshot->getPartitionKey(); + + const auto minmax_column_names = data.getMinMaxColumnsNames(partition_key); + const auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); + const auto minmax_idx_size = minmax_column_types.size(); + + for (size_t i = 0; i < minmax_idx_size; ++i) + { + const auto & data_type = 
minmax_column_types[i]; + const auto & column_name = minmax_column_names[i]; + + const auto column = data_type->createColumn(); + + auto range = hyperrectangle.at(i); + range.shrinkToIncludedIfPossible(); + + const auto & min_val = range.left; + const auto & max_val = range.right; + + column->insert(min_val); + column->insert(max_val); + + block.insert(ColumnWithTypeAndName(column->getPtr(), data_type, column_name)); + } + + return block; +} + void IMergeTreeDataPart::incrementStateMetric(MergeTreeDataPartState state_) const { switch (state_) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index dfb086929172..14286d751704 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -362,6 +362,8 @@ class IMergeTreeDataPart : public std::enable_shared_from_this; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d1fea772e267..260508764f6d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,11 @@ #include #include #include +#include "Storages/MergeTree/ExportPartTask.h" +#include +#include +#include +#include #include #include #include @@ -42,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +103,7 @@ #include #include #include +#include #include @@ -114,6 +122,7 @@ #include #include #include +#include #include #include @@ -156,6 +165,10 @@ namespace ProfileEvents extern const Event LoadedDataPartsMicroseconds; extern const Event RestorePartsSkippedFiles; extern const Event RestorePartsSkippedBytes; + extern const Event PartsExports; + extern const Event PartsExportTotalMilliseconds; + extern const Event PartsExportFailures; + extern const Event PartsExportDuplicated; } namespace CurrentMetrics @@ -199,6 +212,11 @@ namespace Setting extern const SettingsUInt64 min_insert_block_size_rows; extern const SettingsUInt64 min_insert_block_size_bytes; extern const SettingsBool apply_patch_parts; + extern const SettingsBool allow_experimental_export_merge_tree_part; + extern const SettingsUInt64 min_bytes_to_use_direct_io; + extern const SettingsMergeTreePartExportFileAlreadyExistsPolicy export_merge_tree_part_file_already_exists_policy; + extern const SettingsBool output_format_parallel_formatting; + extern const SettingsBool output_format_parquet_parallel_encoding; } namespace MergeTreeSetting @@ -316,6 +334,8 @@ namespace ErrorCodes extern const int CANNOT_FORGET_PARTITION; extern const int DATA_TYPE_CANNOT_BE_USED_IN_KEY; extern const int TOO_LARGE_LIGHTWEIGHT_UPDATES; + extern const int UNKNOWN_TABLE; + extern const int FILE_ALREADY_EXISTS; } static void checkSuspiciousIndices(const ASTFunction * index_function) @@ -4490,8 +4510,6 @@ void MergeTreeData::changeSettings( { if (new_settings) { - bool has_storage_policy_changed = false; - const auto & new_changes = new_settings->as().changes; StoragePolicyPtr new_storage_policy = nullptr; @@ -4530,8 +4548,6 @@ void MergeTreeData::changeSettings( disk->createDirectories(fs::path(relative_data_path) / DETACHED_DIR_NAME); } /// FIXME how would that be done while reloading configuration??? 
- - has_storage_policy_changed = true; } } } @@ -4548,9 +4564,6 @@ void MergeTreeData::changeSettings( StorageInMemoryMetadata new_metadata = getInMemoryMetadata(); new_metadata.setSettingsChanges(new_settings); setInMemoryMetadata(new_metadata); - - if (has_storage_policy_changed) - startBackgroundMovesIfNeeded(); } } @@ -6189,6 +6202,98 @@ void MergeTreeData::movePartitionToTable(const PartitionCommand & command, Conte movePartitionToTable(dest_storage, command.partition, query_context); } +void MergeTreeData::exportPartToTable(const PartitionCommand & command, ContextPtr query_context) +{ + if (!query_context->getSettingsRef()[Setting::allow_experimental_export_merge_tree_part]) + { + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Exporting merge tree part is experimental. Set `allow_experimental_export_merge_tree_part` to enable it"); + } + + const auto part_name = command.partition->as().value.safeGet(); + + const auto database_name = query_context->resolveDatabase(command.to_database); + + exportPartToTable(part_name, StorageID{database_name, command.to_table}, generateSnowflakeIDString(), query_context); +} + +void MergeTreeData::exportPartToTable( + const std::string & part_name, + const StorageID & destination_storage_id, + const String & transaction_id, + ContextPtr query_context, + std::function completion_callback) +{ + auto dest_storage = DatabaseCatalog::instance().getTable(destination_storage_id, query_context); + + if (destination_storage_id == this->getStorageID()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Exporting to the same table is not allowed"); + } + + if (!dest_storage->supportsImport()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Destination storage {} does not support MergeTree parts or uses unsupported partitioning", dest_storage->getName()); + + auto query_to_string = [] (const ASTPtr & ast) + { + return ast ? 
ast->formatWithSecretsOneLine() : ""; + }; + + auto source_metadata_ptr = getInMemoryMetadataPtr(); + auto destination_metadata_ptr = dest_storage->getInMemoryMetadataPtr(); + + if (destination_metadata_ptr->getColumns().getAllPhysical().sizeOfDifference(source_metadata_ptr->getColumns().getAllPhysical())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); + + if (query_to_string(source_metadata_ptr->getPartitionKeyAST()) != query_to_string(destination_metadata_ptr->getPartitionKeyAST())) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); + + auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated}); + + if (!part) + throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No such data part '{}' to export in table '{}'", + part_name, getStorageID().getFullTableName()); + + { + const auto format_settings = getFormatSettings(query_context); + MergeTreePartExportManifest manifest( + dest_storage->getStorageID(), + part, + transaction_id, + query_context->getSettingsRef()[Setting::export_merge_tree_part_file_already_exists_policy].value, + format_settings, + source_metadata_ptr, + completion_callback); + + std::lock_guard lock(export_manifests_mutex); + + if (!export_manifests.emplace(std::move(manifest)).second) + { + throw Exception(ErrorCodes::ABORTED, "Data part '{}' is already being exported to table '{}'", + part_name, dest_storage->getStorageID().getFullTableName()); + } + } + + background_moves_assignee.trigger(); +} + +void MergeTreeData::killExportPart(const String & transaction_id) +{ + std::lock_guard lock(export_manifests_mutex); + + std::erase_if(export_manifests, [&](const auto & manifest) + { + if (manifest.transaction_id == transaction_id) + { + if (manifest.task) + manifest.task->cancel(); + + return true; + } + return false; + }); +} + void MergeTreeData::movePartitionToShard(const ASTPtr & /*partition*/, bool /*move_part*/, const String & /*to*/, ContextPtr /*query_context*/) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MOVE PARTITION TO SHARD is not supported by storage {}", getName()); @@ -6240,6 +6345,17 @@ Pipe MergeTreeData::alterPartition( } } break; + case PartitionCommand::EXPORT_PART: + { + exportPartToTable(command, query_context); + break; + } + + case PartitionCommand::EXPORT_PARTITION: + { + exportPartitionToTable(command, query_context); + break; + } case PartitionCommand::DROP_DETACHED_PARTITION: dropDetached(command.partition, command.part, query_context); @@ -8573,6 +8689,32 @@ std::pair MergeTreeData::cloneAn return std::make_pair(dst_data_part, std::move(temporary_directory_lock)); } +std::vector MergeTreeData::getExportsStatus() const +{ + std::lock_guard lock(export_manifests_mutex); + std::vector result; + + auto source_database = getStorageID().database_name; + auto source_table = getStorageID().table_name; + + for (const auto & manifest : export_manifests) + { + MergeTreeExportStatus status; + + status.source_database = source_database; + status.source_table = source_table; + status.destination_database = manifest.destination_storage_id.database_name; + status.destination_table = manifest.destination_storage_id.table_name; + status.create_time = manifest.create_time; + status.part_name = manifest.data_part->name; + + result.emplace_back(std::move(status)); + } + + return result; +} + + bool MergeTreeData::canUseAdaptiveGranularity() const { const auto settings = getSettings(); @@ -8850,7 +8992,8 @@ void MergeTreeData::writePartLog( const 
DataPartPtr & result_part, const DataPartsVector & source_parts, const MergeListEntry * merge_entry, - std::shared_ptr profile_counters) + std::shared_ptr profile_counters, + const ExportsListEntry * exports_entry) try { auto table_id = getStorageID(); @@ -8918,6 +9061,13 @@ try part_log_elem.rows = (*merge_entry)->rows_written; part_log_elem.peak_memory_usage = (*merge_entry)->getMemoryTracker().getPeak(); } + else if (exports_entry) + { + part_log_elem.rows_read = (*exports_entry)->rows_read; + part_log_elem.bytes_read_uncompressed = (*exports_entry)->bytes_read_uncompressed; + part_log_elem.peak_memory_usage = (*exports_entry)->getPeakMemoryUsage(); + part_log_elem.path_on_disk = (*exports_entry)->destination_file_path; + } if (profile_counters) { @@ -8959,21 +9109,51 @@ MergeTreeData::CurrentlyMovingPartsTagger::~CurrentlyMovingPartsTagger() bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & assignee) { - if (parts_mover.moves_blocker.isCancelled()) - return false; + if (!parts_mover.moves_blocker.isCancelled()) + { + auto moving_tagger = selectPartsForMove(); + if (!moving_tagger->parts_to_move.empty()) + { + assignee.scheduleMoveTask(std::make_shared( + [this, moving_tagger] () mutable + { + ReadSettings read_settings = Context::getGlobalContextInstance()->getReadSettings(); + WriteSettings write_settings = Context::getGlobalContextInstance()->getWriteSettings(); + return moveParts(moving_tagger, read_settings, write_settings, /* wait_for_move_if_zero_copy= */ false) == MovePartsOutcome::PartsMoved; + }, moves_assignee_trigger, getStorageID())); + return true; + } + } - auto moving_tagger = selectPartsForMove(); - if (moving_tagger->parts_to_move.empty()) - return false; + std::lock_guard lock(export_manifests_mutex); - assignee.scheduleMoveTask(std::make_shared( - [this, moving_tagger] () mutable + for (auto & manifest : export_manifests) + { + if (manifest.in_progress) { - ReadSettings read_settings = Context::getGlobalContextInstance()->getReadSettings(); - WriteSettings write_settings = Context::getGlobalContextInstance()->getWriteSettings(); - return moveParts(moving_tagger, read_settings, write_settings, /* wait_for_move_if_zero_copy= */ false) == MovePartsOutcome::PartsMoved; - }, moves_assignee_trigger, getStorageID())); - return true; + continue; + } + + auto context_copy = Context::createCopy(getContext()); + context_copy->makeQueryContextForExportPart(); + context_copy->setCurrentQueryId(manifest.transaction_id); + context_copy->setBackgroundOperationTypeForContext(ClientInfo::BackgroundOperationType::EXPORT_PART); + + auto task = std::make_shared(*this, manifest, context_copy); + + manifest.in_progress = assignee.scheduleMoveTask(task); + + if (!manifest.in_progress) + { + continue; + } + + manifest.task = task; + + return true; + } + + return false; } bool MergeTreeData::areBackgroundMovesNeeded() const @@ -9191,6 +9371,10 @@ bool MergeTreeData::canUsePolymorphicParts() const return canUsePolymorphicParts(*getSettings(), unused); } +void MergeTreeData::startBackgroundMoves() +{ + background_moves_assignee.start(); +} void MergeTreeData::checkDropOrRenameCommandDoesntAffectInProgressMutations( const AlterCommand & command, const std::map & unfinished_mutations, ContextPtr local_context) const diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2cd69c086473..521bc7e50279 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -19,6 +19,7 @@ #include #include #include 
+#include #include #include #include @@ -37,6 +38,8 @@ #include #include #include +#include +#include #include #include @@ -979,6 +982,22 @@ class MergeTreeData : public IStorage, public WithMutableContext /// Moves partition to specified Table void movePartitionToTable(const PartitionCommand & command, ContextPtr query_context); + void exportPartToTable(const PartitionCommand & command, ContextPtr query_context); + + void exportPartToTable( + const std::string & part_name, + const StorageID & destination_storage_id, + const String & transaction_id, + ContextPtr query_context, + std::function completion_callback = {}); + + void killExportPart(const String & transaction_id); + + virtual void exportPartitionToTable(const PartitionCommand &, ContextPtr) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "EXPORT PARTITION is not implemented for engine {}", getName()); + } + /// Checks that Partition could be dropped right now /// Otherwise - throws an exception with detailed information. /// We do not use mutex because it is not very important that the size could change during the operation. @@ -1056,6 +1075,7 @@ class MergeTreeData : public IStorage, public WithMutableContext const WriteSettings & write_settings); virtual std::vector getMutationsStatus() const = 0; + std::vector getExportsStatus() const; /// Returns true if table can create new parts with adaptive granularity /// Has additional constraint in replicated version @@ -1241,6 +1261,10 @@ class MergeTreeData : public IStorage, public WithMutableContext /// Mutex for currently_moving_parts mutable std::mutex moving_parts_mutex; + mutable std::mutex export_manifests_mutex; + + std::set export_manifests; + PinnedPartUUIDsPtr getPinnedPartUUIDs() const; /// Schedules background job to like merge/mutate/fetch an executor @@ -1334,6 +1358,7 @@ class MergeTreeData : public IStorage, public WithMutableContext friend class MergeTask; friend class IPartMetadataManager; friend class IMergedBlockOutputStream; // for access to log + friend class ExportPartTask; bool require_part_metadata; @@ -1359,6 +1384,8 @@ class MergeTreeData : public IStorage, public WithMutableContext are_columns_and_secondary_indices_sizes_calculated = false; } + void startBackgroundMoves(); + /// Engine-specific methods BrokenPartCallback broken_part_callback; @@ -1614,7 +1641,8 @@ class MergeTreeData : public IStorage, public WithMutableContext const DataPartPtr & result_part, const DataPartsVector & source_parts, const MergeListEntry * merge_entry, - std::shared_ptr profile_counters); + std::shared_ptr profile_counters, + const ExportsListEntry * exports_entry = nullptr); /// If part is assigned to merge or mutation (possibly replicated) /// Should be overridden by children, because they can have different @@ -1825,8 +1853,6 @@ class MergeTreeData : public IStorage, public WithMutableContext bool canUsePolymorphicParts(const MergeTreeSettings & settings, String & out_reason) const; - virtual void startBackgroundMovesIfNeeded() = 0; - bool allow_nullable_key = false; bool allow_reverse_key = false; diff --git a/src/Storages/MergeTree/MergeTreePartExportManifest.h b/src/Storages/MergeTree/MergeTreePartExportManifest.h new file mode 100644 index 000000000000..533eeb6decdd --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartExportManifest.h @@ -0,0 +1,97 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +class Exception; + +class ExportPartTask; + +struct MergeTreePartExportManifest +{ + using FileAlreadyExistsPolicy = 
MergeTreePartExportFileAlreadyExistsPolicy; + + using DataPartPtr = std::shared_ptr; + + struct CompletionCallbackResult + { + private: + CompletionCallbackResult(bool success_, const String & relative_path_in_destination_storage_, std::optional exception_) + : success(success_), relative_path_in_destination_storage(relative_path_in_destination_storage_), exception(std::move(exception_)) {} + public: + + static CompletionCallbackResult createSuccess(const String & relative_path_in_destination_storage_) + { + return CompletionCallbackResult(true, relative_path_in_destination_storage_, std::nullopt); + } + + static CompletionCallbackResult createFailure(Exception exception_) + { + return CompletionCallbackResult(false, "", std::move(exception_)); + } + + bool success = false; + String relative_path_in_destination_storage; + std::optional exception; + }; + + MergeTreePartExportManifest( + const StorageID & destination_storage_id_, + const DataPartPtr & data_part_, + const String & transaction_id_, + FileAlreadyExistsPolicy file_already_exists_policy_, + const FormatSettings & format_settings_, + const StorageMetadataPtr & metadata_snapshot_, + std::function completion_callback_ = {}) + : destination_storage_id(destination_storage_id_), + data_part(data_part_), + transaction_id(transaction_id_), + file_already_exists_policy(file_already_exists_policy_), + format_settings(format_settings_), + metadata_snapshot(metadata_snapshot_), + completion_callback(completion_callback_), + create_time(time(nullptr)) {} + + StorageID destination_storage_id; + DataPartPtr data_part; + /// Used for killing the export. + String transaction_id; + FileAlreadyExistsPolicy file_already_exists_policy; + FormatSettings format_settings; + + /// Metadata snapshot captured at the time of query validation to prevent race conditions with mutations + /// Otherwise the export could fail if the schema changes between validation and execution + StorageMetadataPtr metadata_snapshot; + + std::function completion_callback; + + time_t create_time; + mutable bool in_progress = false; + mutable std::shared_ptr task = nullptr; + + bool operator<(const MergeTreePartExportManifest & rhs) const + { + // Lexicographic comparison: first compare destination storage, then part name + auto lhs_storage = destination_storage_id.getQualifiedName(); + auto rhs_storage = rhs.destination_storage_id.getQualifiedName(); + + if (lhs_storage != rhs_storage) + return lhs_storage < rhs_storage; + + return data_part->name < rhs.data_part->name; + } + + bool operator==(const MergeTreePartExportManifest & rhs) const + { + return destination_storage_id.getQualifiedName() == rhs.destination_storage_id.getQualifiedName() + && data_part->name == rhs.data_part->name; + } +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartExportStatus.h b/src/Storages/MergeTree/MergeTreePartExportStatus.h new file mode 100644 index 000000000000..e71a2f15e6ed --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartExportStatus.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +struct MergeTreeExportStatus +{ + String source_database; + String source_table; + String destination_database; + String destination_table; + time_t create_time = 0; + std::string part_name; +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index a4ab9066bb33..3037f67b23ac 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -466,6 +466,22 @@ void 
MergeTreePartition::create(const StorageMetadataPtr & metadata_snapshot, Bl } } +Block MergeTreePartition::getBlockWithPartitionValues(const NamesAndTypesList & partition_columns) const +{ + chassert(partition_columns.size() == value.size()); + + Block result; + + std::size_t i = 0; + for (const auto & partition_column : partition_columns) + { + auto column = partition_column.type->createColumnConst(1, value[i++]); + result.insert({column, partition_column.type, partition_column.name}); + } + + return result; +} + NamesAndTypesList MergeTreePartition::executePartitionByExpression(const StorageMetadataPtr & metadata_snapshot, Block & block, ContextPtr context) { auto adjusted_partition_key = adjustPartitionKey(metadata_snapshot, context); diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 4338b216cdb8..811cfdc2a90c 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -60,6 +60,8 @@ struct MergeTreePartition void create(const StorageMetadataPtr & metadata_snapshot, Block block, size_t row, ContextPtr context); + Block getBlockWithPartitionValues(const NamesAndTypesList & partition_columns) const; + /// Adjust partition key and execute its expression on block. Return sample block according to used expression. static NamesAndTypesList executePartitionByExpression(const StorageMetadataPtr & metadata_snapshot, Block & block, ContextPtr context); diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index d0385d1c7d33..56613657e68a 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -168,6 +168,10 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( addThrottler(read_settings.remote_throttler, context->getMergesThrottler()); addThrottler(read_settings.local_throttler, context->getMergesThrottler()); break; + case Export: + addThrottler(read_settings.local_throttler, context->getExportsThrottler()); + addThrottler(read_settings.remote_throttler, context->getExportsThrottler()); + break; } MergeTreeReaderSettings reader_settings = diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.h b/src/Storages/MergeTree/MergeTreeSequentialSource.h index abba230d9e79..a858adf33bb5 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.h +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.h @@ -15,6 +15,7 @@ enum MergeTreeSequentialSourceType { Mutation, Merge, + Export, }; /// Create stream for reading single part from MergeTree. 
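The duplicate-export guard earlier in this patch hinges on the manifest comparator: `MergeTreePartExportManifest::operator<` orders manifests by destination table and then part name, so a second export of the same part to the same destination fails the `export_manifests.emplace(...)` check and raises "already being exported". A minimal, self-contained sketch of that set behaviour (simplified struct and illustrative names only, not the real manifest type):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <tuple>

/// Stand-in for MergeTreePartExportManifest: only the two fields that form the key.
struct PartExportManifestSketch
{
    std::string destination;   /// stand-in for destination_storage_id.getQualifiedName()
    std::string part_name;     /// stand-in for data_part->name

    bool operator<(const PartExportManifestSketch & rhs) const
    {
        return std::tie(destination, part_name) < std::tie(rhs.destination, rhs.part_name);
    }
};

int main()
{
    std::set<PartExportManifestSketch> in_flight;

    /// First export of the part is accepted.
    std::cout << in_flight.emplace(PartExportManifestSketch{"db.s3_table", "2020_1_1_0"}).second << '\n'; // 1

    /// Same part to the same destination: emplace returns false,
    /// which is the condition under which the real code throws ABORTED.
    std::cout << in_flight.emplace(PartExportManifestSketch{"db.s3_table", "2020_1_1_0"}).second << '\n'; // 0

    /// Same part to a different destination is a distinct key and is accepted.
    std::cout << in_flight.emplace(PartExportManifestSketch{"db.other_table", "2020_1_1_0"}).second << '\n'; // 1
}
```

Note that the key is purely (destination, part name): `operator==` in the manifest compares only those two values, so the transaction id used by `killExportPart` does not take part in deduplication.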
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 8420cd5738c2..aca997b1e443 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace CurrentMetrics @@ -27,6 +28,11 @@ namespace MergeTreeSetting extern const MergeTreeSettingsSeconds zookeeper_session_expiration_check_period; } +namespace ServerSetting +{ + extern const ServerSettingsBool enable_experimental_export_merge_tree_partition_feature; +} + namespace ErrorCodes { extern const int REPLICA_IS_ALREADY_ACTIVE; @@ -171,10 +177,19 @@ bool ReplicatedMergeTreeRestartingThread::runImpl() storage.mutations_updating_task->activateAndSchedule(); storage.mutations_finalizing_task->activateAndSchedule(); storage.merge_selecting_task->activateAndSchedule(); + + if (storage.getContext()->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + storage.export_merge_tree_partition_updating_task->activateAndSchedule(); + storage.export_merge_tree_partition_select_task->activateAndSchedule(); + storage.export_merge_tree_partition_status_handling_task->activateAndSchedule(); + } + storage.cleanup_thread.start(); storage.async_block_ids_cache.start(); storage.part_check_thread.start(); + LOG_DEBUG(log, "Table started successfully"); return true; } diff --git a/src/Storages/MergeTree/tests/gtest_export_partition_ordering.cpp b/src/Storages/MergeTree/tests/gtest_export_partition_ordering.cpp new file mode 100644 index 000000000000..c9e3ffd9eef9 --- /dev/null +++ b/src/Storages/MergeTree/tests/gtest_export_partition_ordering.cpp @@ -0,0 +1,75 @@ +#include +#include + +namespace DB +{ + +class ExportPartitionOrderingTest : public ::testing::Test +{ +protected: + ExportPartitionTaskEntriesContainer container; + ExportPartitionTaskEntriesContainer::index::type & by_key; + ExportPartitionTaskEntriesContainer::index::type & by_create_time; + + ExportPartitionOrderingTest() + : by_key(container.get()) + , by_create_time(container.get()) + { + } +}; + +TEST_F(ExportPartitionOrderingTest, IterationOrderMatchesCreateTime) +{ + time_t base_time = 1000; + + ExportReplicatedMergeTreePartitionManifest manifest1; + manifest1.partition_id = "2020"; + manifest1.destination_database = "db1"; + manifest1.destination_table = "table1"; + manifest1.transaction_id = "tx1"; + manifest1.create_time = base_time + 300; // Latest + + ExportReplicatedMergeTreePartitionManifest manifest2; + manifest2.partition_id = "2021"; + manifest2.destination_database = "db1"; + manifest2.destination_table = "table1"; + manifest2.transaction_id = "tx2"; + manifest2.create_time = base_time + 100; // Middle + + ExportReplicatedMergeTreePartitionManifest manifest3; + manifest3.partition_id = "2022"; + manifest3.destination_database = "db1"; + manifest3.destination_table = "table1"; + manifest3.transaction_id = "tx3"; + manifest3.create_time = base_time; // Oldest + + ExportReplicatedMergeTreePartitionTaskEntry entry1{manifest1, ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, {}}; + ExportReplicatedMergeTreePartitionTaskEntry entry2{manifest2, ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, {}}; + ExportReplicatedMergeTreePartitionTaskEntry entry3{manifest3, ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING, {}}; + + // Insert in reverse order + by_key.insert(entry1); + 
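+    // Entries are inserted through the by_key view; the assertions below read the same
+    // entries back through the by_create_time view to check the ascending ordering.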
by_key.insert(entry2); + by_key.insert(entry3); + + // Verify iteration order matches create_time (ascending) + auto it = by_create_time.begin(); + ASSERT_NE(it, by_create_time.end()); + EXPECT_EQ(it->manifest.partition_id, "2022"); // Oldest first + EXPECT_EQ(it->manifest.create_time, base_time); + + ++it; + ASSERT_NE(it, by_create_time.end()); + EXPECT_EQ(it->manifest.partition_id, "2021"); + EXPECT_EQ(it->manifest.create_time, base_time + 100); + + ++it; + ASSERT_NE(it, by_create_time.end()); + EXPECT_EQ(it->manifest.partition_id, "2020"); + EXPECT_EQ(it->manifest.create_time, base_time + 300); + + ++it; + EXPECT_EQ(it, by_create_time.end()); +} + +} diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index c068a87d52c6..6d0392cd6a89 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -61,6 +61,7 @@ const std::unordered_set optional_configuration_keys = { "partition_columns_in_data_file", "client_id", "tenant_id", + "storage_type", }; void StorageAzureConfiguration::check(ContextPtr context) @@ -156,10 +157,6 @@ void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & coll String connection_url; String container_name; - std::optional account_name; - std::optional account_key; - std::optional client_id; - std::optional tenant_id; if (collection.has("connection_string")) connection_url = collection.get("connection_string"); @@ -181,9 +178,9 @@ void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & coll if (collection.has("tenant_id")) tenant_id = collection.get("tenant_id"); - structure = collection.getOrDefault("structure", "auto"); - format = collection.getOrDefault("format", format); - compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); + setStructure(collection.getOrDefault("structure", "auto")); + setFormat(collection.getOrDefault("format", getFormat())); + setCompressionMethod(collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto"))); if (collection.has("partition_strategy")) { @@ -195,10 +192,10 @@ void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & coll throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); + setPartitionStrategyType(partition_strategy_type_opt.value()); } - partition_columns_in_data_file = collection.getOrDefault("partition_columns_in_data_file", partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE); + setPartitionColumnsInDataFile(collection.getOrDefault("partition_columns_in_data_file", getPartitionStrategyType() != PartitionStrategyFactory::StrategyType::HIVE)); blobs_paths = {blob_path}; connection_params = getConnectionParams(connection_url, container_name, account_name, account_key, client_id, tenant_id, context); @@ -219,13 +216,13 @@ ASTPtr StorageAzureConfiguration::extractExtraCredentials(ASTs & args) return nullptr; } -bool StorageAzureConfiguration::collectCredentials(ASTPtr maybe_credentials, std::optional & client_id, std::optional & tenant_id, ContextPtr local_context) +bool StorageAzureConfiguration::collectCredentials(ASTPtr maybe_credentials, std::optional & client_id_, std::optional & tenant_id_, ContextPtr local_context) { if (!maybe_credentials) return false; - client_id = {}; - tenant_id = {}; + client_id_ 
= {}; + tenant_id_ = {}; const auto * credentials_ast_function = maybe_credentials->as(); if (!credentials_ast_function || credentials_ast_function->name != "extra_credentials") @@ -259,9 +256,9 @@ bool StorageAzureConfiguration::collectCredentials(ASTPtr maybe_credentials, std if (arg_value.getType() != Field::Types::Which::String) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected string as credential value"); else if (arg_name == "client_id") - client_id = arg_value.safeGet(); + client_id_ = arg_value.safeGet(); else if (arg_name == "tenant_id") - tenant_id = arg_value.safeGet(); + tenant_id_ = arg_value.safeGet(); else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid credential argument found: {}", arg_name); } @@ -287,16 +284,10 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, std::unordered_map engine_args_to_idx; - String connection_url = checkAndGetLiteralArgument(engine_args[0], "connection_string/storage_account_url"); String container_name = checkAndGetLiteralArgument(engine_args[1], "container"); blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); - std::optional account_name; - std::optional account_key; - std::optional client_id; - std::optional tenant_id; - collectCredentials(extra_credentials, client_id, tenant_id, context); auto is_format_arg = [] (const std::string & s) -> bool @@ -309,12 +300,12 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { - format = fourth_arg; + setFormat(fourth_arg); } else { if (with_structure) - structure = fourth_arg; + setStructure(fourth_arg); else throw Exception( ErrorCodes::BAD_ARGUMENTS, @@ -326,8 +317,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + setFormat(fourth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[4], "compression")); } else { @@ -340,19 +331,19 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + setFormat(fourth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[4], "compression")); auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "partition_strategy/structure"); if (magic_enum::enum_contains(sixth_arg, magic_enum::case_insensitive)) { - partition_strategy_type = magic_enum::enum_cast(sixth_arg, magic_enum::case_insensitive).value(); + setPartitionStrategyType(magic_enum::enum_cast(sixth_arg, magic_enum::case_insensitive).value()); } else { if (with_structure) { - structure = sixth_arg; + setStructure(sixth_arg); } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", sixth_arg); @@ -365,12 +356,12 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/structure"); if (is_format_arg(sixth_arg)) { - format = sixth_arg; + setFormat(sixth_arg); } else { if (with_structure) - structure = sixth_arg; + setStructure(sixth_arg); else throw 
Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); } @@ -382,8 +373,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, if (is_format_arg(fourth_arg)) { - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + setFormat(fourth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[4], "compression")); const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[5], "partition_strategy"); const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); @@ -392,14 +383,14 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); + setPartitionStrategyType(partition_strategy_type_opt.value()); /// If it's of type String, then it is not `partition_columns_in_data_file` if (const auto seventh_arg = tryGetLiteralArgument(engine_args[6], "structure/partition_columns_in_data_file")) { if (with_structure) { - structure = seventh_arg.value(); + setStructure(seventh_arg.value()); } else { @@ -408,7 +399,7 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, } else { - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file"); + setPartitionColumnsInDataFile(checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file")); } } else @@ -423,8 +414,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); if (!is_format_arg(sixth_arg)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + setFormat(sixth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[6], "compression")); } } else if (engine_args.size() == 8) @@ -439,8 +430,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, /// When using a connection string, the function only accepts 8 arguments in case `with_structure=true` throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid sequence / combination of arguments"); } - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + setFormat(fourth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[4], "compression")); const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[5], "partition_strategy"); const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); @@ -449,9 +440,9 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file"); - structure = checkAndGetLiteralArgument(engine_args[7], "structure"); + setPartitionStrategyType(partition_strategy_type_opt.value()); + setPartitionColumnsInDataFile(checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file")); + 
setStructure(checkAndGetLiteralArgument(engine_args[7], "structure")); } else { @@ -460,19 +451,19 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); if (!is_format_arg(sixth_arg)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + setFormat(sixth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[6], "compression")); auto eighth_arg = checkAndGetLiteralArgument(engine_args[7], "partition_strategy/structure"); if (magic_enum::enum_contains(eighth_arg, magic_enum::case_insensitive)) { - partition_strategy_type = magic_enum::enum_cast(eighth_arg, magic_enum::case_insensitive).value(); + setPartitionStrategyType(magic_enum::enum_cast(eighth_arg, magic_enum::case_insensitive).value()); } else { if (with_structure) { - structure = eighth_arg; + setStructure(eighth_arg); } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", eighth_arg); @@ -487,8 +478,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); if (!is_format_arg(sixth_arg)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + setFormat(sixth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[6], "compression")); const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[7], "partition_strategy"); const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); @@ -497,13 +488,13 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); + setPartitionStrategyType(partition_strategy_type_opt.value()); /// If it's of type String, then it is not `partition_columns_in_data_file` if (const auto nineth_arg = tryGetLiteralArgument(engine_args[8], "structure/partition_columns_in_data_file")) { if (with_structure) { - structure = nineth_arg.value(); + setStructure(nineth_arg.value()); } else { @@ -512,7 +503,7 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, } else { - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file"); + setPartitionColumnsInDataFile(checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file")); } } else if (engine_args.size() == 10 && with_structure) @@ -523,8 +514,8 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); if (!is_format_arg(sixth_arg)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); + setFormat(sixth_arg); + setCompressionMethod(checkAndGetLiteralArgument(engine_args[6], "compression")); const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[7], "partition_strategy"); const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, 
magic_enum::case_insensitive); @@ -533,9 +524,9 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file"); - structure = checkAndGetLiteralArgument(engine_args[9], "structure"); + setPartitionStrategyType(partition_strategy_type_opt.value()); + setPartitionColumnsInDataFile(checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file")); + setStructure(checkAndGetLiteralArgument(engine_args[9], "structure")); } blobs_paths = {blob_path}; @@ -706,6 +697,22 @@ void StorageAzureConfiguration::addStructureAndFormatToArgsIfNeeded( } } +ASTPtr StorageAzureConfiguration::createArgsWithAccessData() const +{ + auto arguments = std::make_shared(); + + arguments->children.push_back(std::make_shared(connection_params.endpoint.storage_account_url)); + arguments->children.push_back(std::make_shared(connection_params.endpoint.container_name)); + arguments->children.push_back(std::make_shared(blob_path.path)); + if (account_name && account_key) + { + arguments->children.push_back(std::make_shared(*account_name)); + arguments->children.push_back(std::make_shared(*account_key)); + } + + return arguments; +} + } #endif diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index ceae6e1796c7..76c6ec2cfb0a 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -86,15 +86,21 @@ class StorageAzureConfiguration : public StorageObjectStorageConfiguration ContextPtr context, bool with_structure) override; + ASTPtr createArgsWithAccessData() const override; + protected: void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; ASTPtr extractExtraCredentials(ASTs & args); - bool collectCredentials(ASTPtr maybe_credentials, std::optional & client_id, std::optional & tenant_id, ContextPtr local_context); + bool collectCredentials(ASTPtr maybe_credentials, std::optional & client_id_, std::optional & tenant_id_, ContextPtr local_context); Path blob_path; Paths blobs_paths; AzureBlobStorage::ConnectionParams connection_params; + std::optional account_name; + std::optional account_key; + std::optional client_id; + std::optional tenant_id; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Common.cpp b/src/Storages/ObjectStorage/DataLakes/Common.cpp index 65041a470e4c..c292915f2e16 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Common.cpp @@ -14,7 +14,7 @@ std::vector listFiles( const String & prefix, const String & suffix) { auto key = std::filesystem::path(configuration.getPathForRead().path) / prefix; - RelativePathsWithMetadata files_with_metadata; + PathsWithMetadata files_with_metadata; object_storage.listObjects(key, files_with_metadata, 0); Strings res; for (const auto & file_with_metadata : files_with_metadata) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 1f01c5c12207..7fd9f2c20051 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ 
b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -10,11 +11,17 @@ #include #include #include -#include +#include #include #include #include #include +#include +#include +#include +#include +#include + #include #include @@ -36,20 +43,21 @@ namespace ErrorCodes namespace DataLakeStorageSetting { - extern DataLakeStorageSettingsBool allow_dynamic_metadata_for_data_lakes; - extern DataLakeStorageSettingsDatabaseDataLakeCatalogType storage_catalog_type; - extern DataLakeStorageSettingsString object_storage_endpoint; - extern DataLakeStorageSettingsString storage_aws_access_key_id; - extern DataLakeStorageSettingsString storage_aws_secret_access_key; - extern DataLakeStorageSettingsString storage_region; - extern DataLakeStorageSettingsString storage_catalog_url; - extern DataLakeStorageSettingsString storage_warehouse; - extern DataLakeStorageSettingsString storage_catalog_credential; - - extern DataLakeStorageSettingsString storage_auth_scope; - extern DataLakeStorageSettingsString storage_auth_header; - extern DataLakeStorageSettingsString storage_oauth_server_uri; - extern DataLakeStorageSettingsBool storage_oauth_server_use_request_body; + extern const DataLakeStorageSettingsBool allow_dynamic_metadata_for_data_lakes; + extern const DataLakeStorageSettingsDatabaseDataLakeCatalogType storage_catalog_type; + extern const DataLakeStorageSettingsString object_storage_endpoint; + extern const DataLakeStorageSettingsString storage_aws_access_key_id; + extern const DataLakeStorageSettingsString storage_aws_secret_access_key; + extern const DataLakeStorageSettingsString storage_region; + extern const DataLakeStorageSettingsString storage_catalog_url; + extern const DataLakeStorageSettingsString storage_warehouse; + extern const DataLakeStorageSettingsString storage_catalog_credential; + + extern const DataLakeStorageSettingsString storage_auth_scope; + extern const DataLakeStorageSettingsString storage_auth_header; + extern const DataLakeStorageSettingsString storage_oauth_server_uri; + extern const DataLakeStorageSettingsBool storage_oauth_server_use_request_body; + extern const DataLakeStorageSettingsString iceberg_metadata_file_path; } template @@ -121,7 +129,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl bool supportsDelete() const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->supportsDelete(); } @@ -132,32 +140,32 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl std::shared_ptr catalog, const std::optional & format_settings) override { - assertInitialized(); + assertInitializedDL(); current_metadata->mutate(commands, context, storage_id, metadata_snapshot, catalog, format_settings); } void checkMutationIsPossible(const MutationCommands & commands) override { - assertInitialized(); + assertInitializedDL(); current_metadata->checkMutationIsPossible(commands); } void checkAlterIsPossible(const AlterCommands & commands) override { - assertInitialized(); + assertInitializedDL(); current_metadata->checkAlterIsPossible(commands); } void alter(const AlterCommands & params, ContextPtr context) override { - assertInitialized(); + assertInitializedDL(); current_metadata->alter(params, context); } std::optional tryGetTableStructureFromMetadata() const override { - assertInitialized(); + assertInitializedDL(); if (auto schema = current_metadata->getTableSchema(); !schema.empty()) return 
ColumnsDescription(std::move(schema)); return std::nullopt; @@ -165,38 +173,38 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl std::optional totalRows(ContextPtr local_context) override { - assertInitialized(); + assertInitializedDL(); return current_metadata->totalRows(local_context); } std::optional totalBytes(ContextPtr local_context) override { - assertInitialized(); + assertInitializedDL(); return current_metadata->totalBytes(local_context); } std::shared_ptr getInitialSchemaByPath(ContextPtr local_context, ObjectInfoPtr object_info) const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->getInitialSchemaByPath(local_context, object_info); } std::shared_ptr getSchemaTransformer(ContextPtr local_context, ObjectInfoPtr object_info) const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->getSchemaTransformer(local_context, object_info); } bool hasExternalDynamicMetadata() override { - assertInitialized(); + assertInitializedDL(); return (*settings)[DataLakeStorageSetting::allow_dynamic_metadata_for_data_lakes] && current_metadata->supportsSchemaEvolution(); } IDataLakeMetadata * getExternalMetadata() override { - assertInitialized(); + assertInitializedDL(); return current_metadata.get(); } @@ -204,7 +212,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl bool supportsWrites() const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->supportsWrites(); } @@ -214,7 +222,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl size_t list_batch_size, ContextPtr context) override { - assertInitialized(); + assertInitializedDL(); return current_metadata->iterate(filter_dag, callback, list_batch_size, context); } @@ -226,7 +234,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl #if USE_PARQUET && USE_AWS_S3 DeltaLakePartitionColumns getDeltaLakePartitionColumns() const { - assertInitialized(); + assertInitializedDL(); const auto * delta_lake_metadata = dynamic_cast(current_metadata.get()); if (delta_lake_metadata) return delta_lake_metadata->getPartitionColumns(); @@ -234,20 +242,20 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl } #endif - void modifyFormatSettings(FormatSettings & settings_) const override + void modifyFormatSettings(FormatSettings & settings_, const Context & local_context) const override { - assertInitialized(); - current_metadata->modifyFormatSettings(settings_); + assertInitializedDL(); + current_metadata->modifyFormatSettings(settings_, local_context); } ColumnMapperPtr getColumnMapperForObject(ObjectInfoPtr object_info) const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->getColumnMapperForObject(object_info); } ColumnMapperPtr getColumnMapperForCurrentSchema() const override { - assertInitialized(); + assertInitializedDL(); return current_metadata->getColumnMapperForCurrentSchema(); } @@ -309,7 +317,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl bool optimize(const StorageMetadataPtr & metadata_snapshot, ContextPtr context, const std::optional & format_settings) override { - assertInitialized(); + assertInitializedDL(); return current_metadata->optimize(metadata_snapshot, context, format_settings); } @@ -318,13 +326,50 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl 
current_metadata->addDeleteTransformers(object_info, builder, format_settings, local_context); } + ASTPtr createArgsWithAccessData() const override + { + auto res = BaseStorageConfiguration::createArgsWithAccessData(); + + auto iceberg_metadata_file_path = (*settings)[DataLakeStorageSetting::iceberg_metadata_file_path]; + + if (iceberg_metadata_file_path.changed) + { + auto * arguments = res->template as(); + if (!arguments) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Arguments are not an expression list"); + + bool has_settings = false; + + for (auto & arg : arguments->children) + { + if (auto * settings_ast = arg->template as()) + { + has_settings = true; + settings_ast->changes.setSetting("iceberg_metadata_file_path", iceberg_metadata_file_path.value); + break; + } + } + + if (!has_settings) + { + std::shared_ptr settings_ast = std::make_shared(); + settings_ast->is_standalone = false; + settings_ast->changes.setSetting("iceberg_metadata_file_path", iceberg_metadata_file_path.value); + arguments->children.push_back(settings_ast); + } + } + + return res; + } + private: DataLakeMetadataPtr current_metadata; LoggerPtr log = getLogger("DataLakeConfiguration"); const DataLakeStorageSettingsPtr settings; - void assertInitialized() const + void assertInitializedDL() const { + BaseStorageConfiguration::assertInitialized(); if (!current_metadata) throw Exception(ErrorCodes::LOGICAL_ERROR, "Metadata is not initialized"); } @@ -386,15 +431,362 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl using StorageS3IcebergConfiguration = DataLakeConfiguration; #endif -#if USE_AZURE_BLOB_STORAGE +# if USE_AZURE_BLOB_STORAGE using StorageAzureIcebergConfiguration = DataLakeConfiguration; #endif -#if USE_HDFS +# if USE_HDFS using StorageHDFSIcebergConfiguration = DataLakeConfiguration; #endif using StorageLocalIcebergConfiguration = DataLakeConfiguration; + +/// Class detects storage type by `storage_type` parameter if exists +/// and uses appropriate implementation - S3, Azure, HDFS or Local +class StorageIcebergConfiguration : public StorageObjectStorageConfiguration, public std::enable_shared_from_this +{ + friend class StorageObjectStorageConfiguration; + +public: + explicit StorageIcebergConfiguration(DataLakeStorageSettingsPtr settings_) : settings(settings_) {} + + ObjectStorageType getType() const override { return getImpl().getType(); } + + std::string getTypeName() const override { return getImpl().getTypeName(); } + std::string getEngineName() const override { return getImpl().getEngineName(); } + std::string getNamespaceType() const override { return getImpl().getNamespaceType(); } + + Path getRawPath() const override { return getImpl().getRawPath(); } + const String & getRawURI() const override { return getImpl().getRawURI(); } + const Path & getPathForRead() const override { return getImpl().getPathForRead(); } + Path getPathForWrite(const std::string & partition_id) const override { return getImpl().getPathForWrite(partition_id); } + + void setPathForRead(const Path & path) override { getImpl().setPathForRead(path); } + + const Paths & getPaths() const override { return getImpl().getPaths(); } + void setPaths(const Paths & paths) override { getImpl().setPaths(paths); } + + String getDataSourceDescription() const override { return getImpl().getDataSourceDescription(); } + String getNamespace() const override { return getImpl().getNamespace(); } + + StorageObjectStorageQuerySettings getQuerySettings(const ContextPtr & context) const override + { return 
getImpl().getQuerySettings(context); } + + void addStructureAndFormatToArgsIfNeeded( + ASTs & args, const String & structure_, const String & format_, ContextPtr context, bool with_structure) override + { getImpl().addStructureAndFormatToArgsIfNeeded(args, structure_, format_, context, with_structure); } + + bool isNamespaceWithGlobs() const override { return getImpl().isNamespaceWithGlobs(); } + + bool isArchive() const override { return getImpl().isArchive(); } + bool isPathInArchiveWithGlobs() const override { return getImpl().isPathInArchiveWithGlobs(); } + std::string getPathInArchive() const override { return getImpl().getPathInArchive(); } + + void check(ContextPtr context) override { getImpl().check(context); } + void validateNamespace(const String & name) const override { getImpl().validateNamespace(name); } + + ObjectStoragePtr createObjectStorage(ContextPtr context, bool is_readonly) override + { return getImpl().createObjectStorage(context, is_readonly); } + bool isStaticConfiguration() const override { return getImpl().isStaticConfiguration(); } + + bool isDataLakeConfiguration() const override { return getImpl().isDataLakeConfiguration(); } + + std::optional totalRows(ContextPtr context) override { return getImpl().totalRows(context); } + std::optional totalBytes(ContextPtr context) override { return getImpl().totalBytes(context); } + + bool hasExternalDynamicMetadata() override { return getImpl().hasExternalDynamicMetadata(); } + + IDataLakeMetadata * getExternalMetadata() override { return getImpl().getExternalMetadata(); } + + std::shared_ptr getInitialSchemaByPath(ContextPtr context, ObjectInfoPtr object_info) const override + { return getImpl().getInitialSchemaByPath(context, object_info); } + + std::shared_ptr getSchemaTransformer(ContextPtr context, ObjectInfoPtr object_info) const override + { return getImpl().getSchemaTransformer(context, object_info); } + + void modifyFormatSettings(FormatSettings & settings_, const Context & local_context) const override { getImpl().modifyFormatSettings(settings_, local_context); } + + void addDeleteTransformers( + ObjectInfoPtr object_info, + QueryPipelineBuilder & builder, + const std::optional & format_settings, + ContextPtr local_context) const override { getImpl().addDeleteTransformers(object_info, builder, format_settings, local_context); } + + ReadFromFormatInfo prepareReadingFromFormat( + ObjectStoragePtr object_storage, + const Strings & requested_columns, + const StorageSnapshotPtr & storage_snapshot, + bool supports_subset_of_columns, + bool supports_tuple_elements, + ContextPtr local_context, + const PrepareReadingFromFormatHiveParams & hive_parameters) override + { + return getImpl().prepareReadingFromFormat( + object_storage, + requested_columns, + storage_snapshot, + supports_subset_of_columns, + supports_tuple_elements, + local_context, + hive_parameters); + } + + void initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context) override + { getImpl().initPartitionStrategy(partition_by, columns, context); } + + std::optional tryGetTableStructureFromMetadata() const override + { return getImpl().tryGetTableStructureFromMetadata(); } + + bool supportsFileIterator() const override { return getImpl().supportsFileIterator(); } + bool supportsWrites() const override { return getImpl().supportsWrites(); } + + bool supportsPartialPathPrefix() const override { return getImpl().supportsPartialPathPrefix(); } + + ObjectIterator iterate( + const ActionsDAG * filter_dag, + std::function 
callback, + size_t list_batch_size, + ContextPtr context) override + { + return getImpl().iterate(filter_dag, callback, list_batch_size, context); + } + + bool update( + ObjectStoragePtr object_storage_ptr, + ContextPtr context, + bool if_not_updated_before, + bool check_consistent_with_previous_metadata) override + { + return getImpl().update(object_storage_ptr, context, if_not_updated_before, check_consistent_with_previous_metadata); + } + + void create( + ObjectStoragePtr object_storage, + ContextPtr local_context, + const std::optional & columns, + ASTPtr partition_by, + bool if_not_exists, + std::shared_ptr catalog, + const StorageID & table_id_) override + { + getImpl().create(object_storage, local_context, columns, partition_by, if_not_exists, catalog, table_id_); + } + + SinkToStoragePtr write( + SharedHeader sample_block, + const StorageID & table_id, + ObjectStoragePtr object_storage, + const std::optional & format_settings, + ContextPtr context, + std::shared_ptr catalog) override + { + return getImpl().write(sample_block, table_id, object_storage, format_settings, context, catalog); + } + + bool supportsDelete() const override { return getImpl().supportsDelete(); } + void mutate(const MutationCommands & commands, + ContextPtr context, + const StorageID & storage_id, + StorageMetadataPtr metadata_snapshot, + std::shared_ptr catalog, + const std::optional & format_settings) override + { + getImpl().mutate(commands, context, storage_id, metadata_snapshot, catalog, format_settings); + } + + void checkMutationIsPossible(const MutationCommands & commands) override { getImpl().checkMutationIsPossible(commands); } + + void checkAlterIsPossible(const AlterCommands & commands) override { getImpl().checkAlterIsPossible(commands); } + + void alter(const AlterCommands & params, ContextPtr context) override { getImpl().alter(params, context); } + + const DataLakeStorageSettings & getDataLakeSettings() const override { return getImpl().getDataLakeSettings(); } + + void initialize( + ASTs & engine_args, + ContextPtr local_context, + bool with_table_structure) override + { + createDynamicConfiguration(engine_args, local_context); + getImpl().initialize(engine_args, local_context, with_table_structure); + } + + ASTPtr createArgsWithAccessData() const override + { + return getImpl().createArgsWithAccessData(); + } + + void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override + { getImpl().fromNamedCollection(collection, context); } + void fromAST(ASTs & args, ContextPtr context, bool with_structure) override + { getImpl().fromAST(args, context, with_structure); } + + const String & getFormat() const override { return getImpl().getFormat(); } + const String & getCompressionMethod() const override { return getImpl().getCompressionMethod(); } + const String & getStructure() const override { return getImpl().getStructure(); } + + PartitionStrategyFactory::StrategyType getPartitionStrategyType() const override { return getImpl().getPartitionStrategyType(); } + bool getPartitionColumnsInDataFile() const override { return getImpl().getPartitionColumnsInDataFile(); } + std::shared_ptr getPartitionStrategy() const override { return getImpl().getPartitionStrategy(); } + + void setFormat(const String & format_) override { getImpl().setFormat(format_); } + void setCompressionMethod(const String & compression_method_) override { getImpl().setCompressionMethod(compression_method_); } + void setStructure(const String & structure_) override { getImpl().setStructure(structure_); } 
+ + void setPartitionStrategyType(PartitionStrategyFactory::StrategyType partition_strategy_type_) override + { + getImpl().setPartitionStrategyType(partition_strategy_type_); + } + void setPartitionColumnsInDataFile(bool partition_columns_in_data_file_) override + { + getImpl().setPartitionColumnsInDataFile(partition_columns_in_data_file_); + } + void setPartitionStrategy(const std::shared_ptr & partition_strategy_) override + { + getImpl().setPartitionStrategy(partition_strategy_); + } + + ColumnMapperPtr getColumnMapperForObject(ObjectInfoPtr obj) const override { return getImpl().getColumnMapperForObject(obj); } + + ColumnMapperPtr getColumnMapperForCurrentSchema() const override { return getImpl().getColumnMapperForCurrentSchema(); } + + std::shared_ptr getCatalog(ContextPtr context, bool is_attach) const override + { + return getImpl().getCatalog(context, is_attach); + } + + bool optimize(const StorageMetadataPtr & metadata_snapshot, ContextPtr context, const std::optional & format_settings) override + { + return getImpl().optimize(metadata_snapshot, context, format_settings); + } + +protected: + /// Find storage_type argument and remove it from args if exists. + /// Return storage type. + ObjectStorageType extractDynamicStorageType(ASTs & args, ContextPtr context, ASTPtr * type_arg) const override + { + static const auto * const storage_type_name = "storage_type"; + + if (auto named_collection = tryGetNamedCollectionWithOverrides(args, context)) + { + if (named_collection->has(storage_type_name)) + { + return objectStorageTypeFromString(named_collection->get(storage_type_name)); + } + } + + auto * type_it = args.end(); + + /// S3 by default for backward compatibility + /// Iceberg without storage_type == IcebergS3 + ObjectStorageType type = ObjectStorageType::S3; + + for (auto * arg_it = args.begin(); arg_it != args.end(); ++arg_it) + { + const auto * type_ast_function = (*arg_it)->as(); + + if (type_ast_function && type_ast_function->name == "equals" + && type_ast_function->arguments && type_ast_function->arguments->children.size() == 2) + { + auto * name = type_ast_function->arguments->children[0]->as(); + + if (name && name->name() == storage_type_name) + { + if (type_it != args.end()) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "DataLake can have only one key-value argument: storage_type='type'."); + } + + auto * value = type_ast_function->arguments->children[1]->as(); + + if (!value) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "DataLake parameter 'storage_type' has wrong type, string literal expected."); + } + + if (value->value.getType() != Field::Types::String) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "DataLake parameter 'storage_type' has wrong value type, string expected."); + } + + type = objectStorageTypeFromString(value->value.safeGet()); + + type_it = arg_it; + } + } + } + + if (type_it != args.end()) + { + if (type_arg) + *type_arg = *type_it; + args.erase(type_it); + } + + return type; + } + + void createDynamicConfiguration(ASTs & args, ContextPtr context) + { + ObjectStorageType type = extractDynamicStorageType(args, context, nullptr); + createDynamicStorage(type); + } + + void assertInitialized() const override { getImpl().assertInitialized(); } + +private: + inline StorageObjectStorageConfiguration & getImpl() const + { + if (!impl) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Dynamic DataLake storage not initialized"); + + return *impl; + } + + void createDynamicStorage(ObjectStorageType type) + { + if (impl) + { + if 
(impl->getType() == type) + return; + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't change DataLake engine storage"); + } + + switch (type) + { +# if USE_AWS_S3 + case ObjectStorageType::S3: + impl = std::make_unique(settings); + break; +# endif +# if USE_AZURE_BLOB_STORAGE + case ObjectStorageType::Azure: + impl = std::make_unique(settings); + break; +# endif +# if USE_HDFS + case ObjectStorageType::HDFS: + impl = std::make_unique(settings); + break; +# endif + case ObjectStorageType::Local: + impl = std::make_unique(settings); + break; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported DataLake storage {}", type); + } + } + + StorageObjectStorageConfigurationPtr impl; + DataLakeStorageSettingsPtr settings; +}; #endif #if USE_PARQUET diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h index 04f6f8fb1931..5c9ef47f8f01 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h @@ -60,6 +60,9 @@ If enabled, the engine would use the metadata file with the most recent last_upd )", 0) \ DECLARE(Bool, iceberg_use_version_hint, false, R"( Get latest metadata path from version-hint.text file. +)", 0) \ + DECLARE(String, object_storage_cluster, "", R"( +Cluster for distributed requests. )", 0) \ DECLARE(NonZeroUInt64, iceberg_format_version, 2, R"( Metadata format version. diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakePartitionedSink.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakePartitionedSink.cpp index dfe79756429b..da222f531491 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakePartitionedSink.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakePartitionedSink.cpp @@ -180,7 +180,7 @@ DeltaLakePartitionedSink::createSinkForPartition(StringRef partition_key) { auto data_prefix = std::filesystem::path(delta_transaction->getDataPath()) / partition_key.toString(); return std::make_unique( - DeltaLake::generateWritePath(std::move(data_prefix), configuration->format), + DeltaLake::generateWritePath(std::move(data_prefix), configuration->getFormat()), object_storage, configuration, format_settings, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakeSink.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakeSink.cpp index 41726594cb0f..a588435e5189 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakeSink.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/DeltaLakeSink.cpp @@ -42,7 +42,7 @@ DeltaLakeSink::StorageSinkPtr DeltaLakeSink::createStorageSink() const return std::make_unique( DeltaLake::generateWritePath( delta_transaction->getDataPath(), - configuration->format), + configuration->getFormat()), object_storage, configuration, format_settings, diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp index 68a42937e379..5155a9cd2255 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.cpp @@ -115,7 +115,7 @@ NamesAndTypesList DeltaLakeMetadataDeltaKernel::getTableSchema() const return table_snapshot->getTableSchema(); } -void DeltaLakeMetadataDeltaKernel::modifyFormatSettings(FormatSettings & format_settings) const +void DeltaLakeMetadataDeltaKernel::modifyFormatSettings(FormatSettings & 
format_settings, const Context &) const { /// There can be missing columns because of ALTER ADD/DROP COLUMN. /// So to support reading from such tables it is enough to turn on this setting. diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h index c43d7dcd13a2..4a161ada5930 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h @@ -47,7 +47,7 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata bool operator ==(const IDataLakeMetadata &) const override; - void modifyFormatSettings(FormatSettings & format_settings) const override; + void modifyFormatSettings(FormatSettings & format_settings, const Context &) const override; static DataLakeMetadataPtr create( ObjectStoragePtr object_storage, diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp index 8417b232a845..b6471c4ba03d 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.cpp @@ -44,7 +44,7 @@ Strings HudiMetadata::getDataFilesImpl() const { auto configuration_ptr = configuration.lock(); auto log = getLogger("HudiMetadata"); - const auto keys = listFiles(*object_storage, *configuration_ptr, "", Poco::toLower(configuration_ptr->format)); + const auto keys = listFiles(*object_storage, *configuration_ptr, "", Poco::toLower(configuration_ptr->getFormat())); using Partition = std::string; using FileID = std::string; @@ -91,7 +91,7 @@ HudiMetadata::HudiMetadata(ObjectStoragePtr object_storage_, StorageObjectStorag { } -Strings HudiMetadata::getDataFiles(const ActionsDAG *) const +Strings HudiMetadata::getDataFiles() const { if (data_files.empty()) data_files = getDataFilesImpl(); @@ -99,12 +99,12 @@ Strings HudiMetadata::getDataFiles(const ActionsDAG *) const } ObjectIterator HudiMetadata::iterate( - const ActionsDAG * filter_dag, + const ActionsDAG * /* filter_dag */, FileProgressCallback callback, size_t /* list_batch_size */, ContextPtr /* context */) const { - return createKeysIterator(getDataFiles(filter_dag), object_storage, callback); + return createKeysIterator(getDataFiles(), object_storage, callback); } } diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h index 47f2e8d95366..147b39893006 100644 --- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h @@ -60,7 +60,7 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext mutable Strings data_files; Strings getDataFilesImpl() const; - Strings getDataFiles(const ActionsDAG * filter_dag) const; + Strings getDataFiles() const; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index 92c40ae695d7..3205da746054 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -1,9 +1,18 @@ #include #include +#include +#include +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +}; + namespace { @@ -41,12 +50,16 @@ class KeysIterator : public IObjectIterator return nullptr; auto key = data_files[current_index]; - auto object_metadata = object_storage->getObjectMetadata(key); if (callback) - 
callback(FileProgress(0, object_metadata.size_bytes)); + { + /// Too expensive to always load the size for metadata + /// because it requires an API call to the external storage. + /// In many cases only the keys are needed. + callback(FileProgress(0, 1)); + } - return std::make_shared(key, std::move(object_metadata)); + return std::make_shared(key, std::nullopt); } } @@ -87,4 +100,109 @@ ReadFromFormatInfo IDataLakeMetadata::prepareReadingFromFormat( return DB::prepareReadingFromFormat(requested_columns, storage_snapshot, context, supports_subset_of_columns, supports_tuple_elements); } +DataFileMetaInfo::DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_) +{ + + std::vector column_ids; + for (const auto & column : columns_info_) + column_ids.push_back(column.first); + + auto name_and_types = schema_processor.tryGetFieldsCharacteristics(schema_id, column_ids); + std::unordered_map name_by_index; + for (const auto & name_and_type : name_and_types) + { + const auto name = name_and_type.getNameInStorage(); + auto index = schema_processor.tryGetColumnIDByName(schema_id, name); + if (index.has_value()) + name_by_index[index.value()] = name; + } + + for (const auto & column : columns_info_) + { + auto i_name = name_by_index.find(column.first); + if (i_name != name_by_index.end()) + { + columns_info[i_name->second] = {column.second.rows_count, column.second.nulls_count, column.second.hyperrectangle}; + } + } +} + +constexpr size_t FIELD_MASK_ROWS = 0x1; +constexpr size_t FIELD_MASK_NULLS = 0x2; +constexpr size_t FIELD_MASK_RECT = 0x4; +constexpr size_t FIELD_MASK_ALL = 0x7; + +void DataFileMetaInfo::serialize(WriteBuffer & out) const +{ + auto size = columns_info.size(); + writeIntBinary(size, out); + for (const auto & column : columns_info) + { + writeStringBinary(column.first, out); + size_t field_mask = 0; + if (column.second.rows_count.has_value()) + field_mask |= FIELD_MASK_ROWS; + if (column.second.nulls_count.has_value()) + field_mask |= FIELD_MASK_NULLS; + if (column.second.hyperrectangle.has_value()) + field_mask |= FIELD_MASK_RECT; + writeIntBinary(field_mask, out); + + if (column.second.rows_count.has_value()) + writeIntBinary(column.second.rows_count.value(), out); + if (column.second.nulls_count.has_value()) + writeIntBinary(column.second.nulls_count.value(), out); + if (column.second.hyperrectangle.has_value()) + { + writeFieldBinary(column.second.hyperrectangle.value().left, out); + writeFieldBinary(column.second.hyperrectangle.value().right, out); + } + } +} + +DataFileMetaInfo DataFileMetaInfo::deserialize(ReadBuffer & in) +{ + DataFileMetaInfo result; + + size_t size; + readIntBinary(size, in); + + for (size_t i = 0; i < size; ++i) + { + std::string name; + readStringBinary(name, in); + size_t field_mask; + readIntBinary(field_mask, in); + if ((field_mask & FIELD_MASK_ALL) != field_mask) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected field mask: {}", field_mask); + + ColumnInfo & column = result.columns_info[name]; + + if (field_mask & FIELD_MASK_ROWS) + { + Int64 value; + readIntBinary(value, in); + column.rows_count = value; + } + if (field_mask & FIELD_MASK_NULLS) + { + Int64 value; + readIntBinary(value, in); + column.nulls_count = value; + } + if (field_mask & FIELD_MASK_RECT) + { + FieldRef left = readFieldBinary(in); + FieldRef right = readFieldBinary(in); + column.hyperrectangle = Range(left, true, right, true); + } + } + + return result; +} + + } diff --git 
a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h index 962e965574b4..eb509c474afe 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,8 @@ #include #include +#include + namespace DataLake { @@ -27,7 +30,59 @@ namespace DB namespace ErrorCodes { extern const int UNSUPPORTED_METHOD; -} +}; + +namespace Iceberg +{ +struct ColumnInfo; +}; + +class DataFileMetaInfo +{ +public: + DataFileMetaInfo() = default; + + // subset of Iceberg::ColumnInfo now + struct ColumnInfo + { + std::optional rows_count; + std::optional nulls_count; + std::optional hyperrectangle; + }; + + // Extract metadata from Iceberg structure + explicit DataFileMetaInfo( + const Iceberg::IcebergSchemaProcessor & schema_processor, + Int32 schema_id, + const std::unordered_map & columns_info_); + + void serialize(WriteBuffer & out) const; + static DataFileMetaInfo deserialize(ReadBuffer & in); + + bool empty() const { return columns_info.empty(); } + + std::unordered_map columns_info; +}; + +using DataFileMetaInfoPtr = std::shared_ptr; + +struct DataFileInfo +{ + std::string file_path; + std::optional file_meta_info; + std::optional absolute_uri; + + explicit DataFileInfo(const std::string & file_path_) + : file_path(file_path_) {} + + explicit DataFileInfo(std::string && file_path_) + : file_path(std::move(file_path_)) {} + + bool operator==(const DataFileInfo & rhs) const + { + return file_path == rhs.file_path; + } +}; class SinkToStorage; using SinkToStoragePtr = std::shared_ptr; @@ -74,7 +129,7 @@ class IDataLakeMetadata : boost::noncopyable virtual bool supportsSchemaEvolution() const { return false; } virtual bool supportsWrites() const { return false; } - virtual void modifyFormatSettings(FormatSettings &) const {} + virtual void modifyFormatSettings(FormatSettings &, const Context &) const {} virtual std::optional totalRows(ContextPtr) const { return {}; } virtual std::optional totalBytes(ContextPtr) const { return {}; } @@ -112,6 +167,9 @@ class IDataLakeMetadata : boost::noncopyable virtual void checkAlterIsPossible(const AlterCommands & /*commands*/) { throwNotImplemented("alter"); } virtual void alter(const AlterCommands & /*params*/, ContextPtr /*context*/) { throwNotImplemented("alter"); } + virtual std::optional partitionKey(ContextPtr) const { return {}; } + virtual std::optional sortingKey(ContextPtr) const { return {}; } + protected: virtual ObjectIterator createKeysIterator( Strings && data_files_, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp index 5057b5df9d2d..a574b06f2eeb 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/AvroForIcebergDeserializer.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB::ErrorCodes { @@ -19,6 +20,12 @@ namespace DB::ErrorCodes extern const int INCORRECT_DATA; } +namespace ProfileEvents +{ + extern const Event IcebergAvroFileParsing; + extern const Event IcebergAvroFileParsingMicroseconds; +} + namespace DB::Iceberg { @@ -30,6 +37,9 @@ try : buffer(std::move(buffer_)) , manifest_file_path(manifest_file_path_) { + ProfileEvents::increment(ProfileEvents::IcebergAvroFileParsing); + ProfileEventTimeIncrement 
watch(ProfileEvents::IcebergAvroFileParsingMicroseconds); + auto manifest_file_reader = std::make_unique(std::make_unique(*buffer)); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp index 0076b1d52ddf..c2e783dc73e9 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.cpp @@ -69,6 +69,7 @@ struct Plan IcebergHistory history; std::unordered_map manifest_file_to_first_snapshot; std::unordered_map> manifest_list_to_manifest_files; + std::unordered_map> manifest_file_to_snapshots; std::unordered_map>> snapshot_id_to_data_files; std::unordered_map> path_to_data_file; FileNamesGenerator generator; @@ -111,6 +112,7 @@ Plan getPlan( IcebergHistory snapshots_info, const PersistentTableComponents & persistent_table_components, ObjectStoragePtr object_storage, + SecondaryStorages & secondary_storages, StorageObjectStorageConfigurationPtr configuration, ContextPtr context, CompressionMethod compression_method) @@ -146,29 +148,36 @@ Plan getPlan( std::unordered_map> manifest_files; for (const auto & snapshot : snapshots_info) { + auto [manifest_list_storage, key_in_storage] = resolveObjectStorageForPath(persistent_table_components.table_location, snapshot.manifest_list_path, object_storage, secondary_storages, context); + auto manifest_list - = getManifestList(object_storage, configuration, persistent_table_components, context, snapshot.manifest_list_path, log); + = getManifestList(manifest_list_storage, configuration, persistent_table_components, context, key_in_storage, snapshot.manifest_list_path, log); + for (const auto & manifest_file : manifest_list) { - plan.manifest_list_to_manifest_files[snapshot.manifest_list_path].push_back(manifest_file.manifest_file_path); - if (!plan.manifest_file_to_first_snapshot.contains(manifest_file.manifest_file_path)) - plan.manifest_file_to_first_snapshot[manifest_file.manifest_file_path] = snapshot.snapshot_id; + plan.manifest_list_to_manifest_files[snapshot.manifest_list_absolute_path].push_back(manifest_file.manifest_file_absolute_path); + if (!plan.manifest_file_to_first_snapshot.contains(manifest_file.manifest_file_absolute_path)) + { + plan.manifest_file_to_first_snapshot[manifest_file.manifest_file_absolute_path] = snapshot.snapshot_id; + } auto manifest_file_content = getManifestFile( object_storage, configuration, persistent_table_components, context, log, - manifest_file.manifest_file_path, + manifest_file.manifest_file_absolute_path, manifest_file.added_sequence_number, - manifest_file.added_snapshot_id); + manifest_file.added_snapshot_id, + secondary_storages); - if (!manifest_files.contains(manifest_file.manifest_file_path)) + if (!manifest_files.contains(manifest_file.manifest_file_absolute_path)) { - manifest_files[manifest_file.manifest_file_path] = std::make_shared(current_schema); - manifest_files[manifest_file.manifest_file_path]->path = manifest_file.manifest_file_path; + manifest_files[manifest_file.manifest_file_absolute_path] = std::make_shared(current_schema); + manifest_files[manifest_file.manifest_file_absolute_path]->path = manifest_file.manifest_file_absolute_path; } - manifest_files[manifest_file.manifest_file_path]->manifest_lists_path.push_back(snapshot.manifest_list_path); + manifest_files[manifest_file.manifest_file_absolute_path]->manifest_lists_path.push_back(snapshot.manifest_list_path); + 
plan.manifest_file_to_snapshots[manifest_file.manifest_file_absolute_path].insert(snapshot.snapshot_id); auto data_files = manifest_file_content->getFilesWithoutDeleted(FileContentType::DATA); auto positional_delete_files = manifest_file_content->getFilesWithoutDeleted(FileContentType::POSITION_DELETE); for (const auto & pos_delete_file : positional_delete_files) @@ -180,19 +189,23 @@ Plan getPlan( if (plan.partitions.size() <= partition_index) plan.partitions.push_back({}); - IcebergDataObjectInfoPtr data_object_info = std::make_shared(data_file); + auto [resolved_storage, resolved_key] = resolveObjectStorageForPath( + persistent_table_components.table_location, data_file.file_path, object_storage, secondary_storages, context); + + IcebergDataObjectInfoPtr data_object_info = std::make_shared(data_file, resolved_storage, resolved_key); std::shared_ptr data_file_ptr; - if (!plan.path_to_data_file.contains(manifest_file.manifest_file_path)) + std::string path_identifier = resolved_storage->getDescription() + ":" + resolved_storage->getObjectsNamespace() + "|" + resolved_key; + if (!plan.path_to_data_file.contains(path_identifier)) { data_file_ptr = std::make_shared(DataFilePlan{ .data_object_info = data_object_info, - .manifest_list = manifest_files[manifest_file.manifest_file_path], + .manifest_list = manifest_files[manifest_file.manifest_file_absolute_path], .patched_path = plan.generator.generateDataFileName()}); - plan.path_to_data_file[manifest_file.manifest_file_path] = data_file_ptr; + plan.path_to_data_file[path_identifier] = data_file_ptr; } else { - data_file_ptr = plan.path_to_data_file[manifest_file.manifest_file_path]; + data_file_ptr = plan.path_to_data_file[path_identifier]; } plan.partitions[partition_index].push_back(data_file_ptr); plan.snapshot_id_to_data_files[snapshot.snapshot_id].push_back(plan.partitions[partition_index].back()); @@ -224,15 +237,20 @@ void writeDataFiles( ObjectStoragePtr object_storage, const std::optional & format_settings, ContextPtr context, - StorageObjectStorageConfigurationPtr configuration) + StorageObjectStorageConfigurationPtr configuration, + const String & table_location, + SecondaryStorages & secondary_storages) { for (auto & [_, data_file] : initial_plan.path_to_data_file) { auto delete_file_transform = std::make_shared( - sample_block, data_file->data_object_info, object_storage, format_settings, context); + sample_block, data_file->data_object_info, object_storage, format_settings, context, table_location, secondary_storages); + ObjectStoragePtr storage_to_use = data_file->data_object_info->getObjectStorage(); + if (!storage_to_use) + storage_to_use = object_storage; StorageObjectStorage::ObjectInfo object_info(data_file->data_object_info->getPath()); - auto read_buffer = createReadBuffer(object_info, object_storage, context, getLogger("IcebergCompaction")); + auto read_buffer = createReadBuffer(object_info, storage_to_use, context, getLogger("IcebergCompaction")); const Settings & settings = context->getSettingsRef(); auto parser_shared_resources = std::make_shared( @@ -240,7 +258,7 @@ void writeDataFiles( /*num_streams_=*/1); auto input_format = FormatFactory::instance().getInput( - configuration->format, + configuration->getFormat(), *read_buffer, *sample_block, context, @@ -249,7 +267,7 @@ void writeDataFiles( parser_shared_resources, std::make_shared(nullptr, context, nullptr), true /* is_remote_fs */, - chooseCompressionMethod(data_file->data_object_info->getPath(), configuration->compression_method), + 
chooseCompressionMethod(data_file->data_object_info->getPath(), configuration->getCompressionMethod()), false); auto write_buffer = object_storage->writeObject( @@ -260,7 +278,7 @@ void writeDataFiles( context->getWriteSettings()); auto output_format - = FormatFactory::instance().getOutputFormat(configuration->format, *write_buffer, *sample_block, context, format_settings); + = FormatFactory::instance().getOutputFormat(configuration->getFormat(), *write_buffer, *sample_block, context, format_settings); while (true) { @@ -390,6 +408,9 @@ void writeMetadataFiles( { manifest_entry->patched_path = plan.generator.generateManifestEntryName(); manifest_file_renamings[manifest_entry->path] = manifest_entry->patched_path.path_in_metadata; + + std::vector unique_data_filenames(data_filenames.begin(), data_filenames.end()); + auto buffer_manifest_entry = object_storage->writeObject( StoredObject(manifest_entry->patched_path.path_in_storage), WriteMode::Rewrite, @@ -407,11 +428,11 @@ void writeMetadataFiles( partition_columns, plan.partition_encoder.getPartitionValue(grouped_by_manifest_files_partitions[manifest_entry]), ChunkPartitioner(fields_from_partition_spec, current_schema, context, sample_block_).getResultTypes(), - std::vector(data_filenames.begin(), data_filenames.end()), + unique_data_filenames, manifest_entry->statistics, sample_block_, snapshot, - configuration->format, + configuration->getFormat(), partititon_spec, partition_spec_id, *buffer_manifest_entry, @@ -436,16 +457,25 @@ void writeMetadataFiles( if (plan.history[i].added_files == 0) continue; - auto initial_manifest_list_name = plan.history[i].manifest_list_path; + auto initial_manifest_list_name = plan.history[i].manifest_list_absolute_path; auto initial_manifest_entries = plan.manifest_list_to_manifest_files[initial_manifest_list_name]; - auto renamed_manifest_list = manifest_list_renamings[initial_manifest_list_name]; + auto renamed_manifest_list = manifest_list_renamings[plan.history[i].manifest_list_path]; std::vector renamed_manifest_entries; + std::unordered_set seen_manifest_entries; // Deduplicate manifest entries Int32 total_manifest_file_sizes = 0; for (const auto & initial_manifest_entry : initial_manifest_entries) { auto renamed_manifest_entry = manifest_file_renamings[initial_manifest_entry]; if (!renamed_manifest_entry.empty()) { + auto it = plan.manifest_file_to_snapshots.find(initial_manifest_entry); + if (it != plan.manifest_file_to_snapshots.end() && !it->second.contains(plan.history[i].snapshot_id)) + continue; + + if (seen_manifest_entries.contains(renamed_manifest_entry)) + continue; + + seen_manifest_entries.insert(renamed_manifest_entry); renamed_manifest_entries.push_back(renamed_manifest_entry); total_manifest_file_sizes += manifest_file_sizes[renamed_manifest_entry]; } @@ -508,6 +538,7 @@ void compactIcebergTable( IcebergHistory snapshots_info, const PersistentTableComponents & persistent_table_components, ObjectStoragePtr object_storage_, + SecondaryStorages & secondary_storages_, StorageObjectStorageConfigurationPtr configuration_, const std::optional & format_settings_, SharedHeader sample_block_, @@ -515,11 +546,11 @@ void compactIcebergTable( CompressionMethod compression_method_) { auto plan - = getPlan(std::move(snapshots_info), persistent_table_components, object_storage_, configuration_, context_, compression_method_); + = getPlan(std::move(snapshots_info), persistent_table_components, object_storage_, secondary_storages_, configuration_, context_, compression_method_); if (plan.need_optimize) 
{ auto old_files = getOldFiles(object_storage_, configuration_); - writeDataFiles(plan, sample_block_, object_storage_, format_settings_, context_, configuration_); + writeDataFiles(plan, sample_block_, object_storage_, format_settings_, context_, configuration_, persistent_table_components.table_location, secondary_storages_); writeMetadataFiles(plan, object_storage_, configuration_, context_, sample_block_); clearOldFiles(object_storage_, old_files); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.h index c2c80ccbf6a9..16d3622c939e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Compaction.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB::Iceberg @@ -15,6 +16,7 @@ void compactIcebergTable( IcebergHistory snapshots_info, const PersistentTableComponents & persistent_table_components, DB::ObjectStoragePtr object_storage_, + SecondaryStorages & secondary_storages_, DB::StorageObjectStorageConfigurationPtr configuration_, const std::optional & format_settings_, DB::SharedHeader sample_block_, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp index 7462a3b59bff..057a95cfbf7a 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.cpp @@ -1,3 +1,4 @@ +#include #include "config.h" #if USE_AVRO @@ -36,18 +37,35 @@ namespace Setting extern const SettingsBool use_roaring_bitmap_iceberg_positional_deletes; }; + IcebergDataObjectInfo::IcebergDataObjectInfo(Iceberg::ManifestFileEntry data_manifest_file_entry_) - : RelativePathWithMetadata(data_manifest_file_entry_.file_path) + : PathWithMetadata(data_manifest_file_entry_.file_path, std::nullopt, + data_manifest_file_entry_.file_path_key.empty() ? std::nullopt : std::make_optional(data_manifest_file_entry_.file_path_key)) , data_object_file_path_key(data_manifest_file_entry_.file_path_key) , underlying_format_read_schema_id(data_manifest_file_entry_.schema_id) , sequence_number(data_manifest_file_entry_.added_sequence_number) { - auto toupper = [](String & str) + if (!position_deletes_objects.empty() && Poco::toUpperInPlace(data_manifest_file_entry_.file_format) != "PARQUET") { - std::transform(str.begin(), str.end(), str.begin(), ::toupper); - return str; - }; - if (!position_deletes_objects.empty() && toupper(data_manifest_file_entry_.file_format) != "PARQUET") + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Position deletes are only supported for data files of Parquet format in Iceberg, but got {}", + data_manifest_file_entry_.file_format); + } +} + +IcebergDataObjectInfo::IcebergDataObjectInfo( + Iceberg::ManifestFileEntry data_manifest_file_entry_, + ObjectStoragePtr resolved_storage, + const String & resolved_key) + : PathWithMetadata(resolved_key, std::nullopt, + data_manifest_file_entry_.file_path.empty() ? 
std::nullopt : std::make_optional(data_manifest_file_entry_.file_path), + resolved_storage) + , data_object_file_path_key(data_manifest_file_entry_.file_path_key) + , underlying_format_read_schema_id(data_manifest_file_entry_.schema_id) + , sequence_number(data_manifest_file_entry_.added_sequence_number) +{ + if (!position_deletes_objects.empty() && Poco::toUpperInPlace(data_manifest_file_entry_.file_format) != "PARQUET") { throw Exception( ErrorCodes::NOT_IMPLEMENTED, @@ -60,13 +78,15 @@ std::shared_ptr IcebergDataObjectInfo::getPositionDeleteTransf ObjectStoragePtr object_storage, const SharedHeader & header, const std::optional & format_settings, - ContextPtr context_) + ContextPtr context_, + const String & table_location, + SecondaryStorages & secondary_storages) { IcebergDataObjectInfoPtr self = shared_from_this(); if (!context_->getSettingsRef()[Setting::use_roaring_bitmap_iceberg_positional_deletes].value) - return std::make_shared(header, self, object_storage, format_settings, context_); + return std::make_shared(header, self, object_storage, format_settings, context_, table_location, secondary_storages); else - return std::make_shared(header, self, object_storage, format_settings, context_); + return std::make_shared(header, self, object_storage, format_settings, context_, table_location, secondary_storages); } } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h index 40cbd2252928..e5e8e1b6cc09 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergDataObjectInfo.h @@ -8,12 +8,13 @@ #include #include +#include #include namespace DB { -struct IcebergDataObjectInfo : public RelativePathWithMetadata, std::enable_shared_from_this +struct IcebergDataObjectInfo : public PathWithMetadata, std::enable_shared_from_this { using IcebergDataObjectInfoPtr = std::shared_ptr; @@ -21,12 +22,20 @@ struct IcebergDataObjectInfo : public RelativePathWithMetadata, std::enable_shar /// It is used to filter position deletes objects by data file path. /// It is also used to create a filter for the data object in the position delete transform. explicit IcebergDataObjectInfo(Iceberg::ManifestFileEntry data_manifest_file_entry_); + + /// Sometimes data files are located outside the table location and even in a different storage. 
+ explicit IcebergDataObjectInfo( + Iceberg::ManifestFileEntry data_manifest_file_entry_, + ObjectStoragePtr resolved_storage, + const String & resolved_key); std::shared_ptr getPositionDeleteTransformer( ObjectStoragePtr object_storage, const SharedHeader & header, const std::optional & format_settings, - ContextPtr context_); + ContextPtr context_, + const String & table_location, + SecondaryStorages & secondary_storages); void addPositionDeleteObject(Iceberg::ManifestFileEntry position_delete_object) { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp index 7377400d32e4..7264647486e6 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include @@ -137,15 +139,16 @@ std::optional SingleThreadIcebergKeysIterator::next() persistent_components, local_context, log, - data_snapshot->manifest_list_entries[manifest_file_index].manifest_file_path, + data_snapshot->manifest_list_entries[manifest_file_index].manifest_file_absolute_path, data_snapshot->manifest_list_entries[manifest_file_index].added_sequence_number, - data_snapshot->manifest_list_entries[manifest_file_index].added_snapshot_id); + data_snapshot->manifest_list_entries[manifest_file_index].added_snapshot_id, + *secondary_storages); internal_data_index = 0; } auto files = files_generator(current_manifest_file_content); while (internal_data_index < files.size()) { - const auto & manifest_file_entry = files[internal_data_index++]; + auto & manifest_file_entry = files[internal_data_index++]; if ((manifest_file_entry.schema_id != previous_entry_schema) && (use_partition_pruning)) { previous_entry_schema = manifest_file_entry.schema_id; @@ -167,7 +170,7 @@ std::optional SingleThreadIcebergKeysIterator::next() auto pruning_status = current_pruner ? current_pruner->canBePruned(manifest_file_entry) : PruningReturnStatus::NOT_PRUNED; insertRowToLogTable( local_context, - "", + [&]()->String { return ""; }, DB::IcebergMetadataLogLevel::ManifestFileEntry, configuration.lock()->getRawPath().path, current_manifest_file_content->getPathToManifestFile(), @@ -176,7 +179,13 @@ std::optional SingleThreadIcebergKeysIterator::next() switch (pruning_status) { case PruningReturnStatus::NOT_PRUNED: + { + auto [storage_to_use, resolved_key] = resolveObjectStorageForPath( + persistent_components.table_location, manifest_file_entry.file_path, object_storage, *secondary_storages, local_context); + manifest_file_entry.storage_to_use = storage_to_use; + manifest_file_entry.resolved_key = resolved_key; return manifest_file_entry; + } case PruningReturnStatus::MIN_MAX_INDEX_PRUNED: { ++min_max_index_pruned_files; break; @@ -214,7 +223,8 @@ SingleThreadIcebergKeysIterator::SingleThreadIcebergKeysIterator( const ActionsDAG * filter_dag_, Iceberg::IcebergTableStateSnapshotPtr table_snapshot_, Iceberg::IcebergDataSnapshotPtr data_snapshot_, - PersistentTableComponents persistent_components_) + PersistentTableComponents persistent_components_, + std::shared_ptr secondary_storages_) : object_storage(object_storage_) , filter_dag(filter_dag_ ? 
std::make_shared(filter_dag_->clone()) : nullptr) , local_context(local_context_) @@ -236,6 +246,7 @@ SingleThreadIcebergKeysIterator::SingleThreadIcebergKeysIterator( , persistent_components(persistent_components_) , files_generator(files_generator_) , log(getLogger("IcebergIterator")) + , secondary_storages(secondary_storages_) , manifest_file_content_type(manifest_file_content_type_) { } @@ -248,7 +259,8 @@ IcebergIterator::IcebergIterator( IDataLakeMetadata::FileProgressCallback callback_, Iceberg::IcebergTableStateSnapshotPtr table_snapshot_, Iceberg::IcebergDataSnapshotPtr data_snapshot_, - PersistentTableComponents persistent_components_) + PersistentTableComponents persistent_components_, + std::shared_ptr secondary_storages_) : filter_dag(filter_dag_ ? std::make_unique(filter_dag_->clone()) : nullptr) , object_storage(std::move(object_storage_)) , data_files_iterator( @@ -260,7 +272,8 @@ IcebergIterator::IcebergIterator( filter_dag.get(), table_snapshot_, data_snapshot_, - persistent_components_) + persistent_components_, + secondary_storages_) , deletes_iterator( object_storage, local_context_, @@ -276,12 +289,16 @@ IcebergIterator::IcebergIterator( filter_dag.get(), table_snapshot_, data_snapshot_, - persistent_components_) + persistent_components_, + secondary_storages_) , blocking_queue(100) , producer_task(std::nullopt) , callback(std::move(callback_)) - , format(configuration_.lock()->format) - , compression_method(configuration_.lock()->compression_method) + , format(configuration_.lock()->getFormat()) + , compression_method(configuration_.lock()->getCompressionMethod()) + , persistent_components(persistent_components_) + , table_schema_id(table_snapshot_->schema_id) + , secondary_storages(secondary_storages_) { auto delete_file = deletes_iterator.next(); while (delete_file.has_value()) @@ -338,16 +355,22 @@ ObjectInfoPtr IcebergIterator::next(size_t) Iceberg::ManifestFileEntry manifest_file_entry; if (blocking_queue.pop(manifest_file_entry)) { - IcebergDataObjectInfoPtr object_info = std::make_shared(manifest_file_entry); + IcebergDataObjectInfoPtr object_info = std::make_shared( + manifest_file_entry, manifest_file_entry.storage_to_use, manifest_file_entry.resolved_key); + for (const auto & position_delete : defineDeletesSpan(manifest_file_entry, position_deletes_files, false)) - { object_info->addPositionDeleteObject(position_delete); - } + for (const auto & equality_delete : defineDeletesSpan(manifest_file_entry, equality_deletes_files, true)) - { object_info->addEqualityDeleteObject(equality_delete); - } + + object_info->setFileMetaInfo(std::make_shared( + *persistent_components.schema_processor, + table_schema_id, /// current schema id to use current column names + manifest_file_entry.columns_infos)); + ProfileEvents::increment(ProfileEvents::IcebergMetadataReturnedObjectInfos); + return object_info; } { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h index 16d8f823e303..f59db6c16a2c 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergIterator.h @@ -25,6 +25,7 @@ #include #include #include +#include namespace DB { @@ -45,7 +46,8 @@ class SingleThreadIcebergKeysIterator const ActionsDAG * filter_dag_, IcebergTableStateSnapshotPtr table_snapshot_, IcebergDataSnapshotPtr data_snapshot_, - PersistentTableComponents persistent_components); + PersistentTableComponents persistent_components, + 
std::shared_ptr secondary_storages_); std::optional next(); @@ -62,7 +64,7 @@ class SingleThreadIcebergKeysIterator PersistentTableComponents persistent_components; FilesGenerator files_generator; LoggerPtr log; - + std::shared_ptr secondary_storages; // By Iceberg design it is difficult to avoid storing position deletes in memory. size_t manifest_file_index = 0; @@ -90,7 +92,8 @@ class IcebergIterator : public IObjectIterator IDataLakeMetadata::FileProgressCallback callback_, Iceberg::IcebergTableStateSnapshotPtr table_snapshot_, Iceberg::IcebergDataSnapshotPtr data_snapshot_, - Iceberg::PersistentTableComponents persistent_components); + Iceberg::PersistentTableComponents persistent_components_, + std::shared_ptr secondary_storages_); ObjectInfoPtr next(size_t) override; @@ -111,6 +114,9 @@ class IcebergIterator : public IObjectIterator std::vector equality_deletes_files; std::exception_ptr exception; std::mutex exception_mutex; + Iceberg::PersistentTableComponents persistent_components; + Int32 table_schema_id; + std::shared_ptr secondary_storages; // Sometimes data or manifests can be located on another storage }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index 8cfc7ed86039..0ed41e4f8409 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -135,9 +135,10 @@ IcebergMetadata::IcebergMetadata( IcebergMetadataFilesCachePtr cache_ptr, CompressionMethod metadata_compression_method_) : object_storage(std::move(object_storage_)) + , secondary_storages(std::make_shared()) , configuration(std::move(configuration_)) , persistent_components(PersistentTableComponents{ - .schema_processor = std::make_shared(), + .schema_processor = std::make_shared(context_), .metadata_cache = cache_ptr, .format_version = format_version_, .table_location = metadata_object_->getValue(f_location) @@ -150,7 +151,7 @@ IcebergMetadata::IcebergMetadata( updateState(context_, metadata_object_); } -void IcebergMetadata::addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Ptr metadata_object) const +void IcebergMetadata::addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Ptr metadata_object, ContextPtr context_) const { if (persistent_components.schema_processor->hasClickhouseTableSchemaById(schema_id)) return; @@ -165,7 +166,7 @@ void IcebergMetadata::addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Pt auto current_schema = schemas->getObject(i); if (current_schema->has(f_schema_id) && current_schema->getValue(f_schema_id) == schema_id) { - persistent_components.schema_processor->addIcebergTableSchema(current_schema); + persistent_components.schema_processor->addIcebergTableSchema(current_schema, context_); return; } } @@ -176,13 +177,16 @@ void IcebergMetadata::addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Pt } Int32 IcebergMetadata::parseTableSchema( - const Poco::JSON::Object::Ptr & metadata_object, IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger) + const Poco::JSON::Object::Ptr & metadata_object, + IcebergSchemaProcessor & schema_processor, + ContextPtr context_, + LoggerPtr metadata_logger) { const auto format_version = metadata_object->getValue(f_format_version); if (format_version == 2) { auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); + schema_processor.addIcebergTableSchema(schema, 
context_); return current_schema_id; } else @@ -190,7 +194,7 @@ Int32 IcebergMetadata::parseTableSchema( try { auto [schema, current_schema_id] = parseTableSchemaV1Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); + schema_processor.addIcebergTableSchema(schema, context_); return current_schema_id; } catch (const Exception & first_error) @@ -200,7 +204,7 @@ Int32 IcebergMetadata::parseTableSchema( try { auto [schema, current_schema_id] = parseTableSchemaV2Method(metadata_object); - schema_processor.addIcebergTableSchema(schema); + schema_processor.addIcebergTableSchema(schema, context_); LOG_WARNING( metadata_logger, "Iceberg table schema was parsed using v2 specification, but it was impossible to parse it using v1 " @@ -245,9 +249,10 @@ bool IcebergMetadata::update(const ContextPtr & local_context) updateState(local_context, metadata_object); + auto dump_metadata = [&]()->String { return dumpMetadataObjectToString(metadata_object); }; insertRowToLogTable( local_context, - dumpMetadataObjectToString(metadata_object), + dump_metadata, DB::IcebergMetadataLogLevel::Metadata, configuration_ptr->getRawPath().path, metadata_file_path, @@ -261,6 +266,192 @@ bool IcebergMetadata::update(const ContextPtr & local_context) return previous_snapshot_schema_id != relevant_snapshot_schema_id; } +namespace +{ + +using IdToName = std::unordered_map; + +IdToName buildIdToNameMap(const Poco::JSON::Object::Ptr & metadata_obj) +{ + IdToName map; + if (!metadata_obj || !metadata_obj->has("current-schema-id") || !metadata_obj->has("schemas")) + return map; + + const auto current_schema_id = metadata_obj->getValue("current-schema-id"); + auto schemas = metadata_obj->getArray("schemas"); + if (!schemas) + return map; + + for (size_t i = 0; i < schemas->size(); ++i) + { + auto schema = schemas->getObject(i); + + if (!schema || !schema->has("schema-id") || (schema->getValue("schema-id") != current_schema_id)) + continue; + + if (auto fields = schema->getArray("fields")) + { + for (size_t j = 0; j < fields->size(); ++j) + { + auto f = fields->getObject(j); + if (!f || !f->has("id") || !f->has("name")) + continue; + map.emplace(f->getValue("id"), f->getValue("name")); + } + } + break; + } + return map; +} + +String formatTransform( + const String & transform, + const Poco::JSON::Object::Ptr & field_obj, + const IdToName & id_to_name) +{ + Int32 source_id = (field_obj && field_obj->has("source-id")) + ? field_obj->getValue("source-id") + : -1; + + const auto it = id_to_name.find(source_id); + const String col = (it != id_to_name.end()) ? 
it->second : ("col_" + toString(source_id)); + + String base = transform; + String param; + if (const auto lpos = transform.find('['); lpos != String::npos && transform.back() == ']') + { + base = transform.substr(0, lpos); + param = transform.substr(lpos + 1, transform.size() - lpos - 2); // strip [ and ] + } + + String result; + if (base == "identity") + result = col; + else if (base == "year" || base == "month" || base == "day" || base == "hour") + result = base + "(" + col + ")"; + else if (base != "void") + { + if (!param.empty()) + result = base + "(" + param + ", " + col + ")"; + else + result = base + "(" + col + ")"; + } + return result; +} + +Poco::JSON::Array::Ptr findActivePartitionFields(const Poco::JSON::Object::Ptr & metadata_obj) +{ + if (!metadata_obj) + return nullptr; + + if (metadata_obj->has("partition-spec")) + return metadata_obj->getArray("partition-spec"); + + // If for some reason there is no partition-spec, try partition-specs + default- + if (metadata_obj->has("partition-specs") && metadata_obj->has("default-spec-id")) + { + const auto default_spec_id = metadata_obj->getValue("default-spec-id"); + if (auto specs = metadata_obj->getArray("partition-specs")) + { + for (size_t i = 0; i < specs->size(); ++i) + { + auto spec = specs->getObject(i); + if (!spec || !spec->has("spec-id")) + continue; + if (spec->getValue("spec-id") == default_spec_id) + return spec->has("fields") ? spec->getArray("fields") : nullptr; + } + } + } + + return nullptr; +} + +Poco::JSON::Array::Ptr findActiveSortFields(const Poco::JSON::Object::Ptr & metadata_obj) +{ + if (!metadata_obj || !metadata_obj->has("default-sort-order-id") || !metadata_obj->has("sort-orders")) + return nullptr; + + const auto default_sort_order_id = metadata_obj->getValue("default-sort-order-id"); + auto orders = metadata_obj->getArray("sort-orders"); + if (!orders) + return nullptr; + + for (size_t i = 0; i < orders->size(); ++i) + { + auto order = orders->getObject(i); + if (!order || !order->has("order-id")) + continue; + if (order->getValue("order-id") == default_sort_order_id) + return order->has("fields") ? order->getArray("fields") : nullptr; + } + return nullptr; +} + +String composeList( + const Poco::JSON::Array::Ptr & fields, + const IdToName & id_to_name, + bool lookup_sort_modifiers) +{ + if (!fields || fields->size() == 0) + return {}; + + Strings parts; + parts.reserve(fields->size()); + + for (size_t i = 0; i < fields->size(); ++i) + { + auto field = fields->getObject(i); + if (!field) + continue; + + const String transform = field->has("transform") ? field->getValue("transform") : "identity"; + String expr = formatTransform(transform, field, id_to_name); + if (expr.empty()) + continue; + + if (lookup_sort_modifiers) + { + if (field->has("direction")) + { + auto d = field->getValue("direction"); + expr += (Poco::icompare(d, "desc") == 0) ? 
" DESC" : " ASC"; + } + } + + parts.push_back(std::move(expr)); + } + + if (parts.empty()) + return {}; + + String res; + for (size_t i = 0; i < parts.size(); ++i) + { + if (i) res += ", "; + res += parts[i]; + } + return res; +} + +std::pair, std::optional> extractIcebergKeys(const Poco::JSON::Object::Ptr & metadata_obj) +{ + std::optional partition_key; + std::optional sort_key; + + if (metadata_obj) + { + auto id_to_name = buildIdToNameMap(metadata_obj); + + partition_key = composeList(findActivePartitionFields(metadata_obj), id_to_name, /*lookup_sort_modifiers=*/ false); + sort_key = composeList(findActiveSortFields(metadata_obj), id_to_name, /*lookup_sort_modifiers=*/ true); + } + + return {partition_key, sort_key}; +} + +} + void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Object::Ptr metadata_object) { auto configuration_ptr = configuration.lock(); @@ -274,7 +465,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec for (UInt32 j = 0; j < schemas->size(); ++j) { auto schema = schemas->getObject(j); - persistent_components.schema_processor->addIcebergTableSchema(schema); + persistent_components.schema_processor->addIcebergTableSchema(schema, local_context); } auto snapshots = metadata_object->get(f_snapshots).extract(); bool successfully_found_snapshot = false; @@ -312,19 +503,26 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec } } + auto [partition_key, sorting_key] = extractIcebergKeys(metadata_object); + + String manifest_list_path = snapshot->getValue(f_manifest_list); + auto [storage_to_use, key_in_storage] = resolveObjectStorageForPath(persistent_components.table_location, manifest_list_path, object_storage, *secondary_storages, local_context); + relevant_snapshot = std::make_shared( getManifestList( - object_storage, + storage_to_use, configuration_ptr, persistent_components, - local_context, - getProperFilePathFromMetadataInfo( - snapshot->getValue(f_manifest_list), configuration_ptr->getPathForRead().path, persistent_components.table_location), + local_context, + key_in_storage, + makeAbsolutePath(persistent_components.table_location, manifest_list_path), log), relevant_snapshot_id, total_rows, total_bytes, - total_position_deletes); + total_position_deletes, + partition_key, + sorting_key); if (!snapshot->has(f_schema_id)) throw Exception( @@ -333,7 +531,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec relevant_snapshot_id, configuration_ptr->getPathForRead().path); relevant_snapshot_schema_id = snapshot->getValue(f_schema_id); - addTableSchemaById(relevant_snapshot_schema_id, metadata_object); + addTableSchemaById(relevant_snapshot_schema_id, metadata_object, local_context); } } if (!successfully_found_snapshot) @@ -355,6 +553,7 @@ bool IcebergMetadata::optimize(const StorageMetadataPtr & metadata_snapshot, Con snapshots_info, persistent_components, object_storage, + *secondary_storages, configuration_ptr, format_settings, sample_block, @@ -420,7 +619,11 @@ void IcebergMetadata::updateState(const ContextPtr & local_context, Poco::JSON:: { updateSnapshot(local_context, metadata_object); } - relevant_snapshot_schema_id = parseTableSchema(metadata_object, *persistent_components.schema_processor, log); + relevant_snapshot_schema_id = parseTableSchema( + metadata_object, + *persistent_components.schema_processor, + local_context, + log); } } @@ -437,14 +640,17 @@ std::shared_ptr IcebergMetadata::getInitialSchemaByPath(Conte : nullptr; } 
-std::shared_ptr IcebergMetadata::getSchemaTransformer(ContextPtr, ObjectInfoPtr object_info) const +std::shared_ptr IcebergMetadata::getSchemaTransformer(ContextPtr local_context, ObjectInfoPtr object_info) const { IcebergDataObjectInfo * iceberg_object_info = dynamic_cast(object_info.get()); SharedLockGuard lock(mutex); if (!iceberg_object_info) return nullptr; return (iceberg_object_info->underlying_format_read_schema_id != relevant_snapshot_schema_id) - ? persistent_components.schema_processor->getSchemaTransformationDagByIds(iceberg_object_info->underlying_format_read_schema_id, relevant_snapshot_schema_id) + ? persistent_components.schema_processor->getSchemaTransformationDagByIds( + local_context, + iceberg_object_info->underlying_format_read_schema_id, + relevant_snapshot_schema_id) : nullptr; } @@ -584,9 +790,10 @@ DataLakeMetadataPtr IcebergMetadata::create( auto format_version = object->getValue(f_format_version); + auto dump_metadata = [&]()->String { return dumpMetadataObjectToString(object); }; insertRowToLogTable( local_context, - dumpMetadataObjectToString(object), + dump_metadata, DB::IcebergMetadataLogLevel::Metadata, configuration_ptr->getRawPath().path, metadata_file_path, @@ -725,9 +932,10 @@ std::optional IcebergMetadata::totalRows(ContextPtr local_context) const persistent_components, local_context, log, - manifest_list_entry.manifest_file_path, + manifest_list_entry.manifest_file_absolute_path, manifest_list_entry.added_sequence_number, - manifest_list_entry.added_snapshot_id); + manifest_list_entry.added_snapshot_id, + *secondary_storages); auto data_count = manifest_file_ptr->getRowsCountInAllFilesExcludingDeleted(FileContentType::DATA); auto position_deletes_count = manifest_file_ptr->getRowsCountInAllFilesExcludingDeleted(FileContentType::POSITION_DELETE); if (!data_count.has_value() || !position_deletes_count.has_value()) @@ -765,9 +973,10 @@ std::optional IcebergMetadata::totalBytes(ContextPtr local_context) cons persistent_components, local_context, log, - manifest_list_entry.manifest_file_path, + manifest_list_entry.manifest_file_absolute_path, manifest_list_entry.added_sequence_number, - manifest_list_entry.added_snapshot_id); + manifest_list_entry.added_snapshot_id, + *secondary_storages); auto count = manifest_file_ptr->getBytesCountInAllDataFilesExcludingDeleted(); if (!count.has_value()) return {}; @@ -778,6 +987,19 @@ std::optional IcebergMetadata::totalBytes(ContextPtr local_context) cons return result; } +std::optional IcebergMetadata::partitionKey(ContextPtr) const +{ + SharedLockGuard lock(mutex); + return relevant_snapshot->partition_key; +} + +std::optional IcebergMetadata::sortingKey(ContextPtr) const +{ + SharedLockGuard lock(mutex); + return relevant_snapshot->sorting_key; +} + + ObjectIterator IcebergMetadata::iterate( const ActionsDAG * filter_dag, FileProgressCallback callback, @@ -798,7 +1020,8 @@ ObjectIterator IcebergMetadata::iterate( callback, table_snapshot, relevant_snapshot, - persistent_components); + persistent_components, + secondary_storages); } NamesAndTypesList IcebergMetadata::getTableSchema() const @@ -813,6 +1036,14 @@ std::tuple IcebergMetadata::getVersion() const return std::make_tuple(relevant_snapshot_id, relevant_snapshot_schema_id); } +void IcebergMetadata::modifyFormatSettings(FormatSettings & format_settings, const Context & local_context) const +{ + if (!local_context.getSettingsRef()[Setting::use_roaring_bitmap_iceberg_positional_deletes].value) + /// IcebergStreamingPositionDeleteTransform requires increasing row 
numbers from both the + /// data reader and the deletes reader. + format_settings.parquet.preserve_order = true; +} + void IcebergMetadata::addDeleteTransformers( ObjectInfoPtr object_info, QueryPipelineBuilder & builder, @@ -825,21 +1056,27 @@ void IcebergMetadata::addDeleteTransformers( if (!iceberg_object_info->position_deletes_objects.empty()) { + LOG_DEBUG(log, "Constructing filter transform for position delete, there are {} delete objects", iceberg_object_info->position_deletes_objects.size()); builder.addSimpleTransform( [&](const SharedHeader & header) - { return iceberg_object_info->getPositionDeleteTransformer(object_storage, header, format_settings, local_context); }); + { return iceberg_object_info->getPositionDeleteTransformer(object_storage, header, format_settings, local_context, persistent_components.table_location, *secondary_storages); }); } const auto & delete_files = iceberg_object_info->equality_deletes_objects; - LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); + if (!delete_files.empty()) + LOG_DEBUG(log, "Constructing filter transform for equality delete, there are {} delete files", delete_files.size()); for (const ManifestFileEntry & delete_file : delete_files) { auto simple_transform_adder = [&](const SharedHeader & header) { /// get header of delete file Block delete_file_header; - ObjectInfo delete_file_object(delete_file.file_path); + + auto [delete_storage_to_use, resolved_delete_key] = resolveObjectStorageForPath( + persistent_components.table_location, delete_file.file_path, object_storage, *secondary_storages, local_context); + + PathWithMetadata delete_file_object(resolved_delete_key, std::nullopt, delete_file.file_path, delete_storage_to_use); { - auto schema_read_buffer = createReadBuffer(delete_file_object, object_storage, local_context, log); + auto schema_read_buffer = createReadBuffer(delete_file_object, delete_storage_to_use, local_context, log); auto schema_reader = FormatFactory::instance().getSchemaReader(delete_file.file_format, *schema_read_buffer, local_context); auto columns_with_names = schema_reader->readSchema(); ColumnsWithTypeAndName initial_header_data; @@ -862,7 +1099,7 @@ void IcebergMetadata::addDeleteTransformers( } /// Then we read the content of the delete file. 
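Conceptually, an equality delete file lists key values whose matching data rows must be hidden; the transform built here materializes those values into a set and filters out rows whose key tuple is found in it. A minimal sketch of that semantic (plain `std::set` instead of the actual `Set`/`ExpressionAnalyzer` machinery used above):

```cpp
// Hedged sketch of equality-delete semantics: rows whose equality key appears
// in the delete file are filtered out.
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

int main()
{
    // Data rows: (id, name)
    std::vector<std::pair<int, std::string>> rows = {{1, "a"}, {2, "b"}, {3, "c"}};

    // Equality delete file content for equality_ids = {id}: delete rows with id = 2
    std::set<int> deleted_keys = {2};

    for (const auto & [id, name] : rows)
        if (!deleted_keys.contains(id))
            std::cout << id << ' ' << name << '\n'; // rows 1 and 3 survive
}
```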
auto mutable_columns_for_set = block_for_set.cloneEmptyColumns(); - std::unique_ptr data_read_buffer = createReadBuffer(delete_file_object, object_storage, local_context, log); + std::unique_ptr data_read_buffer = createReadBuffer(delete_file_object, delete_storage_to_use, local_context, log); CompressionMethod compression_method = chooseCompressionMethod(delete_file.file_path, "auto"); auto delete_format = FormatFactory::instance().getInput( delete_file.file_format, @@ -958,7 +1195,7 @@ ColumnMapperPtr IcebergMetadata::getColumnMapperForObject(ObjectInfoPtr object_i if (!iceberg_object_info) return nullptr; auto configuration_ptr = configuration.lock(); - if (Poco::toLower(configuration_ptr->format) != "parquet") + if (Poco::toLower(configuration_ptr->getFormat()) != "parquet") return nullptr; return persistent_components.schema_processor->getColumnMapperById(iceberg_object_info->underlying_format_read_schema_id); @@ -967,11 +1204,12 @@ ColumnMapperPtr IcebergMetadata::getColumnMapperForObject(ObjectInfoPtr object_i ColumnMapperPtr IcebergMetadata::getColumnMapperForCurrentSchema() const { auto configuration_ptr = configuration.lock(); - if (Poco::toLower(configuration_ptr->format) != "parquet") + if (Poco::toLower(configuration_ptr->getFormat()) != "parquet") return nullptr; SharedLockGuard lock(mutex); return persistent_components.schema_processor->getColumnMapperById(relevant_snapshot_schema_id); } + } #endif diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h index e755c72946b8..ad1b60e86820 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h @@ -28,6 +28,7 @@ #include #include #include +#include namespace DB { @@ -80,7 +81,10 @@ class IcebergMetadata : public IDataLakeMetadata bool supportsSchemaEvolution() const override { return true; } static Int32 parseTableSchema( - const Poco::JSON::Object::Ptr & metadata_object, Iceberg::IcebergSchemaProcessor & schema_processor, LoggerPtr metadata_logger); + const Poco::JSON::Object::Ptr & metadata_object, + Iceberg::IcebergSchemaProcessor & schema_processor, + ContextPtr context_, + LoggerPtr metadata_logger); bool supportsUpdate() const override { return true; } bool supportsWrites() const override { return true; } @@ -117,16 +121,28 @@ class IcebergMetadata : public IDataLakeMetadata void checkMutationIsPossible(const MutationCommands & commands) override; + void modifyFormatSettings(FormatSettings & format_settings, const Context & local_context) const override; void addDeleteTransformers(ObjectInfoPtr object_info, QueryPipelineBuilder & builder, const std::optional & format_settings, ContextPtr local_context) const override; void checkAlterIsPossible(const AlterCommands & commands) override; void alter(const AlterCommands & params, ContextPtr context) override; + std::optional partitionKey(ContextPtr) const override; + std::optional sortingKey(ContextPtr) const override; + protected: + ObjectIterator createIcebergKeysIterator( + Strings && data_files_, + ObjectStoragePtr, + IDataLakeMetadata::FileProgressCallback callback_, + ContextPtr local_context); + ObjectIterator iterate(const ActionsDAG * filter_dag, FileProgressCallback callback, size_t list_batch_size, ContextPtr local_context) const override; private: const ObjectStoragePtr object_storage; + mutable std::shared_ptr secondary_storages; // Sometimes data or manifests can be located on another storage 
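The new `secondary_storages` member exists because Iceberg metadata may reference absolute file URIs that live outside the table's own location, possibly on a different storage; `resolveObjectStorageForPath` (used throughout this diff) decides which storage to read from and what the key inside it is. The sketch below is only a rough guess at that decision, with a hypothetical simplified signature, not the real implementation:

```cpp
// Hedged sketch, not the real resolveObjectStorageForPath: paths under the
// table location keep using the primary storage; anything else would be
// resolved against a (possibly newly created) secondary storage.
#include <iostream>
#include <string>
#include <utility>

std::pair<bool /*use_primary*/, std::string /*key*/> resolveSketch(
    const std::string & table_location, const std::string & absolute_path)
{
    if (absolute_path.starts_with(table_location))
    {
        std::string key = absolute_path.substr(table_location.size());
        if (!key.empty() && key.front() == '/')
            key.erase(key.begin());
        return {true, key};
    }
    // Outside the table location: a secondary storage would be looked up or created.
    return {false, absolute_path};
}

int main()
{
    auto [primary, key] = resolveSketch("s3://bucket/tbl", "s3://bucket/tbl/data/f.parquet");
    std::cout << primary << ' ' << key << '\n';   // 1 data/f.parquet

    auto [primary2, key2] = resolveSketch("s3://bucket/tbl", "s3://other-bucket/deletes/d.parquet");
    std::cout << primary2 << ' ' << key2 << '\n'; // 0 s3://other-bucket/deletes/d.parquet
}
```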
+ const StorageObjectStorageConfigurationWeakPtr configuration; DB::Iceberg::PersistentTableComponents persistent_components; @@ -147,7 +163,7 @@ class IcebergMetadata : public IDataLakeMetadata void updateState(const ContextPtr & local_context, Poco::JSON::Object::Ptr metadata_object) TSA_REQUIRES(mutex); void updateSnapshot(ContextPtr local_context, Poco::JSON::Object::Ptr metadata_object) TSA_REQUIRES(mutex); - void addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Ptr metadata_object) const TSA_REQUIRES(mutex); + void addTableSchemaById(Int32 schema_id, Poco::JSON::Object::Ptr metadata_object, ContextPtr context_) const TSA_REQUIRES(mutex); std::optional getSchemaVersionByFileIfOutdated(String data_path) const TSA_REQUIRES_SHARED(mutex); void initializeSchemasFromManifestList(ContextPtr local_context, ManifestFileCacheKeys manifest_list_ptr) const TSA_REQUIRES(mutex); }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h index d60fa5fdd870..a5f136648f32 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h @@ -31,7 +31,7 @@ namespace DB /// And we can get `ManifestFileContent` from cache by ManifestFileEntry. struct ManifestFileCacheKey { - String manifest_file_path; + String manifest_file_absolute_path; Int64 added_sequence_number; Int64 added_snapshot_id; Iceberg::ManifestFileContentType content_type; @@ -73,7 +73,7 @@ struct IcebergMetadataFilesCacheCell : private boost::noncopyable size_t total_size = 0; for (const auto & entry: manifest_file_cache_keys) { - total_size += sizeof(ManifestFileCacheKey) + entry.manifest_file_path.capacity(); + total_size += sizeof(ManifestFileCacheKey) + entry.manifest_file_absolute_path.capacity(); } return total_size; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp index ab80b64c5e37..9263f1042246 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergWrites.cpp @@ -1249,7 +1249,7 @@ void IcebergStorageSink::consume(Chunk & chunk) format_settings->parquet.filter_push_down = true; } writers[partition_key] = FormatFactory::instance().getOutputFormatParallelIfPossible( - configuration->format, *write_buffers[partition_key], *sample_block, context, format_settings); + configuration->getFormat(), *write_buffers[partition_key], *sample_block, context, format_settings); } writers[partition_key]->write(getHeader().cloneWithColumns(part_chunk.getColumns())); @@ -1368,7 +1368,7 @@ bool IcebergStorageSink::initializeMetadata() statistics.at(partition_key), sample_block, new_snapshot, - configuration->format, + configuration->getFormat(), partititon_spec, partition_spec_id, *buffer_manifest_entry, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp index 98ffcc794fa1..d5c50a6c9994 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.cpp @@ -8,9 +8,9 @@ #include #include -#include #include #include +#include #include #include @@ -154,9 +154,10 @@ ManifestFileContent::ManifestFileContent( const String & path_to_manifest_file_) : path_to_manifest_file(path_to_manifest_file_) { + auto dump_metadata = 
[&]()->String { return manifest_file_deserializer.getMetadataContent(); }; insertRowToLogTable( context, - manifest_file_deserializer.getMetadataContent(), + dump_metadata, DB::IcebergMetadataLogLevel::ManifestFileMetadata, common_path, path_to_manifest_file, @@ -200,7 +201,7 @@ ManifestFileContent::ManifestFileContent( const Poco::JSON::Object::Ptr & schema_object = json.extract(); Int32 manifest_schema_id = schema_object->getValue(f_schema_id); - schema_processor.addIcebergTableSchema(schema_object); + schema_processor.addIcebergTableSchema(schema_object, context); for (size_t i = 0; i != partition_specification->size(); ++i) { @@ -230,9 +231,10 @@ ManifestFileContent::ManifestFileContent( for (size_t i = 0; i < manifest_file_deserializer.rows(); ++i) { + auto dump_row_metadata = [&]()->String { return manifest_file_deserializer.getContent(i); }; insertRowToLogTable( context, - manifest_file_deserializer.getContent(i), + dump_row_metadata, DB::IcebergMetadataLogLevel::ManifestFileEntry, common_path, path_to_manifest_file, @@ -243,7 +245,6 @@ ManifestFileContent::ManifestFileContent( content_type = FileContentType(manifest_file_deserializer.getValueFromRowByName(i, c_data_file_content, TypeIndex::Int32).safeGet()); const auto status = ManifestEntryStatus(manifest_file_deserializer.getValueFromRowByName(i, f_status, TypeIndex::Int32).safeGet()); - if (status == ManifestEntryStatus::DELETED) continue; @@ -286,9 +287,9 @@ ManifestFileContent::ManifestFileContent( } const auto schema_id = schema_id_opt.has_value() ? schema_id_opt.value() : manifest_schema_id; - const auto file_path_key - = manifest_file_deserializer.getValueFromRowByName(i, c_data_file_file_path, TypeIndex::String).safeGet(); - const auto file_path = getProperFilePathFromMetadataInfo(manifest_file_deserializer.getValueFromRowByName(i, c_data_file_file_path, TypeIndex::String).safeGet(), common_path, table_location); + const auto file_path_key_field = manifest_file_deserializer.getValueFromRowByName(i, c_data_file_file_path, TypeIndex::String); + const auto file_path_key = file_path_key_field.safeGet(); + const auto file_path = makeAbsolutePath(table_location, file_path_key); /// NOTE: This is weird, because in manifest file partition looks like this: /// { diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h index a045e976f3c6..ad63ab2be8e3 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/ManifestFile.h @@ -1,6 +1,23 @@ #pragma once #include "config.h" +#include +#include + +#include + +namespace DB::Iceberg +{ + +struct ColumnInfo +{ + std::optional rows_count; + std::optional bytes_size; + std::optional nulls_count; + std::optional hyperrectangle; +}; + +} #if USE_AVRO @@ -9,6 +26,7 @@ #include #include #include +#include #include @@ -38,14 +56,6 @@ enum class ManifestFileContentType String FileContentTypeToString(FileContentType type); -struct ColumnInfo -{ - std::optional rows_count; - std::optional bytes_size; - std::optional nulls_count; - std::optional hyperrectangle; -}; - struct PartitionSpecsEntry { Int32 source_id; @@ -76,6 +86,10 @@ struct ManifestFileEntry String file_format; std::optional reference_data_file_path; // For position delete files only. 
std::optional> equality_ids; + + // Resolved storage and key (set by SingleThreadIcebergKeysIterator) + ObjectStoragePtr storage_to_use; + String resolved_key; }; /** diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp index c7ef9a654903..0bc97042b01e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Mutations.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -114,18 +115,10 @@ std::optional writeDataFiles( Chunk chunk(block.getColumns(), block.rows()); auto partition_result = chunk_partitioner.partitionChunk(chunk); - auto col_data_filename = block.getByName(block_datafile_path); - auto col_position = block.getByName(block_row_number); - - size_t col_data_filename_index = 0; - size_t col_position_index = 0; - for (size_t i = 0; i < block.columns(); ++i) - { - if (block.getNames()[i] == block_datafile_path) - col_data_filename_index = i; - if (block.getNames()[i] == block_row_number) - col_position_index = i; - } + size_t col_data_filename_index = block.getPositionByName(block_datafile_path); + size_t col_position_index = block.getPositionByName(block_row_number); + ColumnWithTypeAndName col_data_filename = block.getByPosition(col_data_filename_index); + ColumnWithTypeAndName col_position = block.getByPosition(col_position_index); for (const auto & [partition_key, partition_chunk] : partition_result) { @@ -152,7 +145,7 @@ std::optional writeDataFiles( column_mapper->setStorageColumnEncoding(std::move(field_ids)); FormatFilterInfoPtr format_filter_info = std::make_shared(nullptr, context, column_mapper); auto output_format = FormatFactory::instance().getOutputFormat( - configuration->format, *write_buffer, delete_file_sample_block, context, format_settings, format_filter_info); + configuration->getFormat(), *write_buffer, delete_file_sample_block, context, format_settings, format_filter_info); write_buffers[partition_key] = std::move(write_buffer); writers[partition_key] = std::move(output_format); @@ -161,21 +154,33 @@ std::optional writeDataFiles( col_data_filename.column = partition_chunk.getColumns()[col_data_filename_index]; col_position.column = partition_chunk.getColumns()[col_position_index]; + if (const ColumnNullable * nullable = typeid_cast(col_position.column.get())) + { + const auto & null_map = nullable->getNullMapData(); + if (std::any_of(null_map.begin(), null_map.end(), [](UInt8 x) { return x != 0; })) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected null _row_number"); + col_position.column = nullable->getNestedColumnPtr(); + } + auto col_data_filename_without_namespaces = ColumnString::create(); for (size_t i = 0; i < col_data_filename.column->size(); ++i) { Field cur_value; col_data_filename.column->get(i, cur_value); + String original_path = cur_value.safeGet(); String path_without_namespace; - if (cur_value.safeGet().starts_with(configuration->getNamespace())) - path_without_namespace = cur_value.safeGet().substr(configuration->getNamespace().size()); - if (!path_without_namespace.starts_with(configuration->getPathForRead().path)) + if (original_path.starts_with(configuration->getNamespace())) + path_without_namespace = original_path.substr(configuration->getNamespace().size()); + else + path_without_namespace = original_path; + + if (!path_without_namespace.empty() && !path_without_namespace.starts_with(configuration->getPathForRead().path)) { if (path_without_namespace.starts_with('/')) 
path_without_namespace = path_without_namespace.substr(1); - else + else if (!path_without_namespace.empty()) path_without_namespace = "/" + path_without_namespace; } col_data_filename_without_namespaces->insert(path_without_namespace); @@ -285,7 +290,7 @@ bool writeMetadataFiles( delete_filenames.statistic.at(partition_key), std::make_shared(getPositionDeleteFileSampleBlock()), new_snapshot, - configuration->format, + configuration->getFormat(), partititon_spec, partition_spec_id, *buffer_manifest_entry, diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp index 1599d93433bb..d97b786d1ff0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.cpp @@ -67,18 +67,22 @@ void IcebergPositionDeleteTransform::initializeDeleteSources() { /// Skip position deletes that do not match the data file path. if (position_deletes_object.reference_data_file_path.has_value() - && position_deletes_object.reference_data_file_path != iceberg_data_path) + && position_deletes_object.reference_data_file_path.value() != iceberg_data_path) continue; - auto object_path = position_deletes_object.file_path; - auto object_metadata = object_storage->getObjectMetadata(object_path); - auto object_info = std::make_shared(object_path, object_metadata); + /// Resolve the position delete file path to get the correct storage and key + /// This handles cases where delete files are outside the table location + auto [delete_storage_to_use, resolved_delete_key] = resolveObjectStorageForPath( + table_location, position_deletes_object.file_path, object_storage, secondary_storages, context); + + auto object_metadata = delete_storage_to_use->getObjectMetadata(resolved_delete_key); + PathWithMetadata delete_file_object(resolved_delete_key, object_metadata, position_deletes_object.file_path, delete_storage_to_use); String format = position_deletes_object.file_format; Block initial_header; { - std::unique_ptr read_buf_schema = createReadBuffer(*object_info, object_storage, context, log); + std::unique_ptr read_buf_schema = createReadBuffer(delete_file_object, delete_storage_to_use, context, log); auto schema_reader = FormatFactory::instance().getSchemaReader(format, *read_buf_schema, context); auto columns_with_names = schema_reader->readSchema(); ColumnsWithTypeAndName initial_header_data; @@ -89,9 +93,9 @@ void IcebergPositionDeleteTransform::initializeDeleteSources() initial_header = Block(initial_header_data); } - CompressionMethod compression_method = chooseCompressionMethod(object_path, "auto"); + CompressionMethod compression_method = chooseCompressionMethod(resolved_delete_key, "auto"); - delete_read_buffers.push_back(createReadBuffer(*object_info, object_storage, context, log)); + delete_read_buffers.push_back(createReadBuffer(delete_file_object, delete_storage_to_use, context, log)); auto syntax_result = TreeRewriter(context).analyze(where_ast, initial_header.getNamesAndTypesList()); ExpressionAnalyzer analyzer(where_ast, syntax_result, context); @@ -138,25 +142,46 @@ void IcebergBitmapPositionDeleteTransform::transform(Chunk & chunk) IColumn::Filter delete_vector(num_rows, true); size_t num_rows_after_filtration = num_rows; - auto chunk_info = chunk.getChunkInfos().get(); + auto chunk_info = chunk.getChunkInfos().get(); if (!chunk_info) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ChunkInfoRowNumOffset does not 
exist"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "ChunkInfoRowNumbers does not exist"); size_t row_num_offset = chunk_info->row_num_offset; - for (size_t i = 0; i < num_rows; i++) + auto & applied_filter = chunk_info->applied_filter; + size_t num_indices = applied_filter.has_value() ? applied_filter->size() : num_rows; + size_t idx_in_chunk = 0; + for (size_t i = 0; i < num_indices; i++) { - size_t row_idx = row_num_offset + i; - if (bitmap.rb_contains(row_idx)) + if (!applied_filter.has_value() || applied_filter.value()[i]) { - delete_vector[i] = false; - num_rows_after_filtration--; + size_t row_idx = row_num_offset + i; + if (bitmap.rb_contains(row_idx)) + { + delete_vector[idx_in_chunk] = false; + + /// If we already have a _row_number-indexed filter vector, update it in place. + if (applied_filter.has_value()) + applied_filter.value()[i] = false; + + num_rows_after_filtration--; + } + idx_in_chunk += 1; } } + chassert(idx_in_chunk == num_rows); + + if (num_rows_after_filtration == num_rows) + return; auto columns = chunk.detachColumns(); for (auto & column : columns) column = column->filter(delete_vector, -1); + /// If it's the first filtering we do on this Chunk (i.e. its _row_number-s were consecutive), + /// assign its applied_filter. + if (!applied_filter.has_value()) + applied_filter.emplace(std::move(delete_vector)); + chunk.setColumns(std::move(columns), num_rows_after_filtration); } @@ -167,10 +192,8 @@ void IcebergBitmapPositionDeleteTransform::initialize() while (auto delete_chunk = delete_source->read()) { int position_index = getColumnIndex(delete_source, IcebergPositionDeleteTransform::positions_column_name); - int filename_index = getColumnIndex(delete_source, IcebergPositionDeleteTransform::data_file_path_column_name); auto position_column = delete_chunk.getColumns()[position_index]; - auto filename_column = delete_chunk.getColumns()[filename_index]; for (size_t i = 0; i < delete_chunk.getNumRows(); ++i) { @@ -223,50 +246,75 @@ void IcebergStreamingPositionDeleteTransform::transform(Chunk & chunk) size_t num_rows = chunk.getNumRows(); IColumn::Filter filter(num_rows, true); size_t num_rows_after_filtration = chunk.getNumRows(); - auto chunk_info = chunk.getChunkInfos().get(); + auto chunk_info = chunk.getChunkInfos().get(); if (!chunk_info) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ChunkInfoRowNumOffset does not exist"); - - size_t total_previous_chunks_size = chunk_info->row_num_offset; - if (previous_chunk_offset && previous_chunk_offset.value() > total_previous_chunks_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ChunkInfoRowNumbers does not exist"); + + size_t num_indices = chunk_info->applied_filter.has_value() ? chunk_info->applied_filter->size() : chunk.getNumRows(); + + /// We get chunks in order of increasing row number because: + /// * this transform should be immediately after the IInputFormat + /// (typically ParquetV3BlockInputFormat) in the pipeline, + /// * IInputFormat outputs chunks in order of row number even if it uses multiple threads + /// internally; for parquet IcebergMetadata::modifyFormatSettings sets + /// `format_settings.parquet.preserve_order = true` to ensure this, other formats return + /// chunks in order by default. 
+ if (previous_chunk_end_offset && previous_chunk_end_offset.value() > chunk_info->row_num_offset) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunks offsets should increase."); - previous_chunk_offset = total_previous_chunks_size; - for (size_t i = 0; i < chunk.getNumRows(); ++i) + previous_chunk_end_offset = chunk_info->row_num_offset + num_indices; + + size_t idx_in_chunk = 0; + for (size_t i = 0; i < num_indices; i++) { - while (!latest_positions.empty()) + if (!chunk_info->applied_filter.has_value() || chunk_info->applied_filter.value()[i]) { - auto it = latest_positions.begin(); - if (it->first < i + total_previous_chunks_size) + size_t row_idx = chunk_info->row_num_offset + i; + + while (!latest_positions.empty()) { - size_t delete_source_index = it->second; - latest_positions.erase(it); - if (iterator_at_latest_chunks[delete_source_index] + 1 >= latest_chunks[delete_source_index].getNumRows() && latest_chunks[delete_source_index].getNumRows() > 0) + auto it = latest_positions.begin(); + if (it->first < row_idx) { - fetchNewChunkFromSource(delete_source_index); + size_t delete_source_index = it->second; + latest_positions.erase(it); + if (iterator_at_latest_chunks[delete_source_index] + 1 >= latest_chunks[delete_source_index].getNumRows() && latest_chunks[delete_source_index].getNumRows() > 0) + { + fetchNewChunkFromSource(delete_source_index); + } + else + { + ++iterator_at_latest_chunks[delete_source_index]; + auto position_index = delete_source_column_indices[delete_source_index].position_index; + size_t next_index_value_in_positional_delete_file = latest_chunks[delete_source_index].getColumns()[position_index]->get64(iterator_at_latest_chunks[delete_source_index]); + latest_positions.insert(std::pair{next_index_value_in_positional_delete_file, delete_source_index}); + } } - else + else if (it->first == row_idx) { - ++iterator_at_latest_chunks[delete_source_index]; - auto position_index = delete_source_column_indices[delete_source_index].position_index; - size_t next_index_value_in_positional_delete_file = latest_chunks[delete_source_index].getColumns()[position_index]->get64(iterator_at_latest_chunks[delete_source_index]); - latest_positions.insert(std::pair{next_index_value_in_positional_delete_file, delete_source_index}); + filter[idx_in_chunk] = false; + + if (chunk_info->applied_filter.has_value()) + chunk_info->applied_filter.value()[i] = false; + + --num_rows_after_filtration; + break; } + else + break; } - else if (it->first == i + total_previous_chunks_size) - { - filter[i] = false; - --num_rows_after_filtration; - break; - } - else - break; + + idx_in_chunk += 1; } } + chassert(idx_in_chunk == chunk.getNumRows()); auto columns = chunk.detachColumns(); for (auto & column : columns) column = column->filter(filter, -1); + if (!chunk_info->applied_filter.has_value()) + chunk_info->applied_filter.emplace(std::move(filter)); + chunk.setColumns(std::move(columns), num_rows_after_filtration); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.h index 1c2122645b06..e03fe400b542 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/PositionDeleteTransform.h @@ -27,13 +27,17 @@ class IcebergPositionDeleteTransform : public ISimpleTransform IcebergDataObjectInfoPtr iceberg_object_info_, ObjectStoragePtr object_storage_, const std::optional & format_settings_, - ContextPtr context_) + ContextPtr 
context_, + const String & table_location_, + SecondaryStorages & secondary_storages_) : ISimpleTransform(header_, header_, false) , header(header_) , iceberg_object_info(iceberg_object_info_) , object_storage(object_storage_) , format_settings(format_settings_) , context(context_) + , table_location(table_location_) + , secondary_storages(secondary_storages_) { initializeDeleteSources(); } @@ -52,6 +56,8 @@ class IcebergPositionDeleteTransform : public ISimpleTransform const ObjectStoragePtr object_storage; const std::optional format_settings; ContextPtr context; + const String table_location; + SecondaryStorages & secondary_storages; /// We need to keep the read buffers alive since the delete_sources depends on them. std::vector> delete_read_buffers; @@ -66,8 +72,10 @@ class IcebergBitmapPositionDeleteTransform : public IcebergPositionDeleteTransfo IcebergDataObjectInfoPtr iceberg_object_info_, ObjectStoragePtr object_storage_, const std::optional & format_settings_, - ContextPtr context_) - : IcebergPositionDeleteTransform(header_, iceberg_object_info_, object_storage_, format_settings_, context_) + ContextPtr context_, + const String & table_location_, + SecondaryStorages & secondary_storages_) + : IcebergPositionDeleteTransform(header_, iceberg_object_info_, object_storage_, format_settings_, context_, table_location_, secondary_storages_) { initialize(); } @@ -82,6 +90,7 @@ class IcebergBitmapPositionDeleteTransform : public IcebergPositionDeleteTransfo }; +/// Requires both the deletes and the input Chunk-s to arrive in order of increasing row number. class IcebergStreamingPositionDeleteTransform : public IcebergPositionDeleteTransform { public: @@ -90,8 +99,10 @@ class IcebergStreamingPositionDeleteTransform : public IcebergPositionDeleteTran IcebergDataObjectInfoPtr iceberg_object_info_, ObjectStoragePtr object_storage_, const std::optional & format_settings_, - ContextPtr context_) - : IcebergPositionDeleteTransform(header_, iceberg_object_info_, object_storage_, format_settings_, context_) + ContextPtr context_, + const String & table_location_, + SecondaryStorages & secondary_storages_) + : IcebergPositionDeleteTransform(header_, iceberg_object_info_, object_storage_, format_settings_, context_, table_location_, secondary_storages_) { initialize(); } @@ -116,7 +127,7 @@ class IcebergStreamingPositionDeleteTransform : public IcebergPositionDeleteTran std::vector iterator_at_latest_chunks; std::set> latest_positions; - std::optional previous_chunk_offset; + std::optional previous_chunk_end_offset; }; } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp index 6d74a15b8120..56a2cd976f6b 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,8 @@ #include #include #include +#include +#include #include @@ -46,6 +49,10 @@ extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; } +namespace Setting +{ +extern const SettingsTimezone iceberg_timezone_for_timestamptz; +} namespace { @@ -144,7 +151,7 @@ namespace Iceberg std::string IcebergSchemaProcessor::default_link{}; -void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr) +void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr, ContextPtr context_) { std::lock_guard lock(mutex); @@ -167,7 +174,7 
@@ void IcebergSchemaProcessor::addIcebergTableSchema(Poco::JSON::Object::Ptr schem auto name = field->getValue(f_name); bool required = field->getValue(f_required); current_full_name = name; - auto type = getFieldType(field, f_type, required, current_full_name, true); + auto type = getFieldType(field, f_type, context_, required, current_full_name, true); clickhouse_schema->push_back(NameAndTypePair{name, type}); clickhouse_types_by_source_ids[{schema_id, field->getValue(f_id)}] = NameAndTypePair{current_full_name, type}; clickhouse_ids_by_source_names[{schema_id, current_full_name}] = field->getValue(f_id); @@ -221,7 +228,7 @@ NamesAndTypesList IcebergSchemaProcessor::tryGetFieldsCharacteristics(Int32 sche return fields; } -DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name) +DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name, ContextPtr context_) { if (type_name == f_boolean) return DataTypeFactory::instance().get("Bool"); @@ -240,7 +247,10 @@ DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name) if (type_name == f_timestamp) return std::make_shared(6); if (type_name == f_timestamptz) - return std::make_shared(6, "UTC"); + { + std::string timezone = context_->getSettingsRef()[Setting::iceberg_timezone_for_timestamptz]; + return std::make_shared(6, timezone); + } if (type_name == f_string || type_name == f_binary) return std::make_shared(); if (type_name == f_uuid) @@ -265,21 +275,25 @@ DataTypePtr IcebergSchemaProcessor::getSimpleType(const String & type_name) } DataTypePtr -IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type, String & current_full_name, bool is_subfield_of_root) +IcebergSchemaProcessor::getComplexTypeFromObject( + const Poco::JSON::Object::Ptr & type, + String & current_full_name, + ContextPtr context_, + bool is_subfield_of_root) { String type_name = type->getValue(f_type); if (type_name == f_list) { bool element_required = type->getValue("element-required"); - auto element_type = getFieldType(type, f_element, element_required); + auto element_type = getFieldType(type, f_element, context_, element_required); return std::make_shared(element_type); } if (type_name == f_map) { - auto key_type = getFieldType(type, f_key, true); + auto key_type = getFieldType(type, f_key, context_, true); auto value_required = type->getValue("value-required"); - auto value_type = getFieldType(type, f_value, value_required); + auto value_type = getFieldType(type, f_value, context_, value_required); return std::make_shared(key_type, value_type); } @@ -303,7 +317,7 @@ IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & (current_full_name += ".").append(element_names.back()); scope_guard guard([&] { current_full_name.resize(current_full_name.size() - element_names.back().size() - 1); }); - element_types.push_back(getFieldType(field, f_type, required, current_full_name, true)); + element_types.push_back(getFieldType(field, f_type, context_, required, current_full_name, true)); TSA_SUPPRESS_WARNING_FOR_WRITE(clickhouse_types_by_source_ids) [{schema_id, field->getValue(f_id)}] = NameAndTypePair{current_full_name, element_types.back()}; @@ -312,7 +326,7 @@ IcebergSchemaProcessor::getComplexTypeFromObject(const Poco::JSON::Object::Ptr & } else { - element_types.push_back(getFieldType(field, f_type, required)); + element_types.push_back(getFieldType(field, f_type, context_, required)); } } @@ -323,16 +337,21 @@ IcebergSchemaProcessor::getComplexTypeFromObject(const 
Poco::JSON::Object::Ptr & } DataTypePtr IcebergSchemaProcessor::getFieldType( - const Poco::JSON::Object::Ptr & field, const String & type_key, bool required, String & current_full_name, bool is_subfield_of_root) + const Poco::JSON::Object::Ptr & field, + const String & type_key, + ContextPtr context_, + bool required, + String & current_full_name, + bool is_subfield_of_root) { if (field->isObject(type_key)) - return getComplexTypeFromObject(field->getObject(type_key), current_full_name, is_subfield_of_root); + return getComplexTypeFromObject(field->getObject(type_key), current_full_name, context_, is_subfield_of_root); auto type = field->get(type_key); if (type.isString()) { const String & type_name = type.extract(); - auto data_type = getSimpleType(type_name); + auto data_type = getSimpleType(type_name, context_); return required ? data_type : makeNullable(data_type); } @@ -362,7 +381,11 @@ bool IcebergSchemaProcessor::allowPrimitiveTypeConversion(const String & old_typ // Ids are passed only for error logging purposes std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( - const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id) + const Poco::JSON::Object::Ptr & old_schema, + const Poco::JSON::Object::Ptr & new_schema, + ContextPtr context_, + Int32 old_id, + Int32 new_id) { std::unordered_map> old_schema_entries; auto old_schema_fields = old_schema->get(f_fields).extract(); @@ -374,7 +397,7 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( size_t id = field->getValue(f_id); auto name = field->getValue(f_name); bool required = field->getValue(f_required); - old_schema_entries[id] = {field, &dag->addInput(name, getFieldType(field, f_type, required))}; + old_schema_entries[id] = {field, &dag->addInput(name, getFieldType(field, f_type, context_, required))}; } auto new_schema_fields = new_schema->get(f_fields).extract(); for (size_t i = 0; i != new_schema_fields->size(); ++i) @@ -383,7 +406,7 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( size_t id = field->getValue(f_id); auto name = field->getValue(f_name); bool required = field->getValue(f_required); - auto type = getFieldType(field, f_type, required); + auto type = getFieldType(field, f_type, context_, required); auto old_node_it = old_schema_entries.find(id); if (old_node_it != old_schema_entries.end()) { @@ -393,7 +416,7 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( || field->getObject(f_type)->getValue(f_type) == "list" || field->getObject(f_type)->getValue(f_type) == "map")) { - auto old_type = getFieldType(old_json, "type", required); + auto old_type = getFieldType(old_json, "type", context_, required); auto transform = std::make_shared(std::vector{type}, std::vector{old_type}, old_json, field); old_node = &dag->addFunction(transform, std::vector{old_node}, name); @@ -423,7 +446,7 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( } else if (allowPrimitiveTypeConversion(old_type, new_type)) { - node = &dag->addCast(*old_node, getFieldType(field, f_type, required), name); + node = &dag->addCast(*old_node, getFieldType(field, f_type, context_, required), name); } outputs.push_back(node); } @@ -449,7 +472,10 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDag( return dag; } -std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id) +std::shared_ptr IcebergSchemaProcessor::getSchemaTransformationDagByIds( + 
ContextPtr context_, + Int32 old_id, + Int32 new_id) { if (old_id == new_id) return nullptr; @@ -468,7 +494,7 @@ std::shared_ptr IcebergSchemaProcessor::getSchemaTransformatio throw Exception(ErrorCodes::BAD_ARGUMENTS, "Schema with schema-id {} is unknown", new_id); return transform_dags_by_ids[{old_id, new_id}] - = getSchemaTransformationDag(old_schema_it->second, new_schema_it->second, old_id, new_id); + = getSchemaTransformationDag(old_schema_it->second, new_schema_it->second, context_, old_id, new_id); } Poco::JSON::Object::Ptr IcebergSchemaProcessor::getIcebergTableSchemaById(Int32 id) const diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h index 0fc42cccd266..b4a7a8d6fdeb 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h @@ -75,16 +75,18 @@ ColumnMapperPtr createColumnMapper(Poco::JSON::Object::Ptr schema_object); * } * } */ -class IcebergSchemaProcessor +class IcebergSchemaProcessor : private WithContext { static std::string default_link; using Node = ActionsDAG::Node; public: - void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr); + explicit IcebergSchemaProcessor(ContextPtr context_) : WithContext(context_) {} + + void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr, ContextPtr context_); std::shared_ptr getClickhouseTableSchemaById(Int32 id); - std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id); + std::shared_ptr getSchemaTransformationDagByIds(ContextPtr context_, Int32 old_id, Int32 new_id); NameAndTypePair getFieldCharacteristics(Int32 schema_version, Int32 source_id) const; std::optional tryGetFieldCharacteristics(Int32 schema_version, Int32 source_id) const; NamesAndTypesList tryGetFieldsCharacteristics(Int32 schema_id, const std::vector & source_ids) const; @@ -92,7 +94,7 @@ class IcebergSchemaProcessor Poco::JSON::Object::Ptr getIcebergTableSchemaById(Int32 id) const; bool hasClickhouseTableSchemaById(Int32 id) const; - static DataTypePtr getSimpleType(const String & type_name); + static DataTypePtr getSimpleType(const String & type_name, ContextPtr context_); static std::unordered_map traverseSchema(Poco::JSON::Array::Ptr schema); @@ -112,10 +114,15 @@ class IcebergSchemaProcessor std::unordered_map schema_id_by_snapshot TSA_GUARDED_BY(mutex); NamesAndTypesList getSchemaType(const Poco::JSON::Object::Ptr & schema); - DataTypePtr getComplexTypeFromObject(const Poco::JSON::Object::Ptr & type, String & current_full_name, bool is_subfield_of_root); + DataTypePtr getComplexTypeFromObject( + const Poco::JSON::Object::Ptr & type, + String & current_full_name, + ContextPtr context_, + bool is_subfield_of_root); DataTypePtr getFieldType( const Poco::JSON::Object::Ptr & field, const String & type_key, + ContextPtr context_, bool required, String & current_full_name = default_link, bool is_subfield_of_root = false); @@ -124,7 +131,11 @@ class IcebergSchemaProcessor const Node * getDefaultNodeForField(const Poco::JSON::Object::Ptr & field); std::shared_ptr getSchemaTransformationDag( - const Poco::JSON::Object::Ptr & old_schema, const Poco::JSON::Object::Ptr & new_schema, Int32 old_id, Int32 new_id); + const Poco::JSON::Object::Ptr & old_schema, + const Poco::JSON::Object::Ptr & new_schema, + ContextPtr context_, + Int32 old_id, + Int32 new_id); mutable SharedMutex mutex; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h 
b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h index ec1553b65212..f88838168639 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Snapshot.h @@ -17,6 +17,8 @@ struct IcebergDataSnapshot std::optional total_rows; std::optional total_bytes; std::optional total_position_delete_rows; + std::optional partition_key; + std::optional sorting_key; std::optional getTotalRows() const { @@ -44,6 +46,7 @@ struct IcebergHistoryRecord Int64 parent_id; bool is_current_ancestor; String manifest_list_path; + String manifest_list_absolute_path; Int32 added_files = 0; Int32 added_records = 0; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp index 2f76d0a0bb03..892503d126b0 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.cpp @@ -73,9 +73,10 @@ Iceberg::ManifestFilePtr getManifestFile( const PersistentTableComponents & persistent_table_components, ContextPtr local_context, LoggerPtr log, - const String & filename, + const String & absolute_path, Int64 inherited_sequence_number, - Int64 inherited_snapshot_id) + Int64 inherited_snapshot_id, + SecondaryStorages & secondary_storages) { auto log_level = local_context->getSettingsRef()[Setting::iceberg_metadata_log_level].value; @@ -84,19 +85,22 @@ Iceberg::ManifestFilePtr getManifestFile( auto create_fn = [&, use_iceberg_metadata_cache]() { - RelativePathWithMetadata manifest_object_info(filename); + auto [storage_to_use, resolved_key_in_storage] = resolveObjectStorageForPath( + persistent_table_components.table_location, absolute_path, object_storage, secondary_storages, local_context); + + PathWithMetadata manifest_object_info(resolved_key_in_storage, std::nullopt, absolute_path, storage_to_use); auto read_settings = local_context->getReadSettings(); /// Do not utilize filesystem cache if more precise cache enabled if (use_iceberg_metadata_cache) read_settings.enable_filesystem_cache = false; - auto buffer = createReadBuffer(manifest_object_info, object_storage, local_context, log, read_settings); - Iceberg::AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), filename, getFormatSettings(local_context)); + auto buffer = createReadBuffer(manifest_object_info, storage_to_use, local_context, log, read_settings); + Iceberg::AvroForIcebergDeserializer manifest_file_deserializer(std::move(buffer), resolved_key_in_storage, getFormatSettings(local_context)); return std::make_shared( manifest_file_deserializer, - filename, + resolved_key_in_storage, persistent_table_components.format_version, configuration->getPathForRead().path, *persistent_table_components.schema_processor, @@ -104,13 +108,13 @@ Iceberg::ManifestFilePtr getManifestFile( inherited_snapshot_id, persistent_table_components.table_location, local_context, - filename); + absolute_path); }; if (use_iceberg_metadata_cache) { auto manifest_file = persistent_table_components.metadata_cache->getOrSetManifestFile( - IcebergMetadataFilesCache::getKey(configuration, filename), create_fn); + IcebergMetadataFilesCache::getKey(configuration, absolute_path), create_fn); return manifest_file; } return create_fn(); @@ -121,7 +125,8 @@ ManifestFileCacheKeys getManifestList( StorageObjectStorageConfigurationWeakPtr configuration, const PersistentTableComponents & 
persistent_table_components, ContextPtr local_context, - const String & filename, + const String & key_in_storage, + const String & absolute_path, LoggerPtr log) { auto configuration_ptr = configuration.lock(); @@ -135,7 +140,7 @@ ManifestFileCacheKeys getManifestList( auto create_fn = [&, use_iceberg_metadata_cache]() { - StorageObjectStorage::ObjectInfo object_info(filename); + PathWithMetadata object_info(key_in_storage, std::nullopt, absolute_path, object_storage); auto read_settings = local_context->getReadSettings(); /// Do not utilize filesystem cache if more precise cache enabled @@ -143,16 +148,17 @@ ManifestFileCacheKeys getManifestList( read_settings.enable_filesystem_cache = false; auto manifest_list_buf = createReadBuffer(object_info, object_storage, local_context, log, read_settings); - AvroForIcebergDeserializer manifest_list_deserializer(std::move(manifest_list_buf), filename, getFormatSettings(local_context)); + AvroForIcebergDeserializer manifest_list_deserializer(std::move(manifest_list_buf), key_in_storage, getFormatSettings(local_context)); ManifestFileCacheKeys manifest_file_cache_keys; + auto dump_metadata = [&]()->String { return manifest_list_deserializer.getMetadataContent(); }; insertRowToLogTable( local_context, - manifest_list_deserializer.getMetadataContent(), + dump_metadata, DB::IcebergMetadataLogLevel::ManifestListMetadata, configuration_ptr->getRawPath().path, - filename, + key_in_storage, std::nullopt, std::nullopt); @@ -160,8 +166,7 @@ ManifestFileCacheKeys getManifestList( { const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, f_manifest_path, TypeIndex::String).safeGet(); - const auto manifest_file_name = getProperFilePathFromMetadataInfo( - file_path, configuration_ptr->getPathForRead().path, persistent_table_components.table_location); + const auto manifest_absolute_path = makeAbsolutePath(persistent_table_components.table_location, file_path); Int64 added_sequence_number = 0; auto added_snapshot_id = manifest_list_deserializer.getValueFromRowByName(i, f_added_snapshot_id); if (added_snapshot_id.isNull()) @@ -180,14 +185,15 @@ ManifestFileCacheKeys getManifestList( manifest_list_deserializer.getValueFromRowByName(i, f_content, TypeIndex::Int32).safeGet()); } manifest_file_cache_keys.emplace_back( - manifest_file_name, added_sequence_number, added_snapshot_id.safeGet(), content_type); + manifest_absolute_path, added_sequence_number, added_snapshot_id.safeGet(), content_type); + auto dump_row_metadata = [&]()->String { return manifest_list_deserializer.getContent(i); }; insertRowToLogTable( local_context, - manifest_list_deserializer.getContent(i), + dump_row_metadata, DB::IcebergMetadataLogLevel::ManifestListEntry, configuration_ptr->getRawPath().path, - filename, + absolute_path, i, std::nullopt); } @@ -200,7 +206,7 @@ ManifestFileCacheKeys getManifestList( ManifestFileCacheKeys manifest_file_cache_keys; if (use_iceberg_metadata_cache) manifest_file_cache_keys = persistent_table_components.metadata_cache->getOrSetManifestFileCacheKeys( - IcebergMetadataFilesCache::getKey(configuration_ptr, filename), create_fn); + IcebergMetadataFilesCache::getKey(configuration_ptr, absolute_path), create_fn); else manifest_file_cache_keys = create_fn(); return manifest_file_cache_keys; diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h index 432409678312..26f17cbd2231 100644 --- 
a/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/StatelessMetadataFileGetter.h @@ -19,6 +19,7 @@ #include #include +#include namespace DB::Iceberg { @@ -29,9 +30,10 @@ Iceberg::ManifestFilePtr getManifestFile( const PersistentTableComponents & persistent_table_components, ContextPtr local_context, LoggerPtr log, - const String & filename, + const String & absolute_path, Int64 inherited_sequence_number, - Int64 inherited_snapshot_id); + Int64 inherited_snapshot_id, + SecondaryStorages & secondary_storages); ManifestFileCacheKeys getManifestList( @@ -39,7 +41,8 @@ ManifestFileCacheKeys getManifestList( StorageObjectStorageConfigurationWeakPtr configuration, const PersistentTableComponents & persistent_table_components, ContextPtr local_context, - const String & filename, + const String & key_in_storage, + const String & absolute_path, LoggerPtr log); std::pair parseTableSchemaV1Method(const Poco::JSON::Object::Ptr & metadata_object); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp index 8cc23f59d729..5b8a657fc997 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.cpp @@ -40,6 +40,8 @@ #include #include +#include + using namespace DB; @@ -65,6 +67,8 @@ namespace DB::DataLakeStorageSetting namespace ProfileEvents { extern const Event IcebergVersionHintUsed; + extern const Event IcebergJsonFileParsing; + extern const Event IcebergJsonFileParsingMicroseconds; } namespace DB::Setting @@ -74,7 +78,6 @@ namespace DB::Setting namespace DB::Iceberg { - using namespace DB; void writeMessageToFile( @@ -163,78 +166,6 @@ std::optional parseTransformAndArgument(const String & tra return std::nullopt; } -// This function is used to get the file path inside the directory which corresponds to iceberg table from the full blob path which is written in manifest and metadata files. -// For example, if the full blob path is s3://bucket/table_name/data/00000-1-1234567890.avro, the function will return table_name/data/00000-1-1234567890.avro -// Common path should end with "" or "/". -std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location) -{ - auto trim_backward_slash = [](std::string_view str) -> std::string_view - { - if (str.ends_with('/')) - { - return str.substr(0, str.size() - 1); - } - return str; - }; - auto trim_forward_slash = [](std::string_view str) -> std::string_view - { - if (str.starts_with('/')) - { - return str.substr(1); - } - return str; - }; - common_path = trim_backward_slash(common_path); - table_location = trim_backward_slash(table_location); - - if (data_path.starts_with(table_location) && table_location.ends_with(common_path)) - { - return std::filesystem::path{common_path} / trim_forward_slash(data_path.substr(table_location.size())); - } - - - auto pos = data_path.find(common_path); - /// Valid situation when data and metadata files are stored in different directories. 
- if (pos == std::string::npos) - { - /// connection://bucket - auto prefix = table_location.substr(0, table_location.size() - common_path.size()); - return std::string{data_path.substr(prefix.size())}; - } - - size_t good_pos = std::string::npos; - while (pos != std::string::npos) - { - auto potential_position = pos + common_path.size(); - if ((std::string_view(data_path.data() + potential_position, 6) == "/data/") - || (std::string_view(data_path.data() + potential_position, 10) == "/metadata/")) - { - good_pos = pos; - break; - } - size_t new_pos = data_path.find(common_path, pos + 1); - if (new_pos == std::string::npos) - { - break; - } - pos = new_pos; - } - - - if (good_pos != std::string::npos) - { - return std::string{data_path.substr(good_pos)}; - } - else if (pos != std::string::npos) - { - return std::string{data_path.substr(pos)}; - } - else - { - throw ::DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Expected to find '{}' in data path: '{}'", common_path, data_path); - } -} - enum class MostRecentMetadataFileSelectionWay { BY_LAST_UPDATED_MS_FIELD, @@ -293,6 +224,9 @@ Poco::JSON::Object::Ptr getMetadataJSONObject( return json_str; }; + ProfileEvents::increment(ProfileEvents::IcebergJsonFileParsing); + ProfileEventTimeIncrement watch(ProfileEvents::IcebergJsonFileParsingMicroseconds); + String metadata_json_str; if (cache_ptr) metadata_json_str = cache_ptr->getOrSetTableMetadata(IcebergMetadataFilesCache::getKey(configuration_ptr, metadata_file_path), create_fn); diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h index 52c2b19b049d..a1b559b05810 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/Utils.h @@ -1,7 +1,10 @@ #pragma once + +#include #include "config.h" +#include #include #include @@ -31,8 +34,6 @@ void writeMessageToFile( std::function cleanup, DB::CompressionMethod compression_method = DB::CompressionMethod::None); -std::string getProperFilePathFromMetadataInfo(std::string_view data_path, std::string_view common_path, std::string_view table_location); - struct TransformAndArgument { String transform_name; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index 5fa9c3a80fd8..dec6cf52ff8e 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -92,23 +92,23 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit if (args.size() > 1) { - format = checkAndGetLiteralArgument(args[1], "format_name"); + setFormat(checkAndGetLiteralArgument(args[1], "format_name")); } if (with_structure) { if (args.size() > 2) { - structure = checkAndGetLiteralArgument(args[2], "structure"); + setStructure(checkAndGetLiteralArgument(args[2], "structure")); } if (args.size() > 3) { - compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); + setCompressionMethod(checkAndGetLiteralArgument(args[3], "compression_method")); } } else if (args.size() > 2) { - compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); + setCompressionMethod(checkAndGetLiteralArgument(args[2], "compression_method")); } setURL(url_str); @@ -124,10 +124,10 @@ void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & colle else url_str = collection.get("url"); - format = collection.getOrDefault("format", "auto"); - compression_method = 
collection.getOrDefault("compression_method", - collection.getOrDefault("compression", "auto")); - structure = collection.getOrDefault("structure", "auto"); + setFormat(collection.getOrDefault("format", "auto")); + setCompressionMethod(collection.getOrDefault("compression_method", + collection.getOrDefault("compression", "auto"))); + setStructure(collection.getOrDefault("structure", "auto")); setURL(url_str); } @@ -217,6 +217,13 @@ void StorageHDFSConfiguration::addStructureAndFormatToArgsIfNeeded( } } +ASTPtr StorageHDFSConfiguration::createArgsWithAccessData() const +{ + auto arguments = std::make_shared(); + arguments->children.push_back(std::make_shared(url + path.path)); + return arguments; +} + } #endif diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 27ede8512988..7bc1dcbe2a44 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -69,6 +69,8 @@ class StorageHDFSConfiguration : public StorageObjectStorageConfiguration ContextPtr context, bool with_structure) override; + ASTPtr createArgsWithAccessData() const override; + private: void fromNamedCollection(const NamedCollection &, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; diff --git a/src/Storages/ObjectStorage/IObjectIterator.h b/src/Storages/ObjectStorage/IObjectIterator.h index 76358ea44dfc..126abd181910 100644 --- a/src/Storages/ObjectStorage/IObjectIterator.h +++ b/src/Storages/ObjectStorage/IObjectIterator.h @@ -5,8 +5,8 @@ namespace DB { -using ObjectInfo = RelativePathWithMetadata; -using ObjectInfoPtr = std::shared_ptr; +using ObjectInfo = PathWithMetadata; +using ObjectInfoPtr = std::shared_ptr; class ExpressionActions; struct IObjectIterator diff --git a/src/Storages/ObjectStorage/Local/Configuration.cpp b/src/Storages/ObjectStorage/Local/Configuration.cpp index 870c3c383c0d..58eeac06b0e2 100644 --- a/src/Storages/ObjectStorage/Local/Configuration.cpp +++ b/src/Storages/ObjectStorage/Local/Configuration.cpp @@ -24,9 +24,9 @@ extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; void StorageLocalConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr) { path = collection.get("path"); - format = collection.getOrDefault("format", "auto"); - compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); - structure = collection.getOrDefault("structure", "auto"); + setFormat(collection.getOrDefault("format", "auto")); + setCompressionMethod(collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto"))); + setStructure(collection.getOrDefault("structure", "auto")); paths = {path}; } @@ -46,23 +46,23 @@ void StorageLocalConfiguration::fromAST(ASTs & args, ContextPtr context, bool wi if (args.size() > 1) { - format = checkAndGetLiteralArgument(args[1], "format_name"); + setFormat(checkAndGetLiteralArgument(args[1], "format_name")); } if (with_structure) { if (args.size() > 2) { - structure = checkAndGetLiteralArgument(args[2], "structure"); + setStructure(checkAndGetLiteralArgument(args[2], "structure")); } if (args.size() > 3) { - compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); + setCompressionMethod(checkAndGetLiteralArgument(args[3], "compression_method")); } } else if (args.size() > 2) { - compression_method = checkAndGetLiteralArgument(args[2], "compression_method"); + 
setCompressionMethod(checkAndGetLiteralArgument(args[2], "compression_method")); } paths = {path}; } @@ -81,4 +81,20 @@ StorageObjectStorageQuerySettings StorageLocalConfiguration::getQuerySettings(co .ignore_non_existent_file = false}; } +ASTPtr StorageLocalConfiguration::createArgsWithAccessData() const +{ + auto arguments = std::make_shared(); + + arguments->children.push_back(std::make_shared(path.path)); + if (getFormat() != "auto") + arguments->children.push_back(std::make_shared(getFormat())); + if (getStructure() != "auto") + arguments->children.push_back(std::make_shared(getStructure())); + if (getCompressionMethod() != "auto") + arguments->children.push_back(std::make_shared(getCompressionMethod())); + + return arguments; +} + + } diff --git a/src/Storages/ObjectStorage/Local/Configuration.h b/src/Storages/ObjectStorage/Local/Configuration.h index 231e33f84d35..207d297147c1 100644 --- a/src/Storages/ObjectStorage/Local/Configuration.h +++ b/src/Storages/ObjectStorage/Local/Configuration.h @@ -60,6 +60,8 @@ class StorageLocalConfiguration : public StorageObjectStorageConfiguration void addStructureAndFormatToArgsIfNeeded(ASTs &, const String &, const String &, ContextPtr, bool) override { } + ASTPtr createArgsWithAccessData() const override; + private: void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; diff --git a/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h b/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h new file mode 100644 index 000000000000..a1f21dc502d5 --- /dev/null +++ b/src/Storages/ObjectStorage/ObjectStorageFilePathGenerator.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + struct ObjectStorageFilePathGenerator + { + virtual ~ObjectStorageFilePathGenerator() = default; + std::string getPathForWrite(const std::string & partition_id) const { + return getPathForWrite(partition_id, ""); + } + virtual std::string getPathForWrite(const std::string & partition_id, const std::string & /* file_name_override */) const = 0; + virtual std::string getPathForRead() const = 0; + }; + + struct ObjectStorageWildcardFilePathGenerator : ObjectStorageFilePathGenerator + { + static constexpr const char * FILE_WILDCARD = "{_file}"; + explicit ObjectStorageWildcardFilePathGenerator(const std::string & raw_path_) : raw_path(raw_path_) {} + + using ObjectStorageFilePathGenerator::getPathForWrite; // Bring base class overloads into scope + std::string getPathForWrite(const std::string & partition_id, const std::string & file_name_override) const override + { + const auto partition_replaced_path = PartitionedSink::replaceWildcards(raw_path, partition_id); + const auto final_path = boost::replace_all_copy(partition_replaced_path, FILE_WILDCARD, file_name_override); + return final_path; + } + + std::string getPathForRead() const override + { + return raw_path; + } + + private: + std::string raw_path; + + }; + + struct ObjectStorageAppendFilePathGenerator : ObjectStorageFilePathGenerator + { + explicit ObjectStorageAppendFilePathGenerator( + const std::string & raw_path_, + const std::string & file_format_) + : raw_path(raw_path_), file_format(Poco::toLower(file_format_)){} + + using ObjectStorageFilePathGenerator::getPathForWrite; // Bring base class overloads into scope + std::string getPathForWrite(const std::string & partition_id, const std::string & file_name_override) const override + { + 
std::string result; + + result += raw_path; + + if (!result.empty() && result.back() != '/') + { + result += "/"; + } + + /// Not adding '/' because buildExpressionHive() always adds a trailing '/' + result += partition_id; + + const auto file_name = file_name_override.empty() ? std::to_string(generateSnowflakeID()) : file_name_override; + + result += file_name + "." + file_format; + + return result; + } + + std::string getPathForRead() const override + { + return raw_path + "**." + file_format; + } + + private: + std::string raw_path; + std::string file_format; + }; + +} diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 31089036abe0..131b73b343ae 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -40,8 +40,8 @@ ReadBufferIterator::ReadBufferIterator( , read_keys(read_keys_) , prev_read_keys_size(read_keys_.size()) { - if (configuration->format != "auto") - format = configuration->format; + if (configuration->getFormat() != "auto") + format = configuration->getFormat(); } SchemaCache::Key ReadBufferIterator::getKeyForSchemaCache(const ObjectInfo & object_info, const String & format_name) const @@ -76,10 +76,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - if (!object_info->metadata) - object_info->metadata = object_storage->tryGetObjectMetadata(path); - + object_info->loadMetadata(object_storage); return object_info->metadata ? std::optional(object_info->metadata->last_modified.epochTime()) : std::nullopt; @@ -151,10 +148,9 @@ std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { auto context = getContext(); - const auto & path = current_object_info->isArchive() ? 
current_object_info->getPathToArchive() : current_object_info->getPath(); auto impl = createReadBuffer(*current_object_info, object_storage, context, getLogger("ReadBufferIterator")); - const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); + const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->getCompressionMethod()); const auto zstd_window = static_cast(context->getSettingsRef()[Setting::zstd_window_log_max]); return wrapReadBufferWithCompressionMethod(std::move(impl), compression_method, zstd_window); @@ -250,6 +246,8 @@ ReadBufferIterator::Data ReadBufferIterator::next() prev_read_keys_size = read_keys.size(); } + current_object_info->loadMetadata(object_storage); + if (query_settings.skip_empty_files && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) continue; @@ -270,13 +268,13 @@ ReadBufferIterator::Data ReadBufferIterator::next() using ObjectInfoInArchive = StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive; if (const auto * object_info_in_archive = dynamic_cast(current_object_info.get())) { - compression_method = chooseCompressionMethod(filename, configuration->compression_method); + compression_method = chooseCompressionMethod(filename, configuration->getCompressionMethod()); const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else { - compression_method = chooseCompressionMethod(filename, configuration->compression_method); + compression_method = chooseCompressionMethod(filename, configuration->getCompressionMethod()); read_buf = createReadBuffer(*current_object_info, object_storage, getContext(), getLogger("ReadBufferIterator")); } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 302c34614f6e..1c4edefca767 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -92,6 +92,7 @@ static const std::unordered_set optional_configuration_keys = "no_sign_request", "partition_strategy", "partition_columns_in_data_file", + "storage_type", /// Private configuration options "role_arn", /// for extra_credentials "role_session_name", /// for extra_credentials @@ -205,10 +206,10 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); + setPartitionStrategyType(partition_strategy_type_opt.value()); } - partition_columns_in_data_file = collection.getOrDefault("partition_columns_in_data_file", partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE); + setPartitionColumnsInDataFile(collection.getOrDefault("partition_columns_in_data_file", getPartitionStrategyType() != PartitionStrategyFactory::StrategyType::HIVE)); s3_settings->auth_settings[S3AuthSetting::role_arn] = collection.getOrDefault("role_arn", ""); s3_settings->auth_settings[S3AuthSetting::role_session_name] = collection.getOrDefault("role_session_name", ""); @@ -217,9 +218,9 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect s3_settings->auth_settings[S3AuthSetting::metadata_service] = collection.getOrDefault("metadata_service", ""); 
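The `partition_columns_in_data_file` default set just above falls out of the chosen partition strategy: under hive-style partitioning the partition values already live in the object path, so they are omitted from the data file unless explicitly requested. Below is a minimal standalone sketch of that default rule; the enum and function names are simplified stand-ins, not the actual ClickHouse declarations.

```cpp
#include <iostream>

// Simplified stand-in for PartitionStrategyFactory::StrategyType.
enum class StrategyType { WILDCARD, HIVE };

// Mirrors the fallback used above: partition columns are kept in the data file
// for every strategy except HIVE, where they are encoded in the path instead.
static bool defaultPartitionColumnsInDataFile(StrategyType strategy)
{
    return strategy != StrategyType::HIVE;
}

int main()
{
    std::cout << std::boolalpha
              << defaultPartitionColumnsInDataFile(StrategyType::HIVE) << '\n'       // false
              << defaultPartitionColumnsInDataFile(StrategyType::WILDCARD) << '\n';  // true
}
```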
s3_settings->auth_settings[S3AuthSetting::request_token_path] = collection.getOrDefault("request_token_path", ""); - format = collection.getOrDefault("format", format); - compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); - structure = collection.getOrDefault("structure", "auto"); + setFormat(collection.getOrDefault("format", getFormat())); + setCompressionMethod(collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto"))); + setStructure(collection.getOrDefault("structure", "auto")); s3_settings->request_settings = S3::S3RequestSettings(collection, settings, /* validate_settings */true); @@ -619,25 +620,26 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (auto format_value = getFromPositionOrKeyValue("format", args, engine_args_to_idx, key_value_args); format_value.has_value()) { - format = format_value.value(); + auto format_ = format_value.value(); /// Set format to configuration only of it's not 'auto', /// because we can have default format set in configuration. - if (format != "auto") - format = format; + if (format_ != "auto") + setFormat(format_); } if (auto structure_value = getFromPositionOrKeyValue("structure", args, engine_args_to_idx, key_value_args); structure_value.has_value()) { - structure = structure_value.value(); + setStructure(structure_value.value()); } if (auto compression_method_value = getFromPositionOrKeyValue("compression_method", args, engine_args_to_idx, key_value_args); compression_method_value.has_value()) { - compression_method = compression_method_value.value(); + setCompressionMethod(compression_method_value.value()); } + if (auto partition_strategy_value = getFromPositionOrKeyValue("partition_strategy", args, engine_args_to_idx, key_value_args); partition_strategy_value.has_value()) { @@ -649,16 +651,16 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); } - partition_strategy_type = partition_strategy_type_opt.value(); + setPartitionStrategyType(partition_strategy_type_opt.value()); } if (auto partition_columns_in_data_file_value = getFromPositionOrKeyValue("partition_columns_in_data_file", args, engine_args_to_idx, key_value_args); partition_columns_in_data_file_value.has_value()) { - partition_columns_in_data_file = partition_columns_in_data_file_value.value(); + setPartitionColumnsInDataFile(partition_columns_in_data_file_value.value()); } else - partition_columns_in_data_file = partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE; + setPartitionColumnsInDataFile(getPartitionStrategyType() != PartitionStrategyFactory::StrategyType::HIVE); if (auto access_key_id_value = getFromPositionOrKeyValue("access_key_id", args, engine_args_to_idx, key_value_args); access_key_id_value.has_value()) @@ -969,6 +971,30 @@ void StorageS3Configuration::addStructureAndFormatToArgsIfNeeded( } } +ASTPtr StorageS3Configuration::createArgsWithAccessData() const +{ + auto arguments = std::make_shared(); + + arguments->children.push_back(std::make_shared(url.uri_str)); + if (s3_settings->auth_settings[S3AuthSetting::no_sign_request]) + { + arguments->children.push_back(std::make_shared("NOSIGN")); + } + else + { + arguments->children.push_back(std::make_shared(s3_settings->auth_settings[S3AuthSetting::access_key_id].value)); + 
arguments->children.push_back(std::make_shared(s3_settings->auth_settings[S3AuthSetting::secret_access_key].value)); + if (!s3_settings->auth_settings[S3AuthSetting::session_token].value.empty()) + arguments->children.push_back(std::make_shared(s3_settings->auth_settings[S3AuthSetting::session_token].value)); + if (getFormat() != "auto") + arguments->children.push_back(std::make_shared(getFormat())); + if (!getCompressionMethod().empty()) + arguments->children.push_back(std::make_shared(getCompressionMethod())); + } + + return arguments; +} + } #endif diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index b53810daa1fb..438c9d487751 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -100,6 +100,8 @@ class StorageS3Configuration : public StorageObjectStorageConfiguration ContextPtr context, bool with_structure) override; + ASTPtr createArgsWithAccessData() const override; + static ASTPtr extractExtraCredentials(ASTs & args); static bool collectCredentials(ASTPtr maybe_credentials, S3::S3AuthSettings & auth_settings_, ContextPtr local_context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 0d333b9a9713..597fea59cd21 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -1,6 +1,8 @@ +#include #include #include #include +#include #include #include @@ -32,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -55,6 +56,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; extern const int INCORRECT_DATA; + extern const int FILE_ALREADY_EXISTS; } String StorageObjectStorage::getPathSample(ContextPtr context) @@ -68,6 +70,11 @@ String StorageObjectStorage::getPathSample(ContextPtr context) if (context->getSettingsRef()[Setting::use_hive_partitioning]) local_distributed_processing = false; + const auto path = configuration->getRawPath(); + + if (!configuration->isArchive() && !path.hasGlobs() && !local_distributed_processing) + return path.path; + auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, query_settings, @@ -82,11 +89,6 @@ String StorageObjectStorage::getPathSample(ContextPtr context) {} // file_progress_callback ); - const auto path = configuration->getRawPath(); - - if (!configuration->isArchive() && !path.hasGlobs() && !local_distributed_processing) - return path.path; - if (auto file = file_iterator->next(0)) return file->getPath(); return ""; @@ -103,12 +105,13 @@ StorageObjectStorage::StorageObjectStorage( std::optional format_settings_, LoadingStrictnessLevel mode, std::shared_ptr catalog_, - bool if_not_exists_, - bool is_datalake_query, + bool /*if_not_exists_*/, + bool /*is_datalake_query*/, bool distributed_processing_, ASTPtr partition_by_, bool is_table_function, - bool lazy_init) + bool lazy_init, + std::optional sample_path_) : IStorage(table_id_) , configuration(configuration_) , object_storage(object_storage_) @@ -120,25 +123,12 @@ StorageObjectStorage::StorageObjectStorage( { configuration->initPartitionStrategy(partition_by_, columns_in_table_or_function_definition, context); - const bool need_resolve_columns_or_format = columns_in_table_or_function_definition.empty() || (configuration->format == "auto"); + const bool need_resolve_columns_or_format = columns_in_table_or_function_definition.empty() || 
(configuration->getFormat() == "auto"); const bool need_resolve_sample_path = context->getSettingsRef()[Setting::use_hive_partitioning] - && !configuration->partition_strategy + && !configuration->getPartitionStrategy() && !configuration->isDataLakeConfiguration(); const bool do_lazy_init = lazy_init && !need_resolve_columns_or_format && !need_resolve_sample_path; - if (!is_table_function && !columns_in_table_or_function_definition.empty() && !is_datalake_query && mode == LoadingStrictnessLevel::CREATE) - { - configuration->create( - object_storage, - context, - columns_in_table_or_function_definition, - partition_by_, - if_not_exists_, - catalog, - storage_id - ); - } - bool updated_configuration = false; try { @@ -169,11 +159,11 @@ StorageObjectStorage::StorageObjectStorage( /// (e.g. read always follows constructor immediately). update_configuration_on_read_write = !is_table_function || !updated_configuration; - std::string sample_path; + std::string sample_path = sample_path_.value_or(""); ColumnsDescription columns{columns_in_table_or_function_definition}; if (need_resolve_columns_or_format) - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, sample_path, context); + resolveSchemaAndFormat(columns, object_storage, configuration, format_settings, sample_path, context); else validateSupportedColumns(columns, *configuration); @@ -181,7 +171,7 @@ StorageObjectStorage::StorageObjectStorage( /// FIXME: We need to call getPathSample() lazily on select /// in case it failed to be initialized in constructor. - if (updated_configuration && sample_path.empty() && need_resolve_sample_path && !configuration->partition_strategy) + if (updated_configuration && sample_path.empty() && need_resolve_sample_path && !configuration->getPartitionStrategy()) { try { @@ -213,7 +203,26 @@ StorageObjectStorage::StorageObjectStorage( sample_path); } - supports_prewhere = FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->format, context, format_settings); + /// TODO: Known problems with datalake prewhere: + /// * If the iceberg table went through schema evolution, columns read from file may need to + /// be renamed or typecast before applying prewhere. There's already a mechanism for + /// telling parquet reader to rename columns: ColumnMapper. And parquet reader already + /// automatically does type casts to requested types. But weirdly the iceberg reader uses + /// those mechanism to request the *old* name and type of the column, then has additional + /// code to do the renaming and casting as a separate step outside parquet reader. + /// We should probably change this and delete that additional code? + /// * Delta Lake can have "partition columns", which are columns with constant value specified + /// in the metadata, not present in parquet file. Like hive partitioning, but in metadata + /// files instead of path. Currently these columns are added to the block outside parquet + /// reader. If they appear in prewhere expression, parquet reader gets a "no column in block" + /// error. Unlike hive partitioning, we can't (?) just return these columns from + /// supportedPrewhereColumns() because at the time of the call the delta lake metadata hasn't + /// been read yet. So we should probably pass these columns to the parquet reader instead of + /// adding them outside. 
+ /// * There's a bug in StorageObjectStorageSource::createReader: it makes a copy of + /// FormatFilterInfo, but for some reason unsets prewhere_info and row_level_filter_info. + /// There's probably no reason for this, and it should just copy those fields like the others. + supports_prewhere = !configuration->isDataLakeConfiguration() && FormatFactory::instance().checkIfFormatSupportsPrewhere(configuration->getFormat(), context, format_settings); StorageInMemoryMetadata metadata; metadata.setColumns(columns); @@ -221,9 +230,9 @@ StorageObjectStorage::StorageObjectStorage( metadata.setComment(comment); /// I am not sure this is actually required, but just in case - if (configuration->partition_strategy) + if (configuration->getPartitionStrategy()) { - metadata.partition_key = configuration->partition_strategy->getPartitionKeyDescription(); + metadata.partition_key = configuration->getPartitionStrategy()->getPartitionKeyDescription(); } setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns)); @@ -237,17 +246,17 @@ String StorageObjectStorage::getName() const bool StorageObjectStorage::prefersLargeBlocks() const { - return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration->format); + return FormatFactory::instance().checkIfOutputFormatPrefersLargeBlocks(configuration->getFormat()); } bool StorageObjectStorage::parallelizeOutputAfterReading(ContextPtr context) const { - return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration->format, context); + return FormatFactory::instance().checkParallelizeOutputAfterReading(configuration->getFormat(), context); } bool StorageObjectStorage::supportsSubsetOfColumns(const ContextPtr & context) const { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context, format_settings); + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->getFormat(), context, format_settings); } bool StorageObjectStorage::supportsPrewhere() const @@ -262,7 +271,7 @@ bool StorageObjectStorage::canMoveConditionsToPrewhere() const std::optional StorageObjectStorage::supportedPrewhereColumns() const { - return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(); + return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(/*exclude=*/ hive_partition_columns_to_read_from_file_path); } IStorage::ColumnSizeByName StorageObjectStorage::getColumnSizes() const @@ -355,7 +364,7 @@ void StorageObjectStorage::read( /* check_consistent_with_previous_metadata */true); } - if (configuration->partition_strategy && configuration->partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE) + if (configuration->getPartitionStrategy() && configuration->getPartitionStrategyType() != PartitionStrategyFactory::StrategyType::HIVE) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned {} storage is not implemented yet", @@ -380,7 +389,7 @@ void StorageObjectStorage::read( if (!modified_format_settings.has_value()) modified_format_settings.emplace(getFormatSettings(local_context)); - configuration->modifyFormatSettings(modified_format_settings.value()); + configuration->modifyFormatSettings(modified_format_settings.value(), *local_context); auto read_step = std::make_unique( object_storage, @@ -442,9 +451,10 @@ SinkToStoragePtr StorageObjectStorage::write( /// Not a data lake, just raw object storage - if (configuration->partition_strategy) + if (configuration->getPartitionStrategy()) { - return 
std::make_shared(object_storage, configuration, format_settings, sample_block, local_context); + auto sink_creator = std::make_shared(object_storage, configuration, format_settings, sample_block, local_context); + return std::make_shared(configuration->partition_strategy, sink_creator, local_context, sample_block); } auto paths = configuration->getPaths(); @@ -476,6 +486,74 @@ bool StorageObjectStorage::optimize( return configuration->optimize(metadata_snapshot, context, format_settings); } +bool StorageObjectStorage::supportsImport() const +{ + if (!configuration->partition_strategy) + return false; + + if (configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::WILDCARD) + return configuration->getRawPath().hasExportFilenameWildcard(); + + return configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE; +} + + +SinkToStoragePtr StorageObjectStorage::import( + const std::string & file_name, + Block & block_with_partition_values, + std::string & destination_file_path, + bool overwrite_if_exists, + const std::optional & format_settings_, + ContextPtr local_context) +{ + std::string partition_key; + + if (configuration->partition_strategy) + { + const auto column_with_partition_key = configuration->partition_strategy->computePartitionKey(block_with_partition_values); + + if (!column_with_partition_key->empty()) + { + partition_key = column_with_partition_key->getDataAt(0).toString(); + } + } + + destination_file_path = configuration->getPathForWrite(partition_key, file_name).path; + + if (!overwrite_if_exists && object_storage->exists(StoredObject(destination_file_path))) + { + throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "File {} already exists", destination_file_path); + } + + return std::make_shared( + destination_file_path, + object_storage, + configuration, + format_settings_ ? 
format_settings_ : format_settings, + std::make_shared(getInMemoryMetadataPtr()->getSampleBlock()), + local_context); +} + +void StorageObjectStorage::commitExportPartitionTransaction(const String & transaction_id, const String & partition_id, const Strings & exported_paths, ContextPtr local_context) +{ + const String commit_object = configuration->getRawPath().path + "/commit_" + partition_id + "_" + transaction_id; + + /// if file already exists, nothing to be done + if (object_storage->exists(StoredObject(commit_object))) + { + LOG_DEBUG(getLogger("StorageObjectStorage"), "Commit file already exists, nothing to be done: {}", commit_object); + return; + } + + auto out = object_storage->writeObject(StoredObject(commit_object), WriteMode::Rewrite, /* attributes= */ {}, DBMS_DEFAULT_BUFFER_SIZE, local_context->getWriteSettings()); + for (const auto & p : exported_paths) + { + out->write(p.data(), p.size()); + out->write("\n", 1); + } + out->finalize(); +} + void StorageObjectStorage::truncate( const ASTPtr & /* query */, const StorageMetadataPtr & /* metadata_snapshot */, @@ -549,7 +627,7 @@ ColumnsDescription StorageObjectStorage::resolveSchemaFromData( { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - auto schema = readSchemaFromFormat(configuration->format, format_settings, *iterator, context); + auto schema = readSchemaFromFormat(configuration->getFormat(), format_settings, *iterator, context); sample_path = iterator->getLastFilePath(); return schema; } @@ -570,7 +648,7 @@ std::string StorageObjectStorage::resolveFormatFromData( std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( const ObjectStoragePtr & object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + StorageObjectStorageConfigurationPtr & configuration, const std::optional & format_settings, std::string & sample_path, const ContextPtr & context) @@ -579,13 +657,13 @@ std::pair StorageObjectStorage::resolveSchemaAn auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context); sample_path = iterator->getLastFilePath(); - configuration->format = format; + configuration->setFormat(format); return std::pair(columns, format); } void StorageObjectStorage::addInferredEngineArgsToCreateQuery(ASTs & args, const ContextPtr & context) const { - configuration->addStructureAndFormatToArgsIfNeeded(args, "", configuration->format, context, /*with_structure=*/false); + configuration->addStructureAndFormatToArgsIfNeeded(args, "", configuration->getFormat(), context, /*with_structure=*/false); } SchemaCache & StorageObjectStorage::getSchemaCache(const ContextPtr & context, const std::string & storage_engine_name) @@ -647,5 +725,4 @@ void StorageObjectStorage::checkAlterIsPossible(const AlterCommands & commands, configuration->checkAlterIsPossible(commands); } - } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index cead14537e6b..9b1fe6ea9aee 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -7,6 +7,7 @@ #include #include #include +#include "Storages/ObjectStorage/ObjectStorageFilePathGenerator.h" #include #include #include @@ -36,7 +37,10 @@ struct IPartitionStrategy; class StorageObjectStorage : public IStorage { public: - using ObjectInfo = 
RelativePathWithMetadata; + class Configuration; + using ConfigurationPtr = std::shared_ptr; + using ConfigurationObserverPtr = std::weak_ptr; + using ObjectInfo = PathWithMetadata; using ObjectInfoPtr = std::shared_ptr; using ObjectInfos = std::vector; @@ -56,7 +60,8 @@ class StorageObjectStorage : public IStorage bool distributed_processing_ = false, ASTPtr partition_by_ = nullptr, bool is_table_function_ = false, - bool lazy_init = false); + bool lazy_init = false, + std::optional sample_path_ = std::nullopt); String getName() const override; @@ -76,6 +81,23 @@ class StorageObjectStorage : public IStorage ContextPtr context, bool async_insert) override; + + bool supportsImport() const override; + + SinkToStoragePtr import( + const std::string & /* file_name */, + Block & /* block_with_partition_values */, + std::string & /* destination_file_path */, + bool /* overwrite_if_exists */, + const std::optional & /* format_settings_ */, + ContextPtr /* context */) override; + + void commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const Strings & exported_paths, + ContextPtr local_context) override; + void truncate( const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, @@ -122,7 +144,7 @@ class StorageObjectStorage : public IStorage static std::pair resolveSchemaAndFormatFromData( const ObjectStoragePtr & object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + StorageObjectStorageConfigurationPtr & configuration, const std::optional & format_settings, std::string & sample_path, const ContextPtr & context); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index ea529021429e..9cf1532fcd18 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,9 +8,16 @@ #include #include +#include +#include +#include +#include +#include #include #include #include +#include +#include #include #include @@ -24,11 +32,14 @@ namespace Setting { extern const SettingsBool use_hive_partitioning; extern const SettingsBool cluster_function_process_archive_on_multiple_nodes; + extern const SettingsUInt64 lock_object_storage_task_distribution_ms; + extern const SettingsString object_storage_cluster; } namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int INVALID_SETTING_VALUE; } String StorageObjectStorageCluster::getPathSample(ContextPtr context) @@ -36,6 +47,14 @@ String StorageObjectStorageCluster::getPathSample(ContextPtr context) auto query_settings = configuration->getQuerySettings(context); /// We don't want to throw an exception if there are no files with specified path. 
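The `commitExportPartitionTransaction` declared here (and defined earlier in this diff) finalizes an export by writing a `commit_<partition>_<transaction>` marker object listing the exported paths, and it is idempotent: an existing marker is left untouched. A rough standalone sketch of that idea, using the local filesystem in place of `IObjectStorage`; all names are illustrative.

```cpp
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

namespace fs = std::filesystem;

// Write a newline-separated list of exported paths into a commit marker object.
// If the marker already exists, a previous attempt already committed this partition: do nothing.
void commitExportPartition(const fs::path & table_root,
                           const std::string & partition_id,
                           const std::string & transaction_id,
                           const std::vector<std::string> & exported_paths)
{
    const fs::path commit_object = table_root / ("commit_" + partition_id + "_" + transaction_id);
    if (fs::exists(commit_object))
    {
        std::cout << "commit marker already present, nothing to do\n";
        return;
    }

    std::ofstream out(commit_object);
    for (const auto & path : exported_paths)
        out << path << '\n';
}

int main()
{
    commitExportPartition(fs::temp_directory_path(), "2020", "txn-1",
                          {"data/year=2020/1.parquet", "data/year=2020/2.parquet"});
}
```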
query_settings.throw_on_zero_files_match = false; + + if (!configuration->isArchive()) + { + const auto & path = configuration->getPathForRead(); + if (!path.hasGlobs()) + return path.path; + } + auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, query_settings, @@ -47,11 +66,14 @@ String StorageObjectStorageCluster::getPathSample(ContextPtr context) {}, // virtual_columns {}, // hive_columns nullptr, // read_keys - {} // file_progress_callback + {}, // file_progress_callback + false, // ignore_archive_globs + true // skip_object_metadata ); if (auto file = file_iterator->next(0)) return file->getPath(); + return ""; } @@ -63,28 +85,81 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( const ColumnsDescription & columns_in_table_or_function_definition, const ConstraintsDescription & constraints_, const ASTPtr & partition_by, - ContextPtr context_) + ContextPtr context_, + const String & comment_, + std::optional format_settings_, + LoadingStrictnessLevel mode_, + std::shared_ptr catalog, + bool if_not_exists, + bool is_datalake_query, + bool is_table_function, + bool lazy_init) : IStorageCluster( cluster_name_, table_id_, getLogger(fmt::format("{}({})", configuration_->getEngineName(), table_id_.table_name))) , configuration{configuration_} , object_storage(object_storage_) + , cluster_name_in_settings(false) { configuration->initPartitionStrategy(partition_by, columns_in_table_or_function_definition, context_); - /// We allow exceptions to be thrown on update(), - /// because Cluster engine can only be used as table function, - /// so no lazy initialization is allowed. - configuration->update( - object_storage, - context_, - /* if_not_updated_before */false, - /* check_consistent_with_previous_metadata */true); + + const bool need_resolve_columns_or_format = columns_in_table_or_function_definition.empty() || (configuration->getFormat() == "auto"); + const bool do_lazy_init = lazy_init && !need_resolve_columns_or_format; + + auto log_ = getLogger("StorageObjectStorageCluster"); + + if (!columns_in_table_or_function_definition.empty() + && !is_datalake_query + && mode_ == LoadingStrictnessLevel::CREATE) + { + configuration->create( + object_storage, + context_, + columns_in_table_or_function_definition, + partition_by, + if_not_exists, + catalog, + table_id_ + ); + } + + try + { + if (!do_lazy_init) + { + /// We allow exceptions to be thrown on update(), + /// because Cluster engine can only be used as table function, + /// so no lazy initialization is allowed. + configuration->update( + object_storage, + context_, + /* if_not_updated_before */false, + /* check_consistent_with_previous_metadata */true); + } + } + catch (...) 
+ { + // If we don't have format or schema yet, we can't ignore failed configuration update, + // because relevant configuration is crucial for format and schema inference + if (mode_ <= LoadingStrictnessLevel::CREATE || need_resolve_columns_or_format) + { + throw; + } + tryLogCurrentException(log_); + } + + // For tables need to update configuration on each read + // because data can be changed after previous update + update_configuration_on_read_write = !is_table_function; ColumnsDescription columns{columns_in_table_or_function_definition}; std::string sample_path; - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, {}, sample_path, context_); + if (need_resolve_columns_or_format) + resolveSchemaAndFormat(columns, object_storage, configuration, {}, sample_path, context_); + else + validateSupportedColumns(columns, *configuration); configuration->check(context_); - if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->partition_strategy) + if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->getPartitionStrategy()) sample_path = getPathSample(context_); /// Not grabbing the file_columns because it is not necessary to do it here. @@ -100,8 +175,37 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( metadata.setColumns(columns); metadata.setConstraints(constraints_); + if (configuration->getPartitionStrategy()) + { + metadata.partition_key = configuration->getPartitionStrategy()->getPartitionKeyDescription(); + } + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns)); setInMemoryMetadata(metadata); + + pure_storage = std::make_shared( + configuration, + object_storage, + context_, + getStorageID(), + IStorageCluster::getInMemoryMetadata().getColumns(), + IStorageCluster::getInMemoryMetadata().getConstraints(), + comment_, + format_settings_, + mode_, + catalog, + if_not_exists, + is_datalake_query, + /* distributed_processing */false, + partition_by, + /* is_table_function */false, + /* lazy_init */lazy_init, + sample_path); + + auto virtuals_ = getVirtualsPtr(); + if (virtuals_) + pure_storage->setVirtuals(*virtuals_); + pure_storage->setInMemoryMetadata(IStorageCluster::getInMemoryMetadata()); } std::string StorageObjectStorageCluster::getName() const @@ -111,6 +215,8 @@ std::string StorageObjectStorageCluster::getName() const std::optional StorageObjectStorageCluster::totalRows(ContextPtr query_context) const { + if (pure_storage) + return pure_storage->totalRows(query_context); configuration->update( object_storage, query_context, @@ -121,6 +227,8 @@ std::optional StorageObjectStorageCluster::totalRows(ContextPtr query_co std::optional StorageObjectStorageCluster::totalBytes(ContextPtr query_context) const { + if (pure_storage) + return pure_storage->totalBytes(query_context); configuration->update( object_storage, query_context, @@ -129,11 +237,136 @@ std::optional StorageObjectStorageCluster::totalBytes(ContextPtr query_c return configuration->totalBytes(query_context); } +void StorageObjectStorageCluster::updateQueryForDistributedEngineIfNeeded(ASTPtr & query, ContextPtr context) +{ + // Change table engine on table function for distributed request + // CREATE TABLE t (...) ENGINE=IcebergS3(...) + // SELECT * FROM t + // change on + // SELECT * FROM icebergS3(...) 
+ // to execute on cluster nodes + + auto * select_query = query->as(); + if (!select_query || !select_query->tables()) + return; + + auto * tables = select_query->tables()->as(); + + if (tables->children.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Expected SELECT query from table with engine {}, got '{}'", + configuration->getEngineName(), query->formatForLogging()); + + auto * table_expression = tables->children[0]->as()->table_expression->as(); + + if (!table_expression) + return; + + if (!table_expression->database_and_table_name) + return; + + auto & table_identifier_typed = table_expression->database_and_table_name->as(); + + auto table_alias = table_identifier_typed.tryGetAlias(); + + auto storage_engine_name = configuration->getEngineName(); + if (storage_engine_name == "Iceberg") + { + switch (configuration->getType()) + { + case ObjectStorageType::S3: + storage_engine_name = "IcebergS3"; + break; + case ObjectStorageType::Azure: + storage_engine_name = "IcebergAzure"; + break; + case ObjectStorageType::HDFS: + storage_engine_name = "IcebergHDFS"; + break; + default: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Can't find table function for engine {}", + storage_engine_name + ); + } + } + + static std::unordered_map engine_to_function = { + {"S3", "s3"}, + {"Azure", "azureBlobStorage"}, + {"HDFS", "hdfs"}, + {"Iceberg", "iceberg"}, + {"IcebergS3", "icebergS3"}, + {"IcebergAzure", "icebergAzure"}, + {"IcebergHDFS", "icebergHDFS"}, + {"IcebergLocal", "icebergLocal"}, + {"DeltaLake", "deltaLake"}, + {"DeltaLakeS3", "deltaLakeS3"}, + {"DeltaLakeAzure", "deltaLakeAzure"}, + {"Hudi", "hudi"} + }; + + auto p = engine_to_function.find(storage_engine_name); + if (p == engine_to_function.end()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Can't find table function for engine {}", + storage_engine_name + ); + } + + std::string table_function_name = p->second; + + auto function_ast = std::make_shared(); + function_ast->name = table_function_name; + + auto cluster_name = getClusterName(context); + + if (cluster_name.empty()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Can't be here without cluster name, no cluster name in query {}", + query->formatForLogging()); + } + + function_ast->arguments = configuration->createArgsWithAccessData(); + function_ast->children.push_back(function_ast->arguments); + function_ast->setAlias(table_alias); + + ASTPtr function_ast_ptr(function_ast); + + table_expression->database_and_table_name = nullptr; + table_expression->table_function = function_ast_ptr; + table_expression->children[0] = function_ast_ptr; + + auto settings = select_query->settings(); + if (settings) + { + auto & settings_ast = settings->as(); + settings_ast.changes.insertSetting("object_storage_cluster", cluster_name); + } + else + { + auto settings_ast_ptr = std::make_shared(); + settings_ast_ptr->is_standalone = false; + settings_ast_ptr->changes.setSetting("object_storage_cluster", cluster_name); + select_query->setExpression(ASTSelectQuery::Expression::SETTINGS, std::move(settings_ast_ptr)); + } + + cluster_name_in_settings = true; +} + void StorageObjectStorageCluster::updateQueryToSendIfNeeded( ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) { + updateQueryForDistributedEngineIfNeeded(query, context); + auto * table_function = extractTableFunctionFromSelectQuery(query); if (!table_function) { @@ -162,6 +395,8 @@ void StorageObjectStorageCluster::updateQueryToSendIfNeeded( 
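`updateQueryForDistributedEngineIfNeeded` above resolves the table engine to a table function in two steps: the generic `Iceberg` engine is first narrowed by the underlying object storage type, and only then looked up in the engine-to-function map. A self-contained sketch of that resolution, assuming simplified types and only a subset of the mapping shown in the diff.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Simplified stand-in for DB::ObjectStorageType.
enum class ObjectStorageType { S3, Azure, HDFS, Local };

std::string resolveTableFunction(std::string engine, ObjectStorageType type)
{
    // The generic "Iceberg" engine is narrowed by the storage it sits on before the lookup.
    if (engine == "Iceberg")
    {
        switch (type)
        {
            case ObjectStorageType::S3:    engine = "IcebergS3";    break;
            case ObjectStorageType::Azure: engine = "IcebergAzure"; break;
            case ObjectStorageType::HDFS:  engine = "IcebergHDFS";  break;
            default: throw std::runtime_error("Can't find table function for engine " + engine);
        }
    }

    // Subset of the engine-to-function map from the diff.
    static const std::unordered_map<std::string, std::string> engine_to_function = {
        {"S3", "s3"}, {"Azure", "azureBlobStorage"}, {"HDFS", "hdfs"},
        {"IcebergS3", "icebergS3"}, {"IcebergAzure", "icebergAzure"}, {"IcebergHDFS", "icebergHDFS"},
        {"DeltaLake", "deltaLake"}, {"Hudi", "hudi"}};

    auto it = engine_to_function.find(engine);
    if (it == engine_to_function.end())
        throw std::runtime_error("Can't find table function for engine " + engine);
    return it->second;
}

int main()
{
    std::cout << resolveTableFunction("Iceberg", ObjectStorageType::S3) << '\n';   // icebergS3
    std::cout << resolveTableFunction("DeltaLake", ObjectStorageType::S3) << '\n'; // deltaLake
}
```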
 configuration->getEngineName());
     }
+    ASTPtr object_storage_type_arg;
+    configuration->extractDynamicStorageType(args, context, &object_storage_type_arg);
     ASTPtr settings_temporary_storage = nullptr;
     for (auto * it = args.begin(); it != args.end(); ++it)
     {
@@ -174,21 +409,107 @@ void StorageObjectStorageCluster::updateQueryToSendIfNeeded(
         }
     }
-    if (!endsWith(table_function->name, "Cluster"))
-        configuration->addStructureAndFormatToArgsIfNeeded(args, structure, configuration->format, context, /*with_structure=*/true);
+    if (cluster_name_in_settings || !endsWith(table_function->name, "Cluster"))
+    {
+        configuration->addStructureAndFormatToArgsIfNeeded(args, structure, configuration->getFormat(), context, /*with_structure=*/true);
+
+        /// Convert to an old-style *Cluster table function.
+        /// This allows older ClickHouse versions to be used in the cluster.
+        static std::unordered_map function_to_cluster_function = {
+            {"s3", "s3Cluster"},
+            {"azureBlobStorage", "azureBlobStorageCluster"},
+            {"hdfs", "hdfsCluster"},
+            {"iceberg", "icebergCluster"},
+            {"icebergS3", "icebergS3Cluster"},
+            {"icebergAzure", "icebergAzureCluster"},
+            {"icebergHDFS", "icebergHDFSCluster"},
+            {"icebergLocal", "icebergLocalCluster"},
+            {"deltaLake", "deltaLakeCluster"},
+            {"deltaLakeS3", "deltaLakeS3Cluster"},
+            {"deltaLakeAzure", "deltaLakeAzureCluster"},
+            {"hudi", "hudiCluster"},
+        };
+
+        auto p = function_to_cluster_function.find(table_function->name);
+        if (p == function_to_cluster_function.end())
+        {
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR,
+                "Can't find cluster name for table function {}",
+                table_function->name);
+        }
+
+        table_function->name = p->second;
+
+        auto cluster_name = getClusterName(context);
+        auto cluster_name_arg = std::make_shared(cluster_name);
+        args.insert(args.begin(), cluster_name_arg);
+
+        auto * select_query = query->as();
+        if (!select_query)
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR,
+                "Expected SELECT query from table function {}",
+                configuration->getEngineName());
+
+        auto settings = select_query->settings();
+        if (settings)
+        {
+            auto & settings_ast = settings->as();
+            if (settings_ast.changes.removeSetting("object_storage_cluster") && settings_ast.changes.empty())
+            {
+                select_query->setExpression(ASTSelectQuery::Expression::SETTINGS, {});
+            }
+            /// Do not throw if the setting is not found - `object_storage_cluster` can be a global setting.
+ } + } else { ASTPtr cluster_name_arg = args.front(); args.erase(args.begin()); - configuration->addStructureAndFormatToArgsIfNeeded(args, structure, configuration->format, context, /*with_structure=*/true); + configuration->addStructureAndFormatToArgsIfNeeded(args, structure, configuration->getFormat(), context, /*with_structure=*/true); args.insert(args.begin(), cluster_name_arg); } if (settings_temporary_storage) { args.insert(args.end(), std::move(settings_temporary_storage)); } + if (object_storage_type_arg) + args.insert(args.end(), object_storage_type_arg); } +class TaskDistributor : public TaskIterator +{ +public: + TaskDistributor(std::shared_ptr iterator, + std::vector && ids_of_hosts, + bool send_over_whole_archive, + uint64_t lock_object_storage_task_distribution_ms, + ContextPtr context_ + ) + : task_distributor(iterator, std::move(ids_of_hosts), send_over_whole_archive, lock_object_storage_task_distribution_ms) + , context(context_) {} + ~TaskDistributor() override = default; + bool supportRerunTask() const override { return true; } + void rescheduleTasksFromReplica(size_t number_of_current_replica) override + { + task_distributor.rescheduleTasksFromReplica(number_of_current_replica); + } + + ClusterFunctionReadTaskResponsePtr operator()(size_t number_of_current_replica) const override + { + auto task = task_distributor.getNextTask(number_of_current_replica); + if (task) + { + return std::make_shared(std::move(task), context); + } + return std::make_shared(); + } + +private: + mutable StorageObjectStorageStableTaskDistributor task_distributor; + ContextPtr context; +}; RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, @@ -204,7 +525,7 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten local_context, predicate, filter, - virtual_columns, + getVirtualsList(), hive_partition_columns_to_read_from_file_path, nullptr, local_context->getFileProgressCallback(), @@ -224,21 +545,468 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten } } - auto task_distributor = std::make_shared( - iterator, - std::move(ids_of_hosts), - /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes]); + uint64_t lock_object_storage_task_distribution_ms = local_context->getSettingsRef()[Setting::lock_object_storage_task_distribution_ms]; - auto callback = std::make_shared( - [task_distributor, local_context](size_t number_of_current_replica) mutable -> ClusterFunctionReadTaskResponsePtr - { - auto task = task_distributor->getNextTask(number_of_current_replica); - if (task) - return std::make_shared(std::move(task), local_context); - return std::make_shared(); - }); + /// Check value to avoid negative result after conversion in microseconds. + /// Poco::Timestamp::TimeDiff is signed int 64. 
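+ /// (0x0020000000000000 is 2^53; 2^53 milliseconds converted to microseconds is about 9.0e18,
+ /// which still fits into a signed 64-bit integer whose maximum is about 9.2e18.)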
+ static const uint64_t lock_object_storage_task_distribution_ms_max = 0x0020000000000000ULL; + if (lock_object_storage_task_distribution_ms > lock_object_storage_task_distribution_ms_max) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, + "Value lock_object_storage_task_distribution_ms is too big: {}, allowed maximum is {}", + lock_object_storage_task_distribution_ms, + lock_object_storage_task_distribution_ms_max + ); + + auto callback = std::make_shared(iterator, + std::move(ids_of_hosts), + /* send_over_whole_archive */!local_context->getSettingsRef()[Setting::cluster_function_process_archive_on_multiple_nodes], + lock_object_storage_task_distribution_ms, + local_context); return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) }; } +bool StorageObjectStorageCluster::supportsImport() const +{ + if (pure_storage) + return pure_storage->supportsImport(); + return false; +} + +SinkToStoragePtr StorageObjectStorageCluster::import( + const std::string & file_name, + Block & block_with_partition_values, + std::string & destination_file_path, + bool overwrite_if_exists, + const std::optional & format_settings_, + ContextPtr context) +{ + if (pure_storage) + return pure_storage->import(file_name, block_with_partition_values, destination_file_path, overwrite_if_exists, format_settings_, context); + + return IStorageCluster::import(file_name, block_with_partition_values, destination_file_path, overwrite_if_exists, format_settings_, context); +} + +void StorageObjectStorageCluster::readFallBackToPure( + QueryPlan & query_plan, + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) +{ + pure_storage->read(query_plan, column_names, storage_snapshot, query_info, context, processed_stage, max_block_size, num_streams); +} + +SinkToStoragePtr StorageObjectStorageCluster::writeFallBackToPure( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + bool async_insert) +{ + return pure_storage->write(query, metadata_snapshot, context, async_insert); +} + +String StorageObjectStorageCluster::getClusterName(ContextPtr context) const +{ + /// StorageObjectStorageCluster is always created for cluster or non-cluster variants. + /// User can specify cluster name in table definition or in setting `object_storage_cluster` + /// only for several queries. When it specified in both places, priority is given to the query setting. + /// When it is empty, non-cluster realization is used. + auto cluster_name_from_settings = context->getSettingsRef()[Setting::object_storage_cluster].value; + if (cluster_name_from_settings.empty()) + cluster_name_from_settings = getOriginalClusterName(); + return cluster_name_from_settings; +} + +QueryProcessingStage::Enum StorageObjectStorageCluster::getQueryProcessingStage( + ContextPtr context, QueryProcessingStage::Enum to_stage, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info) const +{ + /// Full query if fall back to pure storage. + if (getClusterName(context).empty()) + return QueryProcessingStage::Enum::FetchColumns; + + /// Distributed storage. 
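+ /// e.g. a table created as
+ ///     CREATE TABLE t (...) ENGINE = S3(...) SETTINGS object_storage_cluster = 'my_cluster'
+ /// (or queried with that setting supplied at query time) takes this branch.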
+ return IStorageCluster::getQueryProcessingStage(context, to_stage, storage_snapshot, query_info); +} + +std::optional StorageObjectStorageCluster::distributedWrite( + const ASTInsertQuery & query, + ContextPtr context) +{ + if (getClusterName(context).empty()) + return pure_storage->distributedWrite(query, context); + return IStorageCluster::distributedWrite(query, context); +} + +void StorageObjectStorageCluster::drop() +{ + if (pure_storage) + { + pure_storage->drop(); + return; + } + IStorageCluster::drop(); +} + +void StorageObjectStorageCluster::dropInnerTableIfAny(bool sync, ContextPtr context) +{ + if (getClusterName(context).empty()) + { + pure_storage->dropInnerTableIfAny(sync, context); + return; + } + IStorageCluster::dropInnerTableIfAny(sync, context); +} + +void StorageObjectStorageCluster::truncate( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context, + TableExclusiveLockHolder & lock_holder) +{ + /// Full query if fall back to pure storage. + if (getClusterName(local_context).empty()) + { + pure_storage->truncate(query, metadata_snapshot, local_context, lock_holder); + return; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Truncate is not supported by storage {}", getName()); +} + +void StorageObjectStorageCluster::checkTableCanBeRenamed(const StorageID & new_name) const +{ + if (pure_storage) + { + pure_storage->checkTableCanBeRenamed(new_name); + return; + } + IStorageCluster::checkTableCanBeRenamed(new_name); +} + +void StorageObjectStorageCluster::rename(const String & new_path_to_table_data, const StorageID & new_table_id) +{ + if (pure_storage) + { + pure_storage->rename(new_path_to_table_data, new_table_id); + return; + } + IStorageCluster::rename(new_path_to_table_data, new_table_id); +} + +void StorageObjectStorageCluster::renameInMemory(const StorageID & new_table_id) +{ + if (pure_storage) + { + pure_storage->renameInMemory(new_table_id); + return; + } + IStorageCluster::renameInMemory(new_table_id); +} + +void StorageObjectStorageCluster::alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & alter_lock_holder) +{ + if (getClusterName(context).empty()) + { + pure_storage->alter(params, context, alter_lock_holder); + setInMemoryMetadata(pure_storage->getInMemoryMetadata()); + return; + } + IStorageCluster::alter(params, context, alter_lock_holder); + pure_storage->setInMemoryMetadata(IStorageCluster::getInMemoryMetadata()); +} + +void StorageObjectStorageCluster::addInferredEngineArgsToCreateQuery(ASTs & args, const ContextPtr & context) const +{ + configuration->addStructureAndFormatToArgsIfNeeded(args, "", configuration->getFormat(), context, /*with_structure=*/false); +} + +bool StorageObjectStorageCluster::updateExternalDynamicMetadataIfExists(ContextPtr context) +{ + if (getClusterName(context).empty()) + return pure_storage->updateExternalDynamicMetadataIfExists(context); + return IStorageCluster::updateExternalDynamicMetadataIfExists(context); +} + +StorageMetadataPtr StorageObjectStorageCluster::getInMemoryMetadataPtr() const +{ + if (pure_storage) + return pure_storage->getInMemoryMetadataPtr(); + return IStorageCluster::getInMemoryMetadataPtr(); +} + +IDataLakeMetadata * StorageObjectStorageCluster::getExternalMetadata(ContextPtr query_context) +{ + if (getClusterName(query_context).empty()) + return pure_storage->getExternalMetadata(query_context); + + configuration->update( + object_storage, + query_context, + /* if_not_updated_before */false, + /* 
check_consistent_with_previous_metadata */false); + + return configuration->getExternalMetadata(); +} + +void StorageObjectStorageCluster::updateConfigurationIfNeeded(ContextPtr context) +{ + if (update_configuration_on_read_write) + { + configuration->update( + object_storage, + context, + /* if_not_updated_before */false, + /* check_consistent_with_previous_metadata */false); + } +} + +void StorageObjectStorageCluster::checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const +{ + if (getClusterName(context).empty()) + { + pure_storage->checkAlterIsPossible(commands, context); + return; + } + IStorageCluster::checkAlterIsPossible(commands, context); +} + +void StorageObjectStorageCluster::checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const +{ + if (pure_storage) + { + pure_storage->checkMutationIsPossible(commands, settings); + return; + } + IStorageCluster::checkMutationIsPossible(commands, settings); +} + +Pipe StorageObjectStorageCluster::alterPartition( + const StorageMetadataPtr & metadata_snapshot, + const PartitionCommands & commands, + ContextPtr context) +{ + if (getClusterName(context).empty()) + return pure_storage->alterPartition(metadata_snapshot, commands, context); + return IStorageCluster::alterPartition(metadata_snapshot, commands, context); +} + +void StorageObjectStorageCluster::checkAlterPartitionIsPossible( + const PartitionCommands & commands, + const StorageMetadataPtr & metadata_snapshot, + const Settings & settings, + ContextPtr context) const +{ + if (getClusterName(context).empty()) + { + pure_storage->checkAlterPartitionIsPossible(commands, metadata_snapshot, settings, context); + return; + } + IStorageCluster::checkAlterPartitionIsPossible(commands, metadata_snapshot, settings, context); +} + +bool StorageObjectStorageCluster::optimize( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + const ASTPtr & partition, + bool final, + bool deduplicate, + const Names & deduplicate_by_columns, + bool cleanup, + ContextPtr context) +{ + if (getClusterName(context).empty()) + return pure_storage->optimize(query, metadata_snapshot, partition, final, deduplicate, deduplicate_by_columns, cleanup, context); + return IStorageCluster::optimize(query, metadata_snapshot, partition, final, deduplicate, deduplicate_by_columns, cleanup, context); +} + +QueryPipeline StorageObjectStorageCluster::updateLightweight(const MutationCommands & commands, ContextPtr context) +{ + if (getClusterName(context).empty()) + return pure_storage->updateLightweight(commands, context); + return IStorageCluster::updateLightweight(commands, context); +} + +void StorageObjectStorageCluster::mutate(const MutationCommands & commands, ContextPtr context) +{ + if (getClusterName(context).empty()) + { + pure_storage->mutate(commands, context); + return; + } + IStorageCluster::mutate(commands, context); +} + +CancellationCode StorageObjectStorageCluster::killMutation(const String & mutation_id) +{ + if (pure_storage) + return pure_storage->killMutation(mutation_id); + return IStorageCluster::killMutation(mutation_id); +} + +void StorageObjectStorageCluster::waitForMutation(const String & mutation_id, bool wait_for_another_mutation) +{ + if (pure_storage) + { + pure_storage->waitForMutation(mutation_id, wait_for_another_mutation); + return; + } + IStorageCluster::waitForMutation(mutation_id, wait_for_another_mutation); +} + +void StorageObjectStorageCluster::setMutationCSN(const String & mutation_id, UInt64 csn) +{ + if 
(pure_storage) + { + pure_storage->setMutationCSN(mutation_id, csn); + return; + } + IStorageCluster::setMutationCSN(mutation_id, csn); +} + +CancellationCode StorageObjectStorageCluster::killPartMoveToShard(const UUID & task_uuid) +{ + if (pure_storage) + return pure_storage->killPartMoveToShard(task_uuid); + return IStorageCluster::killPartMoveToShard(task_uuid); +} + +void StorageObjectStorageCluster::startup() +{ + if (pure_storage) + { + pure_storage->startup(); + return; + } + IStorageCluster::startup(); +} + +void StorageObjectStorageCluster::shutdown(bool is_drop) +{ + if (pure_storage) + { + pure_storage->shutdown(is_drop); + return; + } + IStorageCluster::shutdown(is_drop); +} + +void StorageObjectStorageCluster::flushAndPrepareForShutdown() +{ + if (pure_storage) + { + pure_storage->flushAndPrepareForShutdown(); + return; + } + IStorageCluster::flushAndPrepareForShutdown(); +} + +ActionLock StorageObjectStorageCluster::getActionLock(StorageActionBlockType action_type) +{ + if (pure_storage) + return pure_storage->getActionLock(action_type); + return IStorageCluster::getActionLock(action_type); +} + +void StorageObjectStorageCluster::onActionLockRemove(StorageActionBlockType action_type) +{ + if (pure_storage) + { + pure_storage->onActionLockRemove(action_type); + return; + } + IStorageCluster::onActionLockRemove(action_type); +} + +bool StorageObjectStorageCluster::prefersLargeBlocks() const +{ + if (pure_storage) + return pure_storage->prefersLargeBlocks(); + return IStorageCluster::prefersLargeBlocks(); +} + +void StorageObjectStorageCluster::commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const Strings & exported_paths, + ContextPtr local_context) +{ + if (pure_storage) + return pure_storage->commitExportPartitionTransaction(transaction_id, partition_id, exported_paths, local_context); + return IStorageCluster::commitExportPartitionTransaction(transaction_id, partition_id, exported_paths, local_context); +} + +bool StorageObjectStorageCluster::supportsPartitionBy() const +{ + if (pure_storage) + return pure_storage->supportsPartitionBy(); + return IStorageCluster::supportsPartitionBy(); +} + +bool StorageObjectStorageCluster::supportsSubcolumns() const +{ + if (pure_storage) + return pure_storage->supportsSubcolumns(); + return IStorageCluster::supportsSubcolumns(); +} + +bool StorageObjectStorageCluster::supportsDynamicSubcolumns() const +{ + if (pure_storage) + return pure_storage->supportsDynamicSubcolumns(); + return IStorageCluster::supportsDynamicSubcolumns(); +} + +bool StorageObjectStorageCluster::supportsTrivialCountOptimization(const StorageSnapshotPtr & snapshot, ContextPtr context) const +{ + if (pure_storage) + return pure_storage->supportsTrivialCountOptimization(snapshot, context); + return IStorageCluster::supportsTrivialCountOptimization(snapshot, context); +} + +bool StorageObjectStorageCluster::supportsPrewhere() const +{ + if (pure_storage) + return pure_storage->supportsPrewhere(); + return IStorageCluster::supportsPrewhere(); +} + +bool StorageObjectStorageCluster::canMoveConditionsToPrewhere() const +{ + if (pure_storage) + return pure_storage->canMoveConditionsToPrewhere(); + return IStorageCluster::canMoveConditionsToPrewhere(); +} + +std::optional StorageObjectStorageCluster::supportedPrewhereColumns() const +{ + if (pure_storage) + return pure_storage->supportedPrewhereColumns(); + return IStorageCluster::supportedPrewhereColumns(); +} + +IStorageCluster::ColumnSizeByName 
StorageObjectStorageCluster::getColumnSizes() const +{ + if (pure_storage) + return pure_storage->getColumnSizes(); + return IStorageCluster::getColumnSizes(); +} + +bool StorageObjectStorageCluster::parallelizeOutputAfterReading(ContextPtr context) const +{ + if (pure_storage) + return pure_storage->parallelizeOutputAfterReading(context); + return IStorageCluster::parallelizeOutputAfterReading(context); +} + +bool StorageObjectStorageCluster::supportsDelete() const +{ + if (pure_storage) + return pure_storage->supportsDelete(); + return IStorageCluster::supportsDelete(); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 6dc9837da134..72f5bf5dc009 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -18,7 +18,15 @@ class StorageObjectStorageCluster : public IStorageCluster const ColumnsDescription & columns_in_table_or_function_definition, const ConstraintsDescription & constraints_, const ASTPtr & partition_by, - ContextPtr context_); + ContextPtr context_, + const String & comment_, + std::optional format_settings_, + LoadingStrictnessLevel mode_, + std::shared_ptr catalog, + bool if_not_exists, + bool is_datalake_query, + bool is_table_function, + bool lazy_init); std::string getName() const override; @@ -32,6 +40,123 @@ class StorageObjectStorageCluster : public IStorageCluster std::optional totalRows(ContextPtr query_context) const override; std::optional totalBytes(ContextPtr query_context) const override; + void setClusterNameInSettings(bool cluster_name_in_settings_) { cluster_name_in_settings = cluster_name_in_settings_; } + + String getClusterName(ContextPtr context) const override; + + QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; + + std::optional distributedWrite( + const ASTInsertQuery & query, + ContextPtr context) override; + + void drop() override; + + void dropInnerTableIfAny(bool sync, ContextPtr context) override; + + void truncate( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context, + TableExclusiveLockHolder &) override; + + void checkTableCanBeRenamed(const StorageID & new_name) const override; + + void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override; + + void renameInMemory(const StorageID & new_table_id) override; + + void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & alter_lock_holder) override; + + void addInferredEngineArgsToCreateQuery(ASTs & args, const ContextPtr & context) const override; + + IDataLakeMetadata * getExternalMetadata(ContextPtr query_context); + + bool updateExternalDynamicMetadataIfExists(ContextPtr context) override; + + StorageMetadataPtr getInMemoryMetadataPtr() const override; + + void checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const override; + + void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override; + + Pipe alterPartition( + const StorageMetadataPtr & metadata_snapshot, + const PartitionCommands & commands, + ContextPtr context) override; + + void checkAlterPartitionIsPossible( + const PartitionCommands & commands, + const StorageMetadataPtr & metadata_snapshot, + const Settings & settings, + ContextPtr context) const override; + + bool optimize( + const ASTPtr & 
query, + const StorageMetadataPtr & metadata_snapshot, + const ASTPtr & partition, + bool final, + bool deduplicate, + const Names & deduplicate_by_columns, + bool cleanup, + ContextPtr context) override; + + QueryPipeline updateLightweight(const MutationCommands & commands, ContextPtr context) override; + + void mutate(const MutationCommands & commands, ContextPtr context) override; + + CancellationCode killMutation(const String & mutation_id) override; + + void waitForMutation(const String & mutation_id, bool wait_for_another_mutation) override; + + void setMutationCSN(const String & mutation_id, UInt64 csn) override; + + CancellationCode killPartMoveToShard(const UUID & task_uuid) override; + + void startup() override; + + void shutdown(bool is_drop = false) override; + + void flushAndPrepareForShutdown() override; + + ActionLock getActionLock(StorageActionBlockType action_type) override; + + void onActionLockRemove(StorageActionBlockType action_type) override; + + bool supportsImport() const override; + + SinkToStoragePtr import( + const std::string & /* file_name */, + Block & /* block_with_partition_values */, + std::string & /* destination_file_path */, + bool /* overwrite_if_exists */, + const std::optional & /* format_settings_ */, + ContextPtr /* context */) override; + bool prefersLargeBlocks() const override; + + void commitExportPartitionTransaction( + const String & transaction_id, + const String & partition_id, + const Strings & exported_paths, + ContextPtr local_context) override; + + bool supportsPartitionBy() const override; + + bool supportsSubcolumns() const override; + + bool supportsDynamicSubcolumns() const override; + + bool supportsTrivialCountOptimization(const StorageSnapshotPtr &, ContextPtr) const override; + + /// Things required for PREWHERE. + bool supportsPrewhere() const override; + bool canMoveConditionsToPrewhere() const override; + std::optional supportedPrewhereColumns() const override; + ColumnSizeByName getColumnSizes() const override; + + bool parallelizeOutputAfterReading(ContextPtr context) const override; + + bool supportsDelete() const override; private: void updateQueryToSendIfNeeded( @@ -39,11 +164,47 @@ class StorageObjectStorageCluster : public IStorageCluster const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; + void readFallBackToPure( + QueryPlan & query_plan, + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + SinkToStoragePtr writeFallBackToPure( + const ASTPtr & query, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + bool async_insert) override; + + void updateConfigurationIfNeeded(ContextPtr context) override; + + /* + In case the table was created with `object_storage_cluster` setting, + modify the AST query object so that it uses the table function implementation + by mapping the engine name to table function name and setting `object_storage_cluster`. + For table like + CREATE TABLE table ENGINE=S3(...) SETTINGS object_storage_cluster='cluster' + coverts request + SELECT * FROM table + to + SELECT * FROM s3(...) SETTINGS object_storage_cluster='cluster' + to make distributed request over cluster 'cluster'. 
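+ The same mapping is applied to the other object storage engines
+ (Azure, HDFS, Iceberg*, DeltaLake*, Hudi); see the engine-to-table-function
+ map in the implementation.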
+ */ + void updateQueryForDistributedEngineIfNeeded(ASTPtr & query, ContextPtr context); + const String engine_name; - const StorageObjectStorageConfigurationPtr configuration; + StorageObjectStorageConfigurationPtr configuration; const ObjectStoragePtr object_storage; - NamesAndTypesList virtual_columns; NamesAndTypesList hive_partition_columns_to_read_from_file_path; + bool cluster_name_in_settings; + + /// non-clustered storage to fall back on pure realisation if needed + std::shared_ptr pure_storage; + bool update_configuration_on_read_write; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp index 982cd6857910..6112817663f4 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.cpp @@ -59,56 +59,65 @@ std::optional StorageObjectStorageConfiguration::tryGetTable } void StorageObjectStorageConfiguration::initialize( - StorageObjectStorageConfiguration & configuration_to_initialize, ASTs & engine_args, ContextPtr local_context, bool with_table_structure) { if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration_to_initialize.fromNamedCollection(*named_collection, local_context); + fromNamedCollection(*named_collection, local_context); else - configuration_to_initialize.fromAST(engine_args, local_context, with_table_structure); + fromAST(engine_args, local_context, with_table_structure); - if (configuration_to_initialize.isNamespaceWithGlobs()) + if (isNamespaceWithGlobs()) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Expression can not have wildcards inside {} name", configuration_to_initialize.getNamespaceType()); + "Expression can not have wildcards inside {} name", getNamespaceType()); - if (configuration_to_initialize.isDataLakeConfiguration()) + if (isDataLakeConfiguration()) { - if (configuration_to_initialize.partition_strategy_type != PartitionStrategyFactory::StrategyType::NONE) + if (getPartitionStrategyType() != PartitionStrategyFactory::StrategyType::NONE) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "The `partition_strategy` argument is incompatible with data lakes"); } } - else if (configuration_to_initialize.partition_strategy_type == PartitionStrategyFactory::StrategyType::NONE) + else if (getPartitionStrategyType() == PartitionStrategyFactory::StrategyType::NONE) { - if (configuration_to_initialize.getRawPath().hasPartitionWildcard()) + if (getRawPath().hasPartitionWildcard()) { // Promote to wildcard in case it is not data lake to make it backwards compatible - configuration_to_initialize.partition_strategy_type = PartitionStrategyFactory::StrategyType::WILDCARD; + setPartitionStrategyType(PartitionStrategyFactory::StrategyType::WILDCARD); } } - if (configuration_to_initialize.format == "auto") + if (partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE) { - if (configuration_to_initialize.isDataLakeConfiguration()) + file_path_generator = std::make_shared( + getRawPath().path, + format); + } + else + { + file_path_generator = std::make_shared(getRawPath().path); + } + + if (format == "auto") + { + if (isDataLakeConfiguration()) { - configuration_to_initialize.format = "Parquet"; + format = "Parquet"; } else { - configuration_to_initialize.format + format = FormatFactory::instance() - .tryGetFormatFromFileName(configuration_to_initialize.isArchive() ? 
configuration_to_initialize.getPathInArchive() : configuration_to_initialize.getRawPath().path) + .tryGetFormatFromFileName(isArchive() ? getPathInArchive() : getRawPath().path) .value_or("auto"); } } else - FormatFactory::instance().checkFormatName(configuration_to_initialize.format); + FormatFactory::instance().checkFormatName(format); - /// It might be changed on `StorageObjectStorageConfiguration::initPartitionStrategy` - configuration_to_initialize.read_path = configuration_to_initialize.getRawPath(); - configuration_to_initialize.initialized = true; + read_path = file_path_generator->getPathForRead(); + initialized = true; } void StorageObjectStorageConfiguration::initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context) @@ -125,7 +134,6 @@ void StorageObjectStorageConfiguration::initPartitionStrategy(ASTPtr partition_b if (partition_strategy) { - read_path = partition_strategy->getPathForRead(getRawPath().path); LOG_DEBUG(getLogger("StorageObjectStorageConfiguration"), "Initialized partition strategy {}", magic_enum::enum_name(partition_strategy_type)); } } @@ -137,14 +145,12 @@ const StorageObjectStorageConfiguration::Path & StorageObjectStorageConfiguratio StorageObjectStorageConfiguration::Path StorageObjectStorageConfiguration::getPathForWrite(const std::string & partition_id) const { - auto raw_path = getRawPath(); - - if (!partition_strategy) - { - return raw_path; - } + return getPathForWrite(partition_id, /* filename_override */ ""); +} - return Path {partition_strategy->getPathForWrite(raw_path.path, partition_id)}; +StorageObjectStorageConfiguration::Path StorageObjectStorageConfiguration::getPathForWrite(const std::string & partition_id, const std::string & filename_override) const +{ + return Path {file_path_generator->getPathForWrite(partition_id, filename_override)}; } bool StorageObjectStorageConfiguration::Path::hasPartitionWildcard() const @@ -153,6 +159,12 @@ bool StorageObjectStorageConfiguration::Path::hasPartitionWildcard() const return path.find(PARTITION_ID_WILDCARD) != String::npos; } +bool StorageObjectStorageConfiguration::Path::hasExportFilenameWildcard() const +{ + return path.find(ObjectStorageWildcardFilePathGenerator::FILE_WILDCARD) != String::npos; +} + + bool StorageObjectStorageConfiguration::Path::hasGlobsIgnorePartitionWildcard() const { if (!hasPartitionWildcard()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h index 6929c7018d67..f45cd9725dea 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageConfiguration.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB { @@ -61,6 +62,7 @@ class StorageObjectStorageConfiguration std::string path; bool hasPartitionWildcard() const; + bool hasExportFilenameWildcard() const; bool hasGlobsIgnorePartitionWildcard() const; bool hasGlobs() const; std::string cutGlobs(bool supports_partial_prefix) const; @@ -69,8 +71,7 @@ class StorageObjectStorageConfiguration using Paths = std::vector; /// Initialize configuration from either AST or NamedCollection. - static void initialize( - StorageObjectStorageConfiguration & configuration_to_initialize, + virtual void initialize( ASTs & engine_args, ContextPtr local_context, bool with_table_structure); @@ -91,11 +92,13 @@ class StorageObjectStorageConfiguration /// Raw URI, specified by a user. Used in permission check. 
virtual const String & getRawURI() const = 0; - const Path & getPathForRead() const; + virtual const Path & getPathForRead() const; // Path used for writing, it should not be globbed and might contain a partition key - Path getPathForWrite(const std::string & partition_id = "") const; + virtual Path getPathForWrite(const std::string & partition_id = "") const; - void setPathForRead(const Path & path) + Path getPathForWrite(const std::string & partition_id, const std::string & filename_override) const; + + virtual void setPathForRead(const Path & path) { read_path = path; } @@ -117,10 +120,10 @@ class StorageObjectStorageConfiguration virtual void addStructureAndFormatToArgsIfNeeded( ASTs & args, const String & structure_, const String & format_, ContextPtr context, bool with_structure) = 0; - bool isNamespaceWithGlobs() const; + virtual bool isNamespaceWithGlobs() const; virtual bool isArchive() const { return false; } - bool isPathInArchiveWithGlobs() const; + virtual bool isPathInArchiveWithGlobs() const; virtual std::string getPathInArchive() const; virtual void check(ContextPtr context); @@ -142,7 +145,7 @@ class StorageObjectStorageConfiguration virtual std::shared_ptr getSchemaTransformer(ContextPtr, ObjectInfoPtr) const { return {}; } - virtual void modifyFormatSettings(FormatSettings &) const {} + virtual void modifyFormatSettings(FormatSettings &, const Context &) const {} virtual void addDeleteTransformers( ObjectInfoPtr object_info, @@ -159,7 +162,7 @@ class StorageObjectStorageConfiguration ContextPtr local_context, const PrepareReadingFromFormatHiveParams & hive_parameters); - void initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context); + virtual void initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context); virtual std::optional tryGetTableStructureFromMetadata() const; @@ -221,6 +224,45 @@ class StorageObjectStorageConfiguration throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDataLakeSettings() is not implemented for configuration type {}", getTypeName()); } + /// Create arguments for table function with path and access parameters + virtual ASTPtr createArgsWithAccessData() const + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method createArgsWithAccessData is not supported by storage {}", getEngineName()); + } + + virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; + virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; + + virtual ObjectStorageType extractDynamicStorageType(ASTs & /* args */, ContextPtr /* context */, ASTPtr * /* type_arg */) const + { return ObjectStorageType::None; } + + virtual const String & getFormat() const { return format; } + virtual const String & getCompressionMethod() const { return compression_method; } + virtual const String & getStructure() const { return structure; } + + virtual PartitionStrategyFactory::StrategyType getPartitionStrategyType() const { return partition_strategy_type; } + virtual bool getPartitionColumnsInDataFile() const { return partition_columns_in_data_file; } + virtual std::shared_ptr getPartitionStrategy() const { return partition_strategy; } + + virtual void setFormat(const String & format_) { format = format_; } + virtual void setCompressionMethod(const String & compression_method_) { compression_method = compression_method_; } + virtual void setStructure(const String & structure_) { structure = structure_; } + + virtual void 
setPartitionStrategyType(PartitionStrategyFactory::StrategyType partition_strategy_type_) + { + partition_strategy_type = partition_strategy_type_; + } + virtual void setPartitionColumnsInDataFile(bool partition_columns_in_data_file_) + { + partition_columns_in_data_file = partition_columns_in_data_file_; + } + virtual void setPartitionStrategy(const std::shared_ptr & partition_strategy_) + { + partition_strategy = partition_strategy_; + } + + virtual void assertInitialized() const; + virtual ColumnMapperPtr getColumnMapperForObject(ObjectInfoPtr /**/) const { return nullptr; } virtual ColumnMapperPtr getColumnMapperForCurrentSchema() const { return nullptr; } @@ -233,27 +275,26 @@ class StorageObjectStorageConfiguration return false; } - String format = "auto"; - String compression_method = "auto"; - String structure = "auto"; PartitionStrategyFactory::StrategyType partition_strategy_type = PartitionStrategyFactory::StrategyType::NONE; + std::shared_ptr partition_strategy; /// Whether partition column values are contained in the actual data. /// And alternative is with hive partitioning, when they are contained in file path. bool partition_columns_in_data_file = true; - std::shared_ptr partition_strategy; -protected: - virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; - virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; - - void assertInitialized() const; +private: + String format = "auto"; + String compression_method = "auto"; + String structure = "auto"; +protected: bool initialized = false; private: // Path used for reading, by default it is the same as `getRawPath` // When using `partition_strategy=hive`, a recursive reading pattern will be appended `'table_root/**.parquet' Path read_path; + + std::shared_ptr file_path_generator; }; using StorageObjectStorageConfigurationPtr = std::shared_ptr; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageDefinitions.h b/src/Storages/ObjectStorage/StorageObjectStorageDefinitions.h index 83fb17ab3b8a..ef00c15750ab 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageDefinitions.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageDefinitions.h @@ -127,6 +127,13 @@ struct HDFSClusterDefinition static constexpr auto non_clustered_storage_engine_name = HDFSDefinition::storage_engine_name; }; +struct IcebergClusterDefinition +{ + static constexpr auto name = "icebergCluster"; + static constexpr auto storage_engine_name = "IcebergCluster"; + static constexpr auto non_clustered_storage_engine_name = IcebergDefinition::storage_engine_name; +}; + struct IcebergS3ClusterDefinition { static constexpr auto name = "icebergS3Cluster"; @@ -148,6 +155,13 @@ struct IcebergHDFSClusterDefinition static constexpr auto non_clustered_storage_engine_name = IcebergHDFSDefinition::storage_engine_name; }; +struct IcebergLocalClusterDefinition +{ + static constexpr auto name = "icebergLocalCluster"; + static constexpr auto storage_engine_name = "IcebergLocalCluster"; + static constexpr auto non_clustered_storage_engine_name = IcebergLocalDefinition::storage_engine_name; +}; + struct DeltaLakeClusterDefinition { static constexpr auto name = "deltaLakeCluster"; @@ -155,6 +169,20 @@ struct DeltaLakeClusterDefinition static constexpr auto non_clustered_storage_engine_name = DeltaLakeDefinition::storage_engine_name; }; +struct DeltaLakeS3ClusterDefinition +{ + static constexpr auto name = "deltaLakeS3Cluster"; + static constexpr auto storage_engine_name = "DeltaLakeS3Cluster"; + static 
constexpr auto non_clustered_storage_engine_name = DeltaLakeS3Definition::storage_engine_name; +}; + +struct DeltaLakeAzureClusterDefinition +{ + static constexpr auto name = "deltaLakeAzureCluster"; + static constexpr auto storage_engine_name = "DeltaLakeAzureCluster"; + static constexpr auto non_clustered_storage_engine_name = DeltaLakeAzureDefinition::storage_engine_name; +}; + struct HudiClusterDefinition { static constexpr auto name = "hudiCluster"; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSettings.h b/src/Storages/ObjectStorage/StorageObjectStorageSettings.h index 1314b7d87c3d..180d71ea0c8a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSettings.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSettings.h @@ -68,7 +68,17 @@ struct StorageObjectStorageSettings using StorageObjectStorageSettingsPtr = std::shared_ptr; +// clang-format off + +#define STORAGE_OBJECT_STORAGE_RELATED_SETTINGS(DECLARE, ALIAS) \ + DECLARE(String, object_storage_cluster, "", R"( +Cluster for distributed requests +)", 0) \ + +// clang-format on + #define LIST_OF_STORAGE_OBJECT_STORAGE_SETTINGS(M, ALIAS) \ + STORAGE_OBJECT_STORAGE_RELATED_SETTINGS(M, ALIAS) \ LIST_OF_ALL_FORMAT_SETTINGS(M, ALIAS) } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index a3d51ca76111..48525da19ec7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -62,7 +62,7 @@ StorageObjectStorageSink::StorageObjectStorageSink( , sample_block(sample_block_) { const auto & settings = context->getSettingsRef(); - const auto chosen_compression_method = chooseCompressionMethod(path, configuration->compression_method); + const auto chosen_compression_method = chooseCompressionMethod(path, configuration->getCompressionMethod()); auto buffer = object_storage->writeObject( StoredObject(path), WriteMode::Rewrite, std::nullopt, DBMS_DEFAULT_BUFFER_SIZE, context->getWriteSettings()); @@ -74,7 +74,7 @@ StorageObjectStorageSink::StorageObjectStorageSink( static_cast(settings[Setting::output_format_compression_zstd_window_log])); writer = FormatFactory::instance().getOutputFormatParallelIfPossible( - configuration->format, *write_buf, *sample_block, context, format_settings_); + configuration->getFormat(), *write_buf, *sample_block, context, format_settings_); } void StorageObjectStorageSink::consume(Chunk & chunk) @@ -142,8 +142,7 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( std::optional format_settings_, SharedHeader sample_block_, ContextPtr context_) - : PartitionedSink(configuration_->partition_strategy, context_, sample_block_) - , object_storage(object_storage_) + : object_storage(object_storage_) , configuration(configuration_) , query_settings(configuration_->getQuerySettings(context_)) , format_settings(format_settings_) @@ -176,7 +175,7 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String object_storage, configuration, format_settings, - std::make_shared(partition_strategy->getFormatHeader()), + std::make_shared(configuration->partition_strategy->getFormatHeader()), context ); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index f4a775030715..39873998ad7a 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -8,6 +8,8 @@ namespace DB { 
class StorageObjectStorageSink : public SinkToStorage { +friend class StorageObjectStorageImporterSink; + public: StorageObjectStorageSink( const std::string & path_, @@ -41,7 +43,7 @@ class StorageObjectStorageSink : public SinkToStorage void cancelBuffers(); }; -class PartitionedStorageObjectStorageSink : public PartitionedSink +class PartitionedStorageObjectStorageSink : public PartitionedSink::SinkCreator { public: PartitionedStorageObjectStorageSink( diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index db47a7fc7945..358f9a2e6fea 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,7 @@ #endif #include - +#include namespace fs = std::filesystem; namespace ProfileEvents @@ -65,6 +66,8 @@ namespace Setting extern const SettingsBool use_iceberg_partition_pruning; extern const SettingsBool cluster_function_process_archive_on_multiple_nodes; extern const SettingsBool table_engine_read_through_distributed_cache; + extern const SettingsBool use_object_storage_list_objects_cache; + extern const SettingsBool allow_experimental_iceberg_read_optimization; } namespace ErrorCodes @@ -170,6 +173,8 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( return distributed_iterator; } + configuration->update(object_storage, local_context, true, true); + std::unique_ptr iterator; const auto & reading_path = configuration->getPathForRead(); if (reading_path.hasGlobs()) @@ -182,11 +187,36 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( query_settings.ignore_non_existent_file, skip_object_metadata, file_progress_callback); } else + { + std::shared_ptr object_iterator = nullptr; + std::unique_ptr cache_ptr = nullptr; + + if (local_context->getSettingsRef()[Setting::use_object_storage_list_objects_cache] && object_storage->supportsListObjectsCache()) + { + auto & cache = ObjectStorageListObjectsCache::instance(); + ObjectStorageListObjectsCache::Key cache_key {object_storage->getDescription(), configuration->getNamespace(), configuration->getRawPath().cutGlobs(configuration->supportsPartialPathPrefix())}; + + if (auto objects_info = cache.get(cache_key, /*filter_by_prefix=*/ false)) + { + object_iterator = std::make_shared(std::move(*objects_info)); + } + else + { + cache_ptr = std::make_unique(cache, cache_key); + object_iterator = object_storage->iterate(configuration->getRawPath().cutGlobs(configuration->supportsPartialPathPrefix()), query_settings.list_object_keys_size); + } + } + else + { + object_iterator = object_storage->iterate(configuration->getRawPath().cutGlobs(configuration->supportsPartialPathPrefix()), query_settings.list_object_keys_size); + } + /// Iterate through disclosed globs and make a source for each file iterator = std::make_unique( - object_storage, configuration, predicate, virtual_columns, hive_columns, - local_context, is_archive ? nullptr : read_keys, query_settings.list_object_keys_size, - query_settings.throw_on_zero_files_match, file_progress_callback); + object_iterator, configuration, predicate, virtual_columns, hive_columns, + local_context, is_archive ? 
nullptr : read_keys, + query_settings.throw_on_zero_files_match, file_progress_callback, std::move(cache_ptr)); + } } else if (configuration->supportsFileIterator()) { @@ -270,7 +300,6 @@ void StorageObjectStorageSource::lazyInitialize() Chunk StorageObjectStorageSource::generate() { - lazyInitialize(); while (true) @@ -317,10 +346,14 @@ Chunk StorageObjectStorageSource::generate() path); } + /// For _path column, use absolute_path if available (e.g., file:///home/...) + /// Otherwise, fall back to the storage path identifier + std::string path_for_virtual_column = object_info->getAbsolutePath().value_or(path); + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, - {.path = path, + {.path = path_for_virtual_column, .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, @@ -328,6 +361,16 @@ Chunk StorageObjectStorageSource::generate() .data_lake_snapshot_version = file_iterator->getSnapshotVersion()}, read_context); + /// Not empty when allow_experimental_iceberg_read_optimization=true + /// and some columns were removed from read list as columns with constant values. + /// Restore data for these columns. + for (const auto & constant_column : reader.constant_columns_with_values) + { + chunk.addColumn(constant_column.first, + constant_column.second.name_and_type.type->createColumnConst( + chunk.getNumRows(), constant_column.second.value)->convertToFullColumnIfConst()); + } + #if USE_PARQUET && USE_AWS_S3 if (chunk_size && chunk.hasColumns()) { @@ -405,7 +448,7 @@ Chunk StorageObjectStorageSource::generate() void StorageObjectStorageSource::addNumRowsToCache(const ObjectInfo & object_info, size_t num_rows) { const auto cache_key = getKeyForSchemaCache( - getUniqueStoragePathIdentifier(*configuration, object_info), configuration->format, format_settings, read_context); + getUniqueStoragePathIdentifier(*configuration, object_info), configuration->getFormat(), format_settings, read_context); schema_cache.addNumRows(cache_key, num_rows); } @@ -445,43 +488,61 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ObjectInfoPtr object_info; auto query_settings = configuration->getQuerySettings(context_); + bool not_a_path = false; + do { + not_a_path = false; object_info = file_iterator->next(processor); - if (!object_info || object_info->getPath().empty()) + if (!object_info) return {}; - if (!object_info->metadata) + if (object_info->getCommand().isValid()) { - const auto & path = object_info->isArchive() ? 
object_info->getPathToArchive() : object_info->getPath(); - - if (query_settings.ignore_non_existent_file) + auto retry_after_us = object_info->getCommand().getRetryAfterUs(); + if (retry_after_us.has_value()) { - auto metadata = object_storage->tryGetObjectMetadata(path); - if (!metadata) - return {}; - - object_info->metadata = metadata; + not_a_path = true; + /// TODO: Make asyncronous waiting without sleep in thread + /// Now this sleep is on executor node in worker thread + /// Does not block query initiator + sleepForMicroseconds(std::min(Poco::Timestamp::TimeDiff(100000ul), retry_after_us.value())); + continue; } - else - object_info->metadata = object_storage->getObjectMetadata(path); } + + if (object_info->getPath().empty()) + return {}; + + object_info->loadMetadata(object_storage, query_settings.ignore_non_existent_file); } - while (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0); + while (not_a_path || (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0)); + + ObjectStoragePtr storage_to_use = object_info->getObjectStorage(); + if (!storage_to_use) + storage_to_use = object_storage; QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; + std::optional rows_count_from_metadata; auto try_get_num_rows_from_cache = [&]() -> std::optional { + if (rows_count_from_metadata.has_value()) + { + /// Must be non negative here + size_t value = rows_count_from_metadata.value(); + return value; + } + if (!schema_cache) return std::nullopt; const auto cache_key = getKeyForSchemaCache( getUniqueStoragePathIdentifier(*configuration, *object_info), - configuration->format, + configuration->getFormat(), format_settings, context_); @@ -494,6 +555,128 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade return schema_cache->tryGetNumRows(cache_key, get_last_mod_time); }; + /// List of columns with constant value in current file, and values + std::map constant_columns_with_values; + std::unordered_set constant_columns; + + NamesAndTypesList requested_columns_copy = read_from_format_info.requested_columns; + + std::unordered_map> requested_columns_list; + { + size_t column_index = 0; + for (const auto & column : requested_columns_copy) + requested_columns_list[column.getNameInStorage()] = std::make_pair(column_index++, column); + } + + if (context_->getSettingsRef()[Setting::allow_experimental_iceberg_read_optimization]) + { + auto file_meta_data = object_info->getFileMetaInfo(); + if (file_meta_data.has_value()) + { + bool is_all_rows_count_equals = true; + for (const auto & column : file_meta_data.value()->columns_info) + { + if (is_all_rows_count_equals && column.second.rows_count.has_value()) + { + if (rows_count_from_metadata.has_value()) + { + if (column.second.rows_count.value() != rows_count_from_metadata.value()) + { + LOG_WARNING(log, "Inconsistent rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + } + else if (column.second.rows_count.value() < 0) + { + LOG_WARNING(log, "Negative rows count for file {} in metadats, ignored", object_info->getPath()); + is_all_rows_count_equals = false; + rows_count_from_metadata = std::nullopt; + } + else + rows_count_from_metadata = column.second.rows_count; + } + + if (column.second.hyperrectangle.has_value()) + { + auto column_name = column.first; + + auto i_column = requested_columns_list.find(column_name); + if (i_column == 
requested_columns_list.end()) + continue; + + if (column.second.hyperrectangle.value().isPoint() && + (!column.second.nulls_count.has_value() || column.second.nulls_count.value() <= 0)) + { + /// isPoint() method checks before that left==right + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + column.second.hyperrectangle.value().left + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value '{}'", + object_info->getPath(), + column_name, + i_column->second.second.type, + column.second.hyperrectangle.value().left.dump()); + } + else if (column.second.rows_count.has_value() && column.second.nulls_count.has_value() + && column.second.rows_count.value() == column.second.nulls_count.value() + && i_column->second.second.type->isNullable()) + { + constant_columns_with_values[i_column->second.first] = + ConstColumnWithValue{ + i_column->second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + i_column->second.second.type); + } + } + } + for (const auto & column : requested_columns_list) + { + const auto & column_name = column.first; + + if (file_meta_data.value()->columns_info.contains(column_name)) + continue; + + if (!column.second.second.type->isNullable()) + continue; + + /// Column is nullable and absent in file + constant_columns_with_values[column.second.first] = + ConstColumnWithValue{ + column.second.second, + Field() + }; + constant_columns.insert(column_name); + + LOG_DEBUG(log, "In file {} constant column '{}' type '{}' with value 'NULL'", + object_info->getPath(), + column_name, + column.second.second.type); + } + } + + if (!constant_columns.empty()) + { + size_t original_columns = requested_columns_copy.size(); + requested_columns_copy = requested_columns_copy.eraseNames(constant_columns); + if (requested_columns_copy.size() + constant_columns.size() != original_columns) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Can't remove constant columns for file {} correct, fallback to read. Founded constant columns: [{}]", + object_info->getPath(), constant_columns); + if (requested_columns_copy.empty()) + need_only_count = true; + } + } + std::optional num_rows_from_cache = need_only_count && context_->getSettingsRef()[Setting::use_cache_for_count_from_files] ? 
try_get_num_rows_from_cache() : std::nullopt; @@ -511,20 +694,22 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade columns.emplace_back(type->createColumn(), type, name); builder.init(Pipe(std::make_shared( std::make_shared(columns), *num_rows_from_cache, max_block_size))); + if (!constant_columns.empty()) + configuration->addDeleteTransformers(object_info, builder, format_settings, context_); } else { CompressionMethod compression_method; if (const auto * object_info_in_archive = dynamic_cast(object_info.get())) { - compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->compression_method); + compression_method = chooseCompressionMethod(configuration->getPathInArchive(), configuration->getCompressionMethod()); const auto & archive_reader = object_info_in_archive->archive_reader; read_buf = archive_reader->readFile(object_info_in_archive->path_in_archive, /*throw_on_not_found=*/true); } else { - compression_method = chooseCompressionMethod(object_info->getFileName(), configuration->compression_method); - read_buf = createReadBuffer(*object_info, object_storage, context_, log); + compression_method = chooseCompressionMethod(object_info->getFileName(), configuration->getCompressionMethod()); + read_buf = createReadBuffer(*object_info, storage_to_use, context_, log); } Block initial_header = read_from_format_info.format_header; @@ -551,7 +736,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade }(); auto input_format = FormatFactory::instance().getInput( - configuration->format, + configuration->getFormat(), *read_buf, initial_header, context_, @@ -568,6 +753,9 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (need_only_count) input_format->needOnlyCount(); + if (!object_info->getPath().empty()) + input_format->setStorageRelatedUniqueKey(context_->getSettingsRef(), object_info->getPath() + ":" + object_info->metadata->etag); + builder.init(Pipe(input_format)); configuration->addDeleteTransformers(object_info, builder, format_settings, context_); @@ -612,7 +800,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade /// from chunk read by IInputFormat. builder.addSimpleTransform([&](const SharedHeader & header) { - return std::make_shared(header, read_from_format_info.requested_columns); + return std::make_shared(header, requested_columns_copy); }); auto pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -621,7 +809,12 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade ProfileEvents::increment(ProfileEvents::EngineFileLikeReadFiles); return ReaderHolder( - object_info, std::move(read_buf), std::move(source), std::move(pipeline), std::move(current_reader)); + object_info, + std::move(read_buf), + std::move(source), + std::move(pipeline), + std::move(current_reader), + std::move(constant_columns_with_values)); } std::future StorageObjectStorageSource::createReaderAsync() @@ -639,6 +832,10 @@ std::unique_ptr createReadBuffer( const auto & settings = context_->getSettingsRef(); const auto & effective_read_settings = read_settings.has_value() ? 
read_settings.value() : context_->getReadSettings(); + ObjectStoragePtr storage_to_use = object_info.getObjectStorage(); + if (!storage_to_use) + storage_to_use = object_storage; + bool use_distributed_cache = false; #if ENABLE_DISTRIBUTED_CACHE ObjectStorageConnectionInfoPtr connection_info; @@ -646,7 +843,7 @@ std::unique_ptr createReadBuffer( && DistributedCache::Registry::instance().isReady( effective_read_settings.distributed_cache_settings.read_only_from_current_az)) { - connection_info = object_storage->getConnectionInfo(); + connection_info = storage_to_use->getConnectionInfo(); if (connection_info) use_distributed_cache = true; } @@ -659,15 +856,15 @@ std::unique_ptr createReadBuffer( filesystem_cache_name = settings[Setting::filesystem_cache_name].value; use_filesystem_cache = effective_read_settings.enable_filesystem_cache && !filesystem_cache_name.empty() - && (object_storage->getType() == ObjectStorageType::Azure - || object_storage->getType() == ObjectStorageType::S3); + && (storage_to_use->getType() == ObjectStorageType::Azure + || storage_to_use->getType() == ObjectStorageType::S3); } /// We need object metadata for two cases: /// 1. object size suggests whether we need to use prefetch /// 2. object etag suggests a cache key in case we use filesystem cache if (!object_info.metadata) - object_info.metadata = object_storage->getObjectMetadata(object_info.getPath()); + object_info.metadata = storage_to_use->getObjectMetadata(object_info.getPath()); const auto & object_size = object_info.metadata->size_bytes; @@ -703,9 +900,9 @@ std::unique_ptr createReadBuffer( { const std::string path = object_info.getPath(); StoredObject object(path, "", object_size); - auto read_buffer_creator = [object, nested_buffer_read_settings, object_storage]() + auto read_buffer_creator = [object, nested_buffer_read_settings, storage_to_use]() { - return object_storage->readObject(object, nested_buffer_read_settings); + return storage_to_use->readObject(object, nested_buffer_read_settings); }; impl = std::make_unique( @@ -738,9 +935,9 @@ std::unique_ptr createReadBuffer( const auto cache_key = FileCacheKey::fromKey(hash.get128()); auto cache = FileCacheFactory::instance().get(filesystem_cache_name); - auto read_buffer_creator = [path = object_info.getPath(), object_size, nested_buffer_read_settings, object_storage]() + auto read_buffer_creator = [path = object_info.getPath(), object_size, nested_buffer_read_settings, object_storage, storage_to_use]() { - return object_storage->readObject(StoredObject(path, "", object_size), nested_buffer_read_settings); + return storage_to_use->readObject(StoredObject(path, "", object_size), nested_buffer_read_settings); }; impl = std::make_unique( @@ -768,7 +965,7 @@ std::unique_ptr createReadBuffer( } if (!impl) - impl = object_storage->readObject(StoredObject(object_info.getPath(), "", object_size), nested_buffer_read_settings); + impl = storage_to_use->readObject(StoredObject(object_info.getPath(), "", object_size), nested_buffer_read_settings); if (!use_async_buffer) return impl; @@ -805,18 +1002,18 @@ std::unique_ptr createReadBuffer( } StorageObjectStorageSource::GlobIterator::GlobIterator( - ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + const ObjectStorageIteratorPtr & object_storage_iterator_, + ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, const NamesAndTypesList & hive_columns_, ContextPtr context_, ObjectInfos * read_keys_, - size_t 
list_object_keys_size, bool throw_on_zero_files_match_, - std::function file_progress_callback_) + std::function file_progress_callback_, + std::unique_ptr list_cache_) : WithContext(context_) - , object_storage(object_storage_) + , object_storage_iterator(object_storage_iterator_) , configuration(configuration_) , virtual_columns(virtual_columns_) , hive_columns(hive_columns_) @@ -825,6 +1022,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , read_keys(read_keys_) , local_context(context_) , file_progress_callback(file_progress_callback_) + , list_cache(std::move(list_cache_)) { const auto & reading_path = configuration->getPathForRead(); if (reading_path.hasGlobs()) @@ -832,8 +1030,6 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( const auto & key_with_globs = reading_path; const auto key_prefix = reading_path.cutGlobs(configuration->supportsPartialPathPrefix()); - object_storage_iterator = object_storage->iterate(key_prefix, list_object_keys_size); - matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs.path)); if (!matcher->ok()) { @@ -897,11 +1093,21 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne auto result = object_storage_iterator->getCurrentBatchAndScheduleNext(); if (!result.has_value()) { + if (list_cache) + { + list_cache->set(std::move(object_list)); + } is_finished = true; return {}; } new_batch = std::move(result.value()); + + if (list_cache) + { + object_list.insert(object_list.end(), new_batch.begin(), new_batch.end()); + } + for (auto it = new_batch.begin(); it != new_batch.end();) { if (!recursive && !re2::RE2::FullMatch((*it)->getPath(), *matcher)) @@ -1013,12 +1219,14 @@ StorageObjectStorageSource::ReaderHolder::ReaderHolder( std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_) + std::unique_ptr reader_, + std::map && constant_columns_with_values_) : object_info(std::move(object_info_)) , read_buf(std::move(read_buf_)) , source(std::move(source_)) , pipeline(std::move(pipeline_)) , reader(std::move(reader_)) + , constant_columns_with_values(std::move(constant_columns_with_values_)) { } @@ -1032,6 +1240,7 @@ StorageObjectStorageSource::ReaderHolder::operator=(ReaderHolder && other) noexc source = std::move(other.source); read_buf = std::move(other.read_buf); object_info = std::move(other.object_info); + constant_columns_with_values = std::move(other.constant_columns_with_values); return *this; } @@ -1046,6 +1255,12 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( , is_archive(is_archive_) , object_storage(object_storage_) { + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); + return; + } + ThreadPool pool( CurrentMetrics::StorageObjectStorageThreads, CurrentMetrics::StorageObjectStorageThreadsActive, @@ -1070,7 +1285,22 @@ StorageObjectStorageSource::ReadTaskIterator::ReadTaskIterator( { auto object = object_future.get(); if (object) + { + if (object->getAbsolutePath().has_value()) + { + auto [storage_to_use, key] = resolveObjectStorageForPath("", object->getAbsolutePath().value(), object_storage, secondary_storages, getContext()); + if (!key.empty()) + { + object->object_storage_to_use = storage_to_use; + object->relative_path = key; + } + } + else + { + object->object_storage_to_use = object_storage; + } buffer.push_back(object); + } } } @@ -1081,10 +1311,35 @@ StorageObjectStorage::ObjectInfoPtr 
StorageObjectStorageSource::ReadTaskIterator ObjectInfoPtr object_info; if (current_index >= buffer.size()) { - auto task = callback(); - if (!task || task->isEmpty()) + if (!getContext()->isSwarmModeEnabled()) + { + LOG_DEBUG(getLogger("StorageObjectStorageSource"), "STOP SWARM MODE called, stop getting new tasks"); return nullptr; - object_info = task->getObjectInfo(); + } + + auto raw = callback(); + if (!raw || raw->isEmpty()) + return nullptr; + + object_info = raw->getObjectInfo(); + + // The 'path' field from master is already the correctly resolved relative path. + // We should use it directly and NOT overwrite relative_path. + // Only resolve absolute_path if we need to determine which storage to use (for secondary storages). + object_info->object_storage_to_use = object_storage; + + if (raw->absolute_path.has_value()) + { + auto [storage_to_use, key] + = resolveObjectStorageForPath("", raw->absolute_path.value(), object_storage, secondary_storages, getContext()); + + if (!key.empty() && storage_to_use != object_storage) + { + // File is in a different storage (secondary storage), use that storage + // BUT preserve the original relative_path from master - don't overwrite it! + object_info->object_storage_to_use = storage_to_use; + } + } } else { @@ -1181,7 +1436,10 @@ StorageObjectStorageSource::ArchiveIterator::createArchiveReader(ObjectInfoPtr o return DB::createArchiveReader( /* path_to_archive */ object_info->getPath(), - /* archive_read_function */ [=, this]() { return createReadBuffer(*object_info, object_storage, getContext(), log); }, + /* archive_read_function */ [=, this]() { + ObjectStoragePtr storage = object_info->getObjectStorage() ? object_info->getObjectStorage() : object_storage; + return createReadBuffer(*object_info, storage, getContext(), log); + }, /* archive_size */ size); } @@ -1203,7 +1461,12 @@ ObjectInfoPtr StorageObjectStorageSource::ArchiveIterator::next(size_t processor } if (!archive_object->metadata) - archive_object->metadata = object_storage->getObjectMetadata(archive_object->getPath()); + { + ObjectStoragePtr storage_to_use = archive_object->getObjectStorage(); + if (!storage_to_use) + storage_to_use = object_storage; + archive_object->metadata = storage_to_use->getObjectMetadata(archive_object->getPath()); + } archive_reader = createArchiveReader(archive_object); file_enumerator = archive_reader->firstFile(); @@ -1229,7 +1492,12 @@ ObjectInfoPtr StorageObjectStorageSource::ArchiveIterator::next(size_t processor return {}; if (!archive_object->metadata) - archive_object->metadata = object_storage->getObjectMetadata(archive_object->getPath()); + { + ObjectStoragePtr storage_to_use = archive_object->getObjectStorage(); + if (!storage_to_use) + storage_to_use = object_storage; + archive_object->metadata = storage_to_use->getObjectMetadata(archive_object->getPath()); + } archive_reader = createArchiveReader(archive_object); if (!archive_reader->fileExists(path_in_archive)) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 8145105a09e2..ebcc159d8679 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -8,8 +8,12 @@ #include #include #include +#include #include #include +#include + + namespace DB { @@ -87,6 +91,12 @@ class StorageObjectStorageSource : public ISource size_t total_rows_in_file = 0; LoggerPtr log = getLogger("StorageObjectStorageSource"); + struct ConstColumnWithValue + { + 
NameAndTypePair name_and_type; + Field value; + }; + struct ReaderHolder : private boost::noncopyable { public: @@ -95,7 +105,8 @@ class StorageObjectStorageSource : public ISource std::unique_ptr read_buf_, std::shared_ptr source_, std::unique_ptr pipeline_, - std::unique_ptr reader_); + std::unique_ptr reader_, + std::map && constant_columns_with_values_); ReaderHolder() = default; ReaderHolder(ReaderHolder && other) noexcept { *this = std::move(other); } @@ -114,6 +125,9 @@ class StorageObjectStorageSource : public ISource std::shared_ptr source; std::unique_ptr pipeline; std::unique_ptr reader; + + public: + std::map constant_columns_with_values; }; ReaderHolder reader; @@ -166,6 +180,7 @@ class StorageObjectStorageSource::ReadTaskIterator : public IObjectIterator, pri std::atomic_size_t index = 0; bool is_archive; ObjectStoragePtr object_storage; + SecondaryStorages secondary_storages; // Sometimes data can be located on a different storage /// path_to_archive -> archive reader. std::unordered_map> archive_readers; std::mutex archive_readers_mutex; @@ -175,17 +190,33 @@ class StorageObjectStorageSource::ReadTaskIterator : public IObjectIterator, pri class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithContext { public: + struct ListObjectsCacheWithKey + { + ListObjectsCacheWithKey(ObjectStorageListObjectsCache & cache_, const ObjectStorageListObjectsCache::Key & key_) : cache(cache_), key(key_) {} + + void set(ObjectStorageListObjectsCache::Value && value) const + { + cache.set(key, std::make_shared(std::move(value))); + } + + private: + ObjectStorageListObjectsCache & cache; + ObjectStorageListObjectsCache::Key key; + }; + + using ConfigurationPtr = std::shared_ptr; + GlobIterator( - ObjectStoragePtr object_storage_, - StorageObjectStorageConfigurationPtr configuration_, + const ObjectStorageIteratorPtr & object_storage_iterator_, + ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, const NamesAndTypesList & hive_columns_, ContextPtr context_, ObjectInfos * read_keys_, - size_t list_object_keys_size, bool throw_on_zero_files_match_, - std::function file_progress_callback_ = {}); + std::function file_progress_callback_ = {}, + std::unique_ptr list_cache_ = nullptr); ~GlobIterator() override = default; @@ -198,7 +229,7 @@ class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithCon void createFilterAST(const String & any_key); void fillBufferForKey(const std::string & uri_key); - const ObjectStoragePtr object_storage; + ObjectStorageIteratorPtr object_storage_iterator; const StorageObjectStorageConfigurationPtr configuration; const NamesAndTypesList virtual_columns; const NamesAndTypesList hive_columns; @@ -210,7 +241,6 @@ class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithCon ObjectInfos object_infos; ObjectInfos * read_keys; ExpressionActionsPtr filter_expr; - ObjectStorageIteratorPtr object_storage_iterator; bool recursive{false}; std::vector expanded_keys; std::vector::iterator expanded_keys_iter; @@ -223,6 +253,8 @@ class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithCon const ContextPtr local_context; std::function file_progress_callback; + std::unique_ptr list_cache; + ObjectInfos object_list; }; class StorageObjectStorageSource::KeysIterator : public IObjectIterator @@ -247,7 +279,7 @@ class StorageObjectStorageSource::KeysIterator : public IObjectIterator const ObjectStoragePtr object_storage; const NamesAndTypesList 
virtual_columns; const std::function file_progress_callback; - const std::vector keys; + const Strings keys; std::atomic index = 0; bool ignore_non_existent_files; bool skip_object_metadata; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp index 37b966cd7bbc..e665d99994e9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.cpp @@ -9,34 +9,57 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; -} + extern const int CANNOT_READ_ALL_DATA; +}; StorageObjectStorageStableTaskDistributor::StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_) + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_) : iterator(std::move(iterator_)) , send_over_whole_archive(send_over_whole_archive_) , connection_to_files(ids_of_nodes_.size()) , ids_of_nodes(std::move(ids_of_nodes_)) + , lock_object_storage_task_distribution_us(lock_object_storage_task_distribution_ms_ * 1000) , iterator_exhausted(false) { + Poco::Timestamp now; + size_t nodes = ids_of_nodes.size(); + for (size_t i = 0; i < nodes; ++i) + { + replica_to_files_to_be_processed[i] = std::list{}; + last_node_activity[i] = now; + } } ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getNextTask(size_t number_of_current_replica) { LOG_TRACE(log, "Received request from replica {} looking for a file", number_of_current_replica); - // 1. Check pre-queued files first - if (auto file = getPreQueuedFile(number_of_current_replica)) - return file; + saveLastNodeActivity(number_of_current_replica); - // 2. Try to find a matching file from the iterator - if (auto file = getMatchingFileFromIterator(number_of_current_replica)) - return file; + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost, can't set task for it anymore", + number_of_current_replica + ); + // 1. Check pre-queued files first + auto file = getPreQueuedFile(number_of_current_replica); + // 2. Try to find a matching file from the iterator + if (!file) + file = getMatchingFileFromIterator(number_of_current_replica); // 3. 
Process unprocessed files if iterator is exhausted - return getAnyUnprocessedFile(number_of_current_replica); + if (!file) + file = getAnyUnprocessedFile(number_of_current_replica); + + if (file) + processed_file_list_ptr->second.push_back(file); + + return file; } size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String & file_path) @@ -48,16 +71,27 @@ size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String return 0; /// Rendezvous hashing - size_t best_id = 0; - UInt64 best_weight = sipHash64(ids_of_nodes[0] + file_path); - for (size_t id = 1; id < nodes_count; ++id) + auto replica = replica_to_files_to_be_processed.begin(); + if (replica == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "No active replicas, can't find best replica for file {}", + file_path + ); + + size_t best_id = replica->first; + UInt64 best_weight = sipHash64(ids_of_nodes[best_id] + file_path); + ++replica; + while (replica != replica_to_files_to_be_processed.end()) { + size_t id = replica->first; UInt64 weight = sipHash64(ids_of_nodes[id] + file_path); if (weight > best_weight) { best_weight = weight; best_id = id; } + ++replica; } return best_id; } @@ -81,7 +115,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getPreQueuedFile(size_t auto next_file = files.back(); files.pop_back(); - auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getAbsolutePath().value_or(next_file->getPath()); auto it = unprocessed_files.find(file_path); if (it == unprocessed_files.end()) continue; @@ -135,7 +169,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter } else { - file_path = object_info->getPath(); + file_path = object_info->getAbsolutePath().value_or(object_info->getPath()); } size_t file_replica_idx = getReplicaForFile(file_path); @@ -159,7 +193,7 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter // Queue file for its assigned replica { std::lock_guard lock(mutex); - unprocessed_files.emplace(file_path, object_info); + unprocessed_files.emplace(file_path, std::make_pair(object_info, file_replica_idx)); connection_to_files[file_replica_idx].push_back(object_info); } } @@ -169,26 +203,96 @@ ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getMatchingFileFromIter ObjectInfoPtr StorageObjectStorageStableTaskDistributor::getAnyUnprocessedFile(size_t number_of_current_replica) { + /// Limit time of node activity to keep task in queue + Poco::Timestamp activity_limit; + Poco::Timestamp oldest_activity; + if (lock_object_storage_task_distribution_us > 0) + activity_limit -= lock_object_storage_task_distribution_us; + std::lock_guard lock(mutex); if (!unprocessed_files.empty()) { auto it = unprocessed_files.begin(); - auto next_file = it->second; - unprocessed_files.erase(it); - auto file_path = send_over_whole_archive ? 
next_file->getPathOrPathToArchiveIfArchive() : next_file->getPath(); + while (it != unprocessed_files.end()) + { + auto number_of_matched_replica = it->second.second; + auto last_activity = last_node_activity.find(number_of_matched_replica); + if (lock_object_storage_task_distribution_us <= 0 // file deferring is turned off + || it->second.second == number_of_current_replica // file matches the current replica + || last_activity == last_node_activity.end() // must never happen, last_activity is filled for each replica on start + || activity_limit > last_activity->second) // matched replica did not ask for new files for a while + { + auto next_file = it->second.first; + unprocessed_files.erase(it); + + auto file_path = send_over_whole_archive ? next_file->getPathOrPathToArchiveIfArchive() : next_file->getAbsolutePath().value_or(next_file->getPath()); + LOG_TRACE( + log, + "Iterator exhausted. Assigning unprocessed file {} to replica {} from matched replica {}", + file_path, + number_of_current_replica, + number_of_matched_replica + ); + + return next_file; + } + + oldest_activity = std::min(oldest_activity, last_activity->second); + ++it; + } + LOG_TRACE( log, - "Iterator exhausted. Assigning unprocessed file {} to replica {}", - file_path, - number_of_current_replica + "No unprocessed file for replica {}, need to retry after {} us", + number_of_current_replica, + oldest_activity - activity_limit ); - return next_file; + /// All unprocessed files are owned by alive replicas with recent activity + /// Need to retry after (oldest_activity - activity_limit) microseconds + PathWithMetadata::CommandInTaskResponse response; + response.setRetryAfterUs(oldest_activity - activity_limit); + return std::make_shared(response.toString()); } return {}; } +void StorageObjectStorageStableTaskDistributor::saveLastNodeActivity(size_t number_of_current_replica) +{ + Poco::Timestamp now; + std::lock_guard lock(mutex); + last_node_activity[number_of_current_replica] = now; +} + +void StorageObjectStorageStableTaskDistributor::rescheduleTasksFromReplica(size_t number_of_current_replica) +{ + LOG_INFO(log, "Replica {} is marked as lost, tasks are returned to queue", number_of_current_replica); + std::lock_guard lock(mutex); + + auto processed_file_list_ptr = replica_to_files_to_be_processed.find(number_of_current_replica); + if (processed_file_list_ptr == replica_to_files_to_be_processed.end()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Replica number {} was marked as lost already", + number_of_current_replica + ); + + if (replica_to_files_to_be_processed.size() < 2) + throw Exception( + ErrorCodes::CANNOT_READ_ALL_DATA, + "All replicas were marked as lost" + ); + + replica_to_files_to_be_processed.erase(number_of_current_replica); + for (const auto & file : processed_file_list_ptr->second) + { + auto file_replica_idx = getReplicaForFile(file->getAbsolutePath().value_or(file->getPath())); + unprocessed_files.emplace(file->getAbsolutePath().value_or(file->getPath()), std::make_pair(file, file_replica_idx)); + connection_to_files[file_replica_idx].push_back(file); + } +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h index 02d3ba7a030f..25673b3eeb02 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h @@ -4,7 +4,13 @@ #include #include #include +#include + +#include + #include 
+#include +#include #include #include #include @@ -18,24 +24,34 @@ class StorageObjectStorageStableTaskDistributor StorageObjectStorageStableTaskDistributor( std::shared_ptr iterator_, std::vector && ids_of_nodes_, - bool send_over_whole_archive_); + bool send_over_whole_archive_, + uint64_t lock_object_storage_task_distribution_ms_); ObjectInfoPtr getNextTask(size_t number_of_current_replica); + /// Insert objects back to unprocessed files + void rescheduleTasksFromReplica(size_t number_of_current_replica); + private: size_t getReplicaForFile(const String & file_path); ObjectInfoPtr getPreQueuedFile(size_t number_of_current_replica); ObjectInfoPtr getMatchingFileFromIterator(size_t number_of_current_replica); ObjectInfoPtr getAnyUnprocessedFile(size_t number_of_current_replica); + void saveLastNodeActivity(size_t number_of_current_replica); + const std::shared_ptr iterator; const bool send_over_whole_archive; std::vector> connection_to_files; - std::unordered_map unprocessed_files; + std::unordered_map> unprocessed_files; std::vector ids_of_nodes; + std::unordered_map last_node_activity; + Poco::Timestamp::TimeDiff lock_object_storage_task_distribution_us; + std::unordered_map> replica_to_files_to_be_processed; + std::mutex mutex; bool iterator_exhausted = false; diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index 5b5cc208ef86..8ae4ca36654e 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -9,6 +9,18 @@ #include #include #include +#include +#include +#include +#include +#include +#if USE_AWS_S3 +#include +#endif +#if USE_HDFS +#include +#endif + namespace DB { @@ -19,6 +31,173 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +namespace +{ + +std::string normalizeScheme(const std::string & scheme) +{ + auto scheme_lowercase = Poco::toLower(scheme); + + if (scheme_lowercase == "s3a" || scheme_lowercase == "s3n") + scheme_lowercase = "s3"; + else if (scheme_lowercase == "wasb" || scheme_lowercase == "wasbs" || scheme_lowercase == "abfss") + scheme_lowercase = "abfs"; + + return scheme_lowercase; +} + +std::string factoryTypeForScheme(const std::string & normalized_scheme) +{ + if (normalized_scheme == "s3") return "s3"; + if (normalized_scheme == "abfs") return "azure"; + if (normalized_scheme == "hdfs") return "hdfs"; + if (normalized_scheme == "file") return "local"; + return ""; +} + +bool isAbsolutePath(const std::string & path) +{ + if (!path.empty() && (path.front() == '/' || path.find("://") != std::string_view::npos)) + return true; + + return false; +} + +/// Normalize a path string by removing redundant components and leading slashes. +std::string normalizePathString(const std::string & path) +{ + std::filesystem::path fs_path(path); + std::filesystem::path normalized = fs_path.lexically_normal(); + + std::string normalized_result = normalized.string(); + + while (!normalized_result.empty() && normalized_result.front() == '/') + normalized_result = normalized_result.substr(1); + + return normalized_result; +} + +#if USE_AWS_S3 +/// For s3:// URIs (generic), bucket needs to match. +/// For explicit http(s):// URIs, both bucket and endpoint must match. 
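The two helpers above pin down how schemes and S3 endpoints are compared before the real `s3URIMatches` below is applied: alias schemes (`s3a`, `s3n`, `wasb`, `wasbs`, `abfss`) collapse to a canonical one, and a generic `s3://` URI only has to match the base bucket, while an explicit `http(s)://` URI must match both bucket and endpoint. The standalone sketch below restates those two rules with plain `std::string` arguments so they can be checked without `S3::URI` or Poco; the names `normalizeSchemeSketch` and `s3UriMatchesSketch` are illustrative and not part of this patch.

```cpp
#include <cassert>
#include <string>

// Illustrative restatement of the scheme-normalization rule from the patch
// (assumption: input is already lower-case, unlike the Poco::toLower version above).
std::string normalizeSchemeSketch(const std::string & scheme)
{
    if (scheme == "s3a" || scheme == "s3n")
        return "s3";
    if (scheme == "wasb" || scheme == "wasbs" || scheme == "abfss")
        return "abfs";
    return scheme;
}

// Illustrative restatement of the S3 matching rule: a generic s3:// URI matches on
// bucket alone, an explicit http(s):// URI must also match the endpoint.
bool s3UriMatchesSketch(
    const std::string & target_bucket,
    const std::string & target_endpoint,
    const std::string & base_bucket,
    const std::string & base_endpoint,
    const std::string & normalized_target_scheme)
{
    const bool bucket_matches = target_bucket == base_bucket;
    const bool endpoint_matches = target_endpoint == base_endpoint;
    const bool generic_s3 = normalized_target_scheme == "s3";
    return bucket_matches && (endpoint_matches || generic_s3);
}

int main()
{
    assert(normalizeSchemeSketch("s3a") == "s3");
    assert(s3UriMatchesSketch("bucket", "", "bucket", "https://minio:9000", "s3"));    // generic s3:// URI
    assert(!s3UriMatchesSketch("bucket", "https://other", "bucket", "https://minio:9000", "https"));
    return 0;
}
```

The generic-scheme exception matters because `s3://bucket/...` paths carried in metadata have no endpoint of their own, so only the bucket can be compared against the base storage.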
+bool s3URIMatches(const S3::URI & target_uri, const std::string & base_bucket, const std::string & base_endpoint, const std::string & target_scheme_normalized) +{ + bool bucket_matches = (target_uri.bucket == base_bucket); + bool endpoint_matches = (target_uri.endpoint == base_endpoint); + bool is_generic_s3_uri = (target_scheme_normalized == "s3"); + return bucket_matches && (endpoint_matches || is_generic_s3_uri); +} +#endif + +std::pair getOrCreateStorageAndKey( + const std::string & cache_key, + const std::string & key_to_use, + const std::string & storage_type, + SecondaryStorages & secondary_storages, + const ContextPtr & context, + std::function configure_fn) +{ + { + std::lock_guard lock(secondary_storages.mutex); + if (auto it = secondary_storages.storages.find(cache_key); it != secondary_storages.storages.end()) + return {it->second, key_to_use}; + } + + Poco::AutoPtr cfg(new Poco::Util::MapConfiguration); + const std::string config_prefix = "object_storages." + cache_key; + + cfg->setString(config_prefix + ".object_storage_type", storage_type); + + configure_fn(*cfg, config_prefix); + + ObjectStoragePtr storage = ObjectStorageFactory::instance().create(cache_key, *cfg, config_prefix, context, /*skip_access_check*/ true); + + { + std::lock_guard lock(secondary_storages.mutex); + auto [it, inserted] = secondary_storages.storages.emplace(cache_key, storage); + if (!inserted) + return {it->second, key_to_use}; + } + + return {storage, key_to_use}; +} + +/// Normalize a path (relative to table location or absolute path) to a key that will be looked up in the object storage. +std::string normalizePathToStorageRoot(const std::string & table_location, const std::string & path) +{ + if (table_location.empty()) + { + if (!path.empty() && path.front() == '/') + return path.substr(1); + return path; + } + + if (isAbsolutePath(path)) + return SchemeAuthorityKey(path).key; // Absolute path, return the key part + + SchemeAuthorityKey base{table_location}; + if (base.key.empty()) + return path; // Table location has no key prefix, return the path as is + + std::string base_key_trimmed = base.key; + while (!base_key_trimmed.empty() && base_key_trimmed.front() == '/') + base_key_trimmed = base_key_trimmed.substr(1); + while (!base_key_trimmed.empty() && base_key_trimmed.back() == '/') + base_key_trimmed.pop_back(); + + std::string rel_path = path; + while (!rel_path.empty() && rel_path.front() == '/') + rel_path = rel_path.substr(1); + + if (!base_key_trimmed.empty() && (rel_path == base_key_trimmed || rel_path.starts_with(base_key_trimmed + "/"))) + return normalizePathString(rel_path); // Path already includes table location + + std::string result = base.key; + if (!result.empty() && result.back() != '/') + result += '/'; + result += rel_path; + + return normalizePathString(result); +} + +} + +// TODO: handle https://s3.amazonaws.com/bucketname/... 
properly +SchemeAuthorityKey::SchemeAuthorityKey(const std::string & uri) +{ + if (uri.empty()) + return; + + if (auto scheme_sep = uri.find("://"); scheme_sep != std::string_view::npos) + { + scheme = Poco::toLower(uri.substr(0, scheme_sep)); + auto rest = uri.substr(scheme_sep + 3); // skip :// + + // authority is up to next '/' + auto slash = rest.find('/'); + if (slash == std::string_view::npos) + { + authority = std::string(rest); + key = "/"; // Path obviously incorrect, but it will be dealt with by caller + return; + } + authority = std::string(rest.substr(0, slash)); + key = std::string(rest.substr(++slash)); // do not keep leading '/' + return; + } + + // if part has no scheme and starts with '/' -- it is an absolute uri for local file: file:///path + if (uri.front() == '/') + { + scheme = "file"; + key = std::string(uri); + return; + } + + // Relative path + key = std::string(uri); +} + std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, const StorageObjectStorageConfiguration & configuration, @@ -54,14 +233,13 @@ std::optional checkAndGetNewFileOnInsertIfNeeded( void resolveSchemaAndFormat( ColumnsDescription & columns, - std::string & format, ObjectStoragePtr object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + StorageObjectStorageConfigurationPtr & configuration, std::optional format_settings, std::string & sample_path, const ContextPtr & context) { - if (format == "auto") + if (configuration->getFormat() == "auto") { if (configuration->isDataLakeConfiguration()) { @@ -83,21 +261,23 @@ void resolveSchemaAndFormat( if (columns.empty()) { - if (format == "auto") + if (configuration->getFormat() == "auto") { + std::string format; std::tie(columns, format) = StorageObjectStorage::resolveSchemaAndFormatFromData( object_storage, configuration, format_settings, sample_path, context); + configuration->setFormat(format); } else { - chassert(!format.empty()); + chassert(!configuration->getFormat().empty()); columns = StorageObjectStorage::resolveSchemaFromData(object_storage, configuration, format_settings, sample_path, context); } } } - else if (format == "auto") + else if (configuration->getFormat() == "auto") { - format = StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, sample_path, context); + configuration->setFormat(StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, sample_path, context)); } validateSupportedColumns(columns, *configuration); @@ -116,11 +296,210 @@ void validateSupportedColumns( } } -namespace Setting +std::string makeAbsolutePath(const std::string & table_location, const std::string & path) { -extern const SettingsUInt64 max_download_buffer_size; -extern const SettingsBool use_cache_for_count_from_files; -extern const SettingsString filesystem_cache_name; -extern const SettingsUInt64 filesystem_cache_boundary_alignment; + if (isAbsolutePath(path)) + return path; + + auto table_location_decomposed = SchemeAuthorityKey(table_location); + + std::string normalized_key = normalizePathToStorageRoot(table_location, path); + + if (!table_location_decomposed.scheme.empty()) + return table_location_decomposed.scheme + "://" + table_location_decomposed.authority + "/" + normalized_key; + + return normalized_key; } + +std::pair resolveObjectStorageForPath( + const std::string & table_location, + const std::string & path, + const DB::ObjectStoragePtr & base_storage, + SecondaryStorages & secondary_storages, + const DB::ContextPtr & 
context) +{ + if (!isAbsolutePath(path)) + return {base_storage, normalizePathToStorageRoot(table_location, path)}; // Relative path definitely goes to base storage + + SchemeAuthorityKey table_location_decomposed{table_location}; + SchemeAuthorityKey target_decomposed{path}; + + const std::string base_scheme_normalized = normalizeScheme(table_location_decomposed.scheme); + const std::string target_scheme_normalized = normalizeScheme(target_decomposed.scheme); + + // For S3 URIs, use S3::URI to properly handle all kinds of URIs, e.g. https://s3.amazonaws.com/bucket/... == s3://bucket/... + #if USE_AWS_S3 + if (target_scheme_normalized == "s3" || target_scheme_normalized == "https" || target_scheme_normalized == "http") + { + std::string normalized_path = path; + if (target_decomposed.scheme == "s3a" || target_decomposed.scheme == "s3n") + { + normalized_path = "s3://" + target_decomposed.authority + "/" + target_decomposed.key; + } + S3::URI s3_uri(normalized_path); + + std::string key_to_use = s3_uri.key; + + bool use_base_storage = false; + if (base_storage->getType() == ObjectStorageType::S3) + { + if (auto s3_storage = std::dynamic_pointer_cast(base_storage)) + { + const std::string base_bucket = s3_storage->getObjectsNamespace(); + const std::string base_endpoint = s3_storage->getDescription(); + + if (s3URIMatches(s3_uri, base_bucket, base_endpoint, target_scheme_normalized)) + use_base_storage = true; + } + } + + if (!use_base_storage && (base_scheme_normalized == "s3" || base_scheme_normalized == "https" || base_scheme_normalized == "http")) + { + std::string normalized_table_location = table_location; + if (table_location_decomposed.scheme == "s3a" || table_location_decomposed.scheme == "s3n") + { + normalized_table_location = "s3://" + table_location_decomposed.authority + "/" + table_location_decomposed.key; + } + S3::URI base_s3_uri(normalized_table_location); + + if (s3URIMatches(s3_uri, base_s3_uri.bucket, base_s3_uri.endpoint, target_scheme_normalized)) + use_base_storage = true; + } + + if (use_base_storage) + return {base_storage, key_to_use}; + + const std::string storage_cache_key = "s3://" + s3_uri.bucket + "@" + (s3_uri.endpoint.empty() ? "amazonaws.com" : s3_uri.endpoint); + + return getOrCreateStorageAndKey( + storage_cache_key, + key_to_use, + "s3", + secondary_storages, + context, + [&](Poco::Util::MapConfiguration & cfg, const std::string & config_prefix) + { + // Use the full endpoint or construct it from bucket + std::string endpoint = s3_uri.endpoint.empty() + ? 
("https://" + s3_uri.bucket + ".s3.amazonaws.com") + : s3_uri.endpoint; + cfg.setString(config_prefix + ".endpoint", endpoint); + + // Copy credentials from base storage if it's also S3 + if (base_storage->getType() == ObjectStorageType::S3) + { + if (auto s3_storage = std::dynamic_pointer_cast(base_storage)) + { + if (auto s3_client = s3_storage->tryGetS3StorageClient()) + { + const auto credentials = s3_client->getCredentials(); + const String & access_key_id = credentials.GetAWSAccessKeyId(); + const String & secret_access_key = credentials.GetAWSSecretKey(); + const String & session_token = credentials.GetSessionToken(); + const String & region = s3_client->getRegion(); + + if (!access_key_id.empty()) + cfg.setString(config_prefix + ".access_key_id", access_key_id); + if (!secret_access_key.empty()) + cfg.setString(config_prefix + ".secret_access_key", secret_access_key); + if (!session_token.empty()) + cfg.setString(config_prefix + ".session_token", session_token); + if (!region.empty()) + cfg.setString(config_prefix + ".region", region); + } + } + } + }); + } + #endif + + #if USE_HDFS + if (target_scheme_normalized == "hdfs") + { + bool use_base_storage = false; + + // Check if base_storage matches (only if it's HDFS) + if (base_storage->getType() == ObjectStorageType::HDFS) + { + if (auto hdfs_storage = std::dynamic_pointer_cast(base_storage)) + { + const std::string base_url = hdfs_storage->getDescription(); + // Extract endpoint from base URL (hdfs://namenode:port/path -> hdfs://namenode:port) + std::string base_endpoint; + if (auto pos = base_url.find('/', base_url.find("//") + 2); pos != std::string::npos) + base_endpoint = base_url.substr(0, pos); + else + base_endpoint = base_url; + + // For HDFS, compare endpoints (namenode addresses) + std::string target_endpoint = target_scheme_normalized + "://" + target_decomposed.authority; + + if (base_endpoint == target_endpoint) + use_base_storage = true; + + // Also check if table_location matches + if (!use_base_storage && base_scheme_normalized == "hdfs") + { + if (table_location_decomposed.authority == target_decomposed.authority) + use_base_storage = true; + } + } + } + + if (use_base_storage) + return {base_storage, target_decomposed.key}; + } + #endif + + /// Fallback for schemes not handled above (e.g., abfs, file) + if (base_scheme_normalized == target_scheme_normalized && table_location_decomposed.authority == target_decomposed.authority) + return {base_storage, target_decomposed.key}; + + const std::string cache_key = target_scheme_normalized + "://" + target_decomposed.authority; + + const std::string type_for_factory = factoryTypeForScheme(target_scheme_normalized); + if (type_for_factory.empty()) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unsupported storage scheme '{}' in path '{}'", target_scheme_normalized, path); + + std::string key_to_use = target_decomposed.key; + if (target_scheme_normalized == "file") + key_to_use = "/" + target_decomposed.key; // file:///absolute/path/to/file -> key = /absolute/path/to/file (full POSIX path) + + /// Handle storage types that need new storage creation + return getOrCreateStorageAndKey( + cache_key, + key_to_use, + type_for_factory, + secondary_storages, + context, + [&](Poco::Util::MapConfiguration & cfg, const std::string & config_prefix) + { + if (target_scheme_normalized == "file") + { + std::filesystem::path fs_path(key_to_use); + std::filesystem::path parent = fs_path.parent_path(); + std::string dir_path = parent.string(); + + if (dir_path.empty() || dir_path == 
"/") + dir_path = "/"; + else if (dir_path.back() != '/') + dir_path += '/'; + + cfg.setString(config_prefix + ".path", dir_path); + } + else if (target_scheme_normalized == "abfs") + { + cfg.setString(config_prefix + ".endpoint", target_scheme_normalized + "://" + target_decomposed.authority); + } + else if (target_scheme_normalized == "hdfs") + { + // HDFS endpoint must end with '/' + auto endpoint = target_scheme_normalized + "://" + target_decomposed.authority; + if (!endpoint.empty() && endpoint.back() != '/') + endpoint.push_back('/'); + cfg.setString(config_prefix + ".endpoint", endpoint); + } + }); +} + } diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index e7964778080b..2d99dbe269b7 100644 --- a/src/Storages/ObjectStorage/Utils.h +++ b/src/Storages/ObjectStorage/Utils.h @@ -1,11 +1,35 @@ #pragma once #include +#include +#include +#include + namespace DB { class IObjectStorage; +/// Thread-safe wrapper for secondary object storages map +struct SecondaryStorages +{ + mutable std::mutex mutex; + std::map storages; +}; + +// A URI splitted into components +// s3://bucket/a/b -> scheme="s3", authority="bucket", path="/a/b" +// file:///var/x -> scheme="file", authority="", path="/var/x" +// /abs/p -> scheme="", authority="", path="/abs/p" +struct SchemeAuthorityKey +{ + explicit SchemeAuthorityKey(const std::string & uri); + + std::string scheme; + std::string authority; + std::string key; +}; + std::optional checkAndGetNewFileOnInsertIfNeeded( const IObjectStorage & object_storage, const StorageObjectStorageConfiguration & configuration, @@ -15,9 +39,8 @@ std::optional checkAndGetNewFileOnInsertIfNeeded( void resolveSchemaAndFormat( ColumnsDescription & columns, - std::string & format, ObjectStoragePtr object_storage, - const StorageObjectStorageConfigurationPtr & configuration, + StorageObjectStorageConfigurationPtr & configuration, std::optional format_settings, std::string & sample_path, const ContextPtr & context); @@ -32,4 +55,17 @@ std::unique_ptr createReadBuffer( const ContextPtr & context_, const LoggerPtr & log, const std::optional & read_settings = std::nullopt); + +std::string makeAbsolutePath(const std::string & table_location, const std::string & path); + +/// Resolve object storage and key for reading from that storage +/// If path is relative -- it must be read from base_storage +/// Otherwise, look for a suitable storage in secondary_storages +std::pair resolveObjectStorageForPath( + const std::string & table_location, + const std::string & path, + const DB::ObjectStoragePtr & base_storage, + SecondaryStorages & secondary_storages, + const DB::ContextPtr & context); + } diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp index 708e9ae34b9b..cc031aad68a1 100644 --- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,7 @@ namespace // LocalObjectStorage is only supported for Iceberg Datalake operations where Avro format is required. For regular file access, use FileStorage instead. 
#if USE_AWS_S3 || USE_AZURE_BLOB_STORAGE || USE_HDFS || USE_AVRO -std::shared_ptr +StoragePtr createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObjectStorageConfigurationPtr configuration) { auto & engine_args = args.engine_args; @@ -45,7 +46,16 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); const auto context = args.getLocalContext(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, context, false); + + std::string cluster_name; + + if (args.storage_def->settings) + { + if (const auto * value = args.storage_def->settings->changes.tryGet("object_storage_cluster")) + cluster_name = value->safeGet(); + } + + configuration->initialize(args.engine_args, context, false); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. Settings from current @@ -72,23 +82,25 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject ContextMutablePtr context_copy = Context::createCopy(args.getContext()); Settings settings_copy = args.getLocalContext()->getSettingsCopy(); context_copy->setSettings(settings_copy); - return std::make_shared( + return std::make_shared( + cluster_name, configuration, // We only want to perform write actions (e.g. create a container in Azure) when the table is being created, // and we want to avoid it when we load the table after a server restart. configuration->createObjectStorage(context, /* is_readonly */ args.mode != LoadingStrictnessLevel::CREATE), - context_copy, /// Use global context. args.table_id, args.columns, args.constraints, + partition_by, + context_copy, /// Use global context. 
args.comment, format_settings, args.mode, configuration->getCatalog(context, args.query.attach), args.query.if_not_exists, - /* is_datalake_query*/ false, - /* distributed_processing */ false, - partition_by); + /* is_datalake_query */ false, + /* is_table_function */ false, + /* lazy_init */ false); } #endif @@ -198,13 +210,12 @@ static DataLakeStorageSettingsPtr getDataLakeStorageSettings(const ASTStorage & void registerStorageIceberg(StorageFactory & factory) { -#if USE_AWS_S3 factory.registerStorage( IcebergDefinition::storage_engine_name, [&](const StorageFactory::Arguments & args) { const auto storage_settings = getDataLakeStorageSettings(*args.storage_def); - auto configuration = std::make_shared(storage_settings); + auto configuration = std::make_shared(storage_settings); return createStorageObjectStorage(args, configuration); }, { @@ -215,6 +226,7 @@ void registerStorageIceberg(StorageFactory & factory) .has_builtin_setting_fn = DataLakeStorageSettings::hasBuiltin, }); +# if USE_AWS_S3 factory.registerStorage( IcebergS3Definition::storage_engine_name, [&](const StorageFactory::Arguments & args) @@ -288,7 +300,7 @@ void registerStorageIceberg(StorageFactory & factory) #if USE_PARQUET && USE_DELTA_KERNEL_RS void registerStorageDeltaLake(StorageFactory & factory) { -#if USE_AWS_S3 +# if USE_AWS_S3 factory.registerStorage( DeltaLakeDefinition::storage_engine_name, [&](const StorageFactory::Arguments & args) diff --git a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp index b00c1d609fa1..47a45d925ebf 100644 --- a/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp +++ b/src/Storages/ObjectStorage/tests/gtest_rendezvous_hashing.cpp @@ -101,7 +101,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {6})); @@ -110,7 +110,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 4})); @@ -119,7 +119,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 2, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 7, 8})); @@ -128,7 +128,7 @@ TEST(RendezvousHashing, SingleNode) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; 
ASSERT_TRUE(extractNForReplica(distributor, paths, 3, 10)); ASSERT_TRUE(checkHead(paths, {3, 9})); @@ -139,7 +139,7 @@ TEST(RendezvousHashing, MultipleNodes) { auto iterator = makeIterator(); std::vector replicas = {"replica0", "replica1", "replica2", "replica3"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); { std::vector paths; @@ -171,7 +171,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 0, 10)); ASSERT_TRUE(checkHead(paths, {1, 5, 6, 7, 8, 9})); @@ -180,7 +180,7 @@ TEST(RendezvousHashing, SingleNodeReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths; ASSERT_TRUE(extractNForReplica(distributor, paths, 1, 10)); ASSERT_TRUE(checkHead(paths, {0, 2, 3, 4})); @@ -191,7 +191,7 @@ TEST(RendezvousHashing, MultipleNodesReducedCluster) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); { std::vector paths; @@ -210,7 +210,7 @@ TEST(RendezvousHashing, MultipleNodesReducedClusterOneByOne) { auto iterator = makeIterator(); std::vector replicas = {"replica2", "replica1"}; - StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false); + StorageObjectStorageStableTaskDistributor distributor(iterator, std::move(replicas), false, 0); std::vector paths0; std::vector paths1; diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index 5a542e8b9b11..0d334fcd4d10 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -228,12 +228,12 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( validateSettings(*queue_settings_, is_attach); object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); - FormatFactory::instance().checkFormatName(configuration->format); + FormatFactory::instance().checkFormatName(configuration->getFormat()); configuration->check(context_); ColumnsDescription columns{columns_}; std::string sample_path; - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, sample_path, context_); + resolveSchemaAndFormat(columns, object_storage, configuration, format_settings, sample_path, context_); configuration->check(context_); StorageInMemoryMetadata storage_metadata; @@ -248,7 +248,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); auto table_metadata = ObjectStorageQueueMetadata::syncWithKeeper( - zk_path, *queue_settings_, storage_metadata.getColumns(), configuration_->format, 
context_, is_attach, log); + zk_path, *queue_settings_, storage_metadata.getColumns(), configuration_->getFormat(), context_, is_attach, log); ObjectStorageType storage_type = engine_name == "S3Queue" ? ObjectStorageType::S3 : ObjectStorageType::Azure; @@ -378,7 +378,7 @@ void StorageObjectStorageQueue::renameInMemory(const StorageID & new_table_id) bool StorageObjectStorageQueue::supportsSubsetOfColumns(const ContextPtr & context_) const { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context_, format_settings); + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->getFormat(), context_, format_settings); } class ReadFromObjectStorageQueue : public SourceStepWithFilter diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h index 9627934ca61d..d7bc2bfd5208 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.h @@ -57,7 +57,7 @@ class StorageObjectStorageQueue : public IStorage, WithContext void renameInMemory(const StorageID & new_table_id) override; - const auto & getFormatName() const { return configuration->format; } + const auto & getFormatName() const { return configuration->getFormat(); } const fs::path & getZooKeeperPath() const { return zk_path; } diff --git a/src/Storages/ObjectStorageQueue/registerQueueStorage.cpp b/src/Storages/ObjectStorageQueue/registerQueueStorage.cpp index 43460ccda4da..56a80b2e53f1 100644 --- a/src/Storages/ObjectStorageQueue/registerQueueStorage.cpp +++ b/src/Storages/ObjectStorageQueue/registerQueueStorage.cpp @@ -41,7 +41,7 @@ StoragePtr createQueueStorage(const StorageFactory::Arguments & args) throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments"); auto configuration = std::make_shared(); - StorageObjectStorageConfiguration::initialize(*configuration, args.engine_args, args.getContext(), false); + configuration->initialize(args.engine_args, args.getContext(), false); // Use format settings from global server context + settings from // the SETTINGS clause of the create query. 
Settings from current diff --git a/src/Storages/PartitionCommands.cpp b/src/Storages/PartitionCommands.cpp index c12da89d7ed4..b8ef557604bc 100644 --- a/src/Storages/PartitionCommands.cpp +++ b/src/Storages/PartitionCommands.cpp @@ -130,6 +130,25 @@ std::optional PartitionCommand::parse(const ASTAlterCommand * res.with_name = command_ast->with_name; return res; } + if (command_ast->type == ASTAlterCommand::EXPORT_PART) + { + PartitionCommand res; + res.type = EXPORT_PART; + res.partition = command_ast->partition->clone(); + res.part = command_ast->part; + res.to_database = command_ast->to_database; + res.to_table = command_ast->to_table; + return res; + } + if (command_ast->type == ASTAlterCommand::EXPORT_PARTITION) + { + PartitionCommand res; + res.type = EXPORT_PARTITION; + res.partition = command_ast->partition->clone(); + res.to_database = command_ast->to_database; + res.to_table = command_ast->to_table; + return res; + } return {}; } @@ -171,6 +190,10 @@ std::string PartitionCommand::typeToString() const return "UNFREEZE ALL"; case PartitionCommand::Type::REPLACE_PARTITION: return "REPLACE PARTITION"; + case PartitionCommand::Type::EXPORT_PART: + return "EXPORT PART"; + case PartitionCommand::Type::EXPORT_PARTITION: + return "EXPORT PARTITION"; default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Uninitialized partition command"); } diff --git a/src/Storages/PartitionCommands.h b/src/Storages/PartitionCommands.h index 917e510f24b4..e3f36d0e7c1f 100644 --- a/src/Storages/PartitionCommands.h +++ b/src/Storages/PartitionCommands.h @@ -33,6 +33,8 @@ struct PartitionCommand UNFREEZE_ALL_PARTITIONS, UNFREEZE_PARTITION, REPLACE_PARTITION, + EXPORT_PART, + EXPORT_PARTITION, }; Type type = UNKNOWN; diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index 078237483154..2a3df191dd92 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -26,10 +26,12 @@ namespace ErrorCodes PartitionedSink::PartitionedSink( std::shared_ptr partition_strategy_, + std::shared_ptr sink_creator_, ContextPtr context_, SharedHeader source_header_) : SinkToStorage(source_header_) , partition_strategy(partition_strategy_) + , sink_creator(sink_creator_) , context(context_) , source_header(source_header_) { @@ -41,7 +43,7 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) auto it = partition_id_to_sink.find(partition_key); if (it == partition_id_to_sink.end()) { - auto sink = createSinkForPartition(partition_key.toString()); + auto sink = sink_creator->createSinkForPartition(partition_key.toString()); std::tie(it, std::ignore) = partition_id_to_sink.emplace(partition_key, sink); } diff --git a/src/Storages/PartitionedSink.h b/src/Storages/PartitionedSink.h index 444624ba6c8e..bc446477e9dd 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -17,10 +17,17 @@ namespace DB class PartitionedSink : public SinkToStorage { public: + struct SinkCreator + { + virtual ~SinkCreator() = default; + virtual SinkPtr createSinkForPartition(const String & partition_id) = 0; + }; + static constexpr auto PARTITION_ID_WILDCARD = "{_partition_id}"; PartitionedSink( std::shared_ptr partition_strategy_, + std::shared_ptr sink_creator_, ContextPtr context_, SharedHeader source_header_); @@ -34,16 +41,15 @@ class PartitionedSink : public SinkToStorage void onFinish() override; - virtual SinkPtr createSinkForPartition(const String & partition_id) = 0; - static void validatePartitionKey(const String & str, bool allow_slash); 
static String replaceWildcards(const String & haystack, const String & partition_id); + protected: std::shared_ptr partition_strategy; - private: + std::shared_ptr sink_creator; ContextPtr context; SharedHeader source_header; diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index f5b86d8f783f..d518887b3f6d 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -87,6 +88,7 @@ #include #include +#include #include #include @@ -102,6 +104,7 @@ #include #include +#include #include #include @@ -115,9 +118,9 @@ #include #include #include - #include #include +#include namespace fs = std::filesystem; @@ -148,6 +151,29 @@ namespace CurrentMetrics namespace DB { +namespace +{ +void replaceCurrentDatabaseFunction(ASTPtr & ast, const ContextPtr & context) +{ + if (!ast) + return; + + if (auto * func = ast->as()) + { + if (func->name == "currentDatabase") + { + ast = evaluateConstantExpressionForDatabaseName(ast, context); + return; + } + } + + for (auto & child : ast->children) + replaceCurrentDatabaseFunction(child, context); +} + + +} + namespace Setting { extern const SettingsBool allow_experimental_analyzer; @@ -178,6 +204,7 @@ namespace Setting extern const SettingsUInt64 allow_experimental_parallel_reading_from_replicas; extern const SettingsBool prefer_global_in_and_join; extern const SettingsBool enable_global_with_statement; + extern const SettingsBool allow_experimental_hybrid_table; } namespace DistributedSetting @@ -198,6 +225,8 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int STORAGE_REQUIRES_PARAMETER; extern const int BAD_ARGUMENTS; + extern const int UNKNOWN_DATABASE; + extern const int UNKNOWN_TABLE; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INCORRECT_NUMBER_OF_COLUMNS; extern const int INFINITE_LOOP; @@ -209,6 +238,7 @@ namespace ErrorCodes extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES; extern const int ARGUMENT_OUT_OF_BOUND; extern const int TOO_LARGE_DISTRIBUTED_DEPTH; + extern const int SUPPORT_IS_DISABLED; } namespace ActionLocks @@ -522,6 +552,10 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( if (to_stage == QueryProcessingStage::WithMergeableState) return QueryProcessingStage::WithMergeableState; + // TODO: check logic + if (!segments.empty()) + nodes += segments.size(); + /// If there is only one node, the query can be fully processed by the /// shard, initiator will work as a proxy only. 
if (nodes == 1) @@ -564,6 +598,9 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage( bool StorageDistributed::isShardingKeySuitsQueryTreeNodeExpression( const QueryTreeNodePtr & expr, const SelectQueryInfo & query_info) const { + if (!segments.empty()) + return false; + ColumnsWithTypeAndName empty_input_columns; ColumnNodePtrWithHashSet empty_correlated_columns_set; // When comparing sharding key expressions, we need to ignore table qualifiers in column names @@ -604,6 +641,7 @@ bool StorageDistributed::isShardingKeySuitsQueryTreeNodeExpression( return allOutputsDependsOnlyOnAllowedNodes(sharding_key_dag, irreducibe_nodes, matches); } +// TODO: support additional segments std::optional StorageDistributed::getOptimizedQueryProcessingStageAnalyzer(const SelectQueryInfo & query_info, const Settings & settings) const { bool optimize_sharding_key_aggregation = settings[Setting::optimize_skip_unused_shards] && settings[Setting::optimize_distributed_group_by_sharding_key] @@ -662,6 +700,7 @@ std::optional StorageDistributed::getOptimizedQueryP return QueryProcessingStage::Complete; } +// TODO: support additional segments std::optional StorageDistributed::getOptimizedQueryProcessingStage(const SelectQueryInfo & query_info, const Settings & settings) const { bool optimize_sharding_key_aggregation = settings[Setting::optimize_skip_unused_shards] && settings[Setting::optimize_distributed_group_by_sharding_key] @@ -771,9 +810,11 @@ static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr StorageSnapshotPtr StorageDistributed::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const { + /// TODO: support additional segments return getStorageSnapshotForQuery(metadata_snapshot, nullptr, query_context); } +/// TODO: support additional segments StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery( const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr /*query_context*/) const { @@ -909,7 +950,8 @@ bool rewriteJoinToGlobalJoinIfNeeded(QueryTreeNodePtr join_tree) QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, const StorageSnapshotPtr & distributed_storage_snapshot, const StorageID & remote_storage_id, - const ASTPtr & remote_table_function) + const ASTPtr & remote_table_function, + const ASTPtr & additional_filter = nullptr) { auto & planner_context = query_info.planner_context; const auto & query_context = planner_context->getQueryContext(); @@ -976,7 +1018,28 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, replacement_table_expression->setAlias(query_info.table_expression->getAlias()); - auto query_tree_to_modify = query_info.query_tree->cloneAndReplace(query_info.table_expression, std::move(replacement_table_expression)); + QueryTreeNodePtr filter; + + if (additional_filter) + { + const auto & context = query_info.planner_context->getQueryContext(); + + filter = buildQueryTree(additional_filter->clone(), query_context); + + QueryAnalysisPass(replacement_table_expression).run(filter, context); + } + + auto query_tree_to_modify = query_info.query_tree->cloneAndReplace(query_info.table_expression, replacement_table_expression); + + // Apply additional filter if provided + if (filter) + { + auto & query = query_tree_to_modify->as(); + query.getWhere() = query.hasWhere() + ? 
mergeConditionNodes({query.getWhere(), filter}, query_context) + : std::move(filter); + } + ReplaseAliasColumnsVisitor replase_alias_columns_visitor; replase_alias_columns_visitor.visit(query_tree_to_modify); @@ -995,6 +1058,7 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, } return buildQueryTreeForShard(query_info.planner_context, query_tree_to_modify, /*allow_global_join_for_right_table*/ false); + } } @@ -1013,30 +1077,90 @@ void StorageDistributed::read( SelectQueryInfo modified_query_info = query_info; + std::vector additional_query_infos; + const auto & settings = local_context->getSettingsRef(); + auto metadata_ptr = getInMemoryMetadataPtr(); + + auto describe_segment_target = [&](const HybridSegment & segment) -> String + { + if (segment.storage_id) + return segment.storage_id->getNameForLogs(); + if (segment.table_function_ast) + return segment.table_function_ast->formatForLogging(); + chassert(false, "Hybrid segment is missing both storage_id and table_function_ast"); + return String{""}; + }; + + auto describe_base_target = [&]() -> String + { + if (remote_table_function_ptr) + return remote_table_function_ptr->formatForLogging(); + if (!remote_database.empty()) + return remote_database + "." + remote_table; + return remote_table; + }; + + String base_target = describe_base_target(); + + const bool log_hybrid_query_rewrites = (!segments.empty() || base_segment_predicate); + + auto log_rewritten_query = [&](const String & target, const ASTPtr & ast) + { + if (!log_hybrid_query_rewrites || !ast) + return; + + LOG_TRACE(log, "rewriteSelectQuery (target: {}) -> {}", target, ast->formatForLogging()); + }; if (settings[Setting::allow_experimental_analyzer]) { - StorageID remote_storage_id = StorageID{remote_database, remote_table}; + StorageID remote_storage_id = StorageID::createEmpty(); + if (!remote_table_function_ptr) + remote_storage_id = StorageID{remote_database, remote_table}; auto query_tree_distributed = buildQueryTreeDistributed(modified_query_info, query_info.initial_storage_snapshot ? query_info.initial_storage_snapshot : storage_snapshot, remote_storage_id, - remote_table_function_ptr); + remote_table_function_ptr, + base_segment_predicate); Block block = *InterpreterSelectQueryAnalyzer::getSampleBlock(query_tree_distributed, local_context, SelectQueryOptions(processed_stage).analyze()); /** For distributed tables we do not need constants in header, since we don't send them to remote servers. * Moreover, constants can break some functions like `hostName` that are constants only for local queries. */ for (auto & column : block) column.column = column.column->convertToFullColumnIfConst(); + header = std::make_shared(std::move(block)); modified_query_info.query = queryNodeToDistributedSelectQuery(query_tree_distributed); modified_query_info.query_tree = std::move(query_tree_distributed); + log_rewritten_query(base_target, modified_query_info.query); + + if (!segments.empty()) + { + for (const auto & segment : segments) + { + // Create a modified query info with the segment predicate + SelectQueryInfo additional_query_info = query_info; + + auto additional_query_tree = buildQueryTreeDistributed(additional_query_info, + query_info.initial_storage_snapshot ? query_info.initial_storage_snapshot : storage_snapshot, + segment.storage_id ? *segment.storage_id : StorageID::createEmpty(), + segment.storage_id ? 
nullptr : segment.table_function_ast, + segment.predicate_ast); + + additional_query_info.query = queryNodeToDistributedSelectQuery(additional_query_tree); + additional_query_info.query_tree = std::move(additional_query_tree); + log_rewritten_query(describe_segment_target(segment), additional_query_info.query); - /// Return directly (with correct header) if no shard to query. - if (modified_query_info.getCluster()->getShardsInfo().empty()) + additional_query_infos.push_back(std::move(additional_query_info)); + } + } + + // For empty shards - avoid early return if we have additional segments + if (modified_query_info.getCluster()->getShardsInfo().empty() && segments.empty()) return; } else @@ -1045,9 +1169,39 @@ void StorageDistributed::read( modified_query_info.query = ClusterProxy::rewriteSelectQuery( local_context, modified_query_info.query, - remote_database, remote_table, remote_table_function_ptr); + remote_database, remote_table, remote_table_function_ptr, + base_segment_predicate); + log_rewritten_query(base_target, modified_query_info.query); - if (modified_query_info.getCluster()->getShardsInfo().empty()) + if (!segments.empty()) + { + for (const auto & segment : segments) + { + SelectQueryInfo additional_query_info = query_info; + + if (segment.storage_id) + { + additional_query_info.query = ClusterProxy::rewriteSelectQuery( + local_context, additional_query_info.query, + segment.storage_id->database_name, segment.storage_id->table_name, + nullptr, + segment.predicate_ast); + } + else + { + additional_query_info.query = ClusterProxy::rewriteSelectQuery( + local_context, additional_query_info.query, + "", "", segment.table_function_ast, + segment.predicate_ast); + } + + log_rewritten_query(describe_segment_target(segment), additional_query_info.query); + additional_query_infos.push_back(std::move(additional_query_info)); + } + } + + // For empty shards - avoid early return if we have additional segments + if (modified_query_info.getCluster()->getShardsInfo().empty() && segments.empty()) { Pipe pipe(std::make_shared(header)); auto read_from_pipe = std::make_unique(std::move(pipe)); @@ -1059,35 +1213,40 @@ void StorageDistributed::read( } const auto & snapshot_data = assert_cast(*storage_snapshot->data); - ClusterProxy::SelectStreamFactory select_stream_factory = - ClusterProxy::SelectStreamFactory( + + if (!modified_query_info.getCluster()->getShardsInfo().empty() || !additional_query_infos.empty()) + { + ClusterProxy::SelectStreamFactory select_stream_factory = + ClusterProxy::SelectStreamFactory( + header, + snapshot_data.objects_by_shard, + storage_snapshot, + processed_stage); + + auto shard_filter_generator = ClusterProxy::getShardFilterGeneratorForCustomKey( + *modified_query_info.getCluster(), local_context, metadata_ptr->columns); + + ClusterProxy::executeQuery( + query_plan, header, - snapshot_data.objects_by_shard, - storage_snapshot, - processed_stage); - - auto shard_filter_generator = ClusterProxy::getShardFilterGeneratorForCustomKey( - *modified_query_info.getCluster(), local_context, getInMemoryMetadataPtr()->columns); - - ClusterProxy::executeQuery( - query_plan, - header, - processed_stage, - remote_storage, - remote_table_function_ptr, - select_stream_factory, - log, - local_context, - modified_query_info, - sharding_key_expr, - sharding_key_column_name, - *distributed_settings, - shard_filter_generator, - is_remote_function); - - /// This is a bug, it is possible only when there is no shards to query, and this is handled earlier. 
- if (!query_plan.isInitialized()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Pipeline is not initialized"); + processed_stage, + remote_storage, + remote_table_function_ptr, + select_stream_factory, + log, + local_context, + modified_query_info, + sharding_key_expr, + sharding_key_column_name, + *distributed_settings, + shard_filter_generator, + is_remote_function, + additional_query_infos); + + /// This is a bug, it is possible only when there is no shards to query, and this is handled earlier. + if (!query_plan.isInitialized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Pipeline is not initialized"); + } } @@ -2071,6 +2230,36 @@ void StorageDistributed::delayInsertOrThrowIfNeeded() const } } +void StorageDistributed::setHybridLayout(std::vector segments_) +{ + segments = std::move(segments_); + log = getLogger("Hybrid (" + getStorageID().table_name + ")"); + + auto virtuals = createVirtuals(); + // or _segment_index? + virtuals.addEphemeral("_table_index", std::make_shared(), "Index of the table function in Hybrid (0 for main table, 1+ for additional segments)"); + setVirtuals(virtuals); +} + +void StorageDistributed::setCachedColumnsToCast(ColumnsDescription columns) +{ + cached_columns_to_cast = std::move(columns); + if (!cached_columns_to_cast.empty() && log) + { + Names columns_with_types; + columns_with_types.reserve(cached_columns_to_cast.getAllPhysical().size()); + for (const auto & col : cached_columns_to_cast.getAllPhysical()) + columns_with_types.emplace_back(col.name + " " + col.type->getName()); + LOG_DEBUG(log, "Hybrid auto-cast will apply to: [{}]", fmt::join(columns_with_types, ", ")); + } +} + +ColumnsDescription StorageDistributed::getColumnsToCast() const +{ + return cached_columns_to_cast; +} + + void registerStorageDistributed(StorageFactory & factory) { factory.registerStorage("Distributed", [](const StorageFactory::Arguments & args) @@ -2175,6 +2364,279 @@ void registerStorageDistributed(StorageFactory & factory) }); } +void registerStorageHybrid(StorageFactory & factory) +{ + factory.registerStorage("Hybrid", [](const StorageFactory::Arguments & args) -> StoragePtr + { + ASTs & engine_args = args.engine_args; + + if (engine_args.size() < 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Storage Hybrid requires at least 2 arguments, got {}", engine_args.size()); + + const ContextPtr & global_context = args.getContext(); + ContextPtr local_context = args.getLocalContext(); + if (!local_context) + local_context = global_context; + + if (args.mode <= LoadingStrictnessLevel::CREATE + && !local_context->getSettingsRef()[Setting::allow_experimental_hybrid_table]) + { + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Experimental Hybrid table engine is not enabled (the setting 'allow_experimental_hybrid_table')"); + } + + // Validate first argument - must be a table function + ASTPtr first_arg = engine_args[0]; + if (const auto * func = first_arg->as()) + { + // Check if it's a valid table function name + if (!TableFunctionFactory::instance().isTableFunctionName(func->name)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "First argument must be a table function, got: {}", func->name); + + // Check if it's one of the supported remote table functions + if (func->name != "remote" && func->name != "remoteSecure" && + func->name != "cluster" && func->name != "clusterAllReplicas") + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "First argument must be one of: remote, remoteSecure, cluster, clusterAllReplicas, got: {}", func->name); + } + else + { + 
throw Exception(ErrorCodes::BAD_ARGUMENTS, + "First argument must be a table function, got: {}", first_arg->getID()); + } + + // Now handle the first table function (which must be a TableFunctionRemote) + auto table_function = TableFunctionFactory::instance().get(first_arg, local_context); + if (!table_function) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid table function in Hybrid engine"); + + // Capture the physical columns reported by the first segment (table function) + ColumnsDescription first_segment_columns = table_function->getActualTableStructure(local_context, true); + + // For schema inference, prefer user-provided columns, otherwise use the physical ones + ColumnsDescription columns_to_use = args.columns; + if (columns_to_use.empty()) + columns_to_use = first_segment_columns; + + NameSet columns_to_cast_names; + auto validate_segment_schema = [&](const ColumnsDescription & segment_columns, const String & segment_name) + { + for (const auto & column : columns_to_use.getAllPhysical()) + { + auto found = segment_columns.tryGetPhysical(column.name); + if (!found) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Hybrid segment {} is missing column '{}' required by Hybrid schema", + segment_name, column.name); + } + + if (!found->type->equals(*column.type)) + columns_to_cast_names.emplace(column.name); + } + }; + + validate_segment_schema(first_segment_columns, engine_args[0]->formatForLogging()); + + // Execute the table function to get the underlying storage + StoragePtr storage = table_function->execute( + first_arg, + local_context, + args.table_id.table_name, + columns_to_use, + false, // use_global_context = false + false); // is_insert_query = false + + // Table function execution wraps the actual storage in a StorageTableFunctionProxy so that it is initialized lazily for queries; + // here we need to get the nested storage + if (auto proxy = std::dynamic_pointer_cast(storage)) + { + storage = proxy->getNested(); + } + + // Cast to StorageDistributed to access its methods + auto distributed_storage = std::dynamic_pointer_cast(storage); + if (!distributed_storage) + { + // Debug: Print the actual type we got + std::string actual_type = storage ? 
storage->getName() : "nullptr"; + throw Exception(ErrorCodes::LOGICAL_ERROR, + "TableFunctionRemote did not return a StorageDistributed or StorageProxy, got: {}", actual_type); + } + + const auto physical_columns = columns_to_use.getAllPhysical(); + + auto validate_predicate = [&](ASTPtr & predicate, size_t argument_index) + { + try + { + auto syntax_result = TreeRewriter(local_context).analyze(predicate, physical_columns); + ExpressionAnalyzer(predicate, syntax_result, local_context).getActions(true); + } + catch (const Exception & e) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{} must be a valid SQL expression: {}", argument_index, e.message()); + } + }; + + ASTPtr second_arg = engine_args[1]; + validate_predicate(second_arg, 1); + distributed_storage->setBaseSegmentPredicate(second_arg); + + // Parse additional table function pairs (if any) + std::vector segment_definitions; + for (size_t i = 2; i < engine_args.size(); i += 2) + { + if (i + 1 >= engine_args.size()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Table function pairs must have both table function and predicate, got odd number of arguments"); + + ASTPtr table_function_ast = engine_args[i]; + ASTPtr predicate_ast = engine_args[i + 1]; + + validate_predicate(predicate_ast, i + 1); + + // Validate table function or table identifier + if (const auto * func = table_function_ast->as()) + { + // It's a table function - validate it + if (!TableFunctionFactory::instance().isTableFunctionName(func->name)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{}: additional table function must be a valid table function, got: {}", i, func->name); + } + + // Normalize arguments (evaluate `currentDatabase()`, expand named collections, etc.). + // TableFunctionFactory::get mutates the AST in-place inside TableFunctionRemote::parseArguments. + ASTPtr normalized_table_function_ast = table_function_ast->clone(); + auto additional_table_function = TableFunctionFactory::instance().get(normalized_table_function_ast, local_context); + ColumnsDescription segment_columns = additional_table_function->getActualTableStructure(local_context, true); + replaceCurrentDatabaseFunction(normalized_table_function_ast, local_context); + + validate_segment_schema(segment_columns, normalized_table_function_ast->formatForLogging()); + + // It's a table function - store the AST and cached schema for later execution + segment_definitions.emplace_back(normalized_table_function_ast, predicate_ast); + } + else if (const auto * ast_identifier = table_function_ast->as()) + { + // It's an identifier - try to convert it to a table identifier + auto table_identifier = ast_identifier->createTable(); + if (!table_identifier) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{}: identifier '{}' cannot be converted to table identifier", i, ast_identifier->name()); + } + + StoragePtr validated_table; + try + { + // Parse table identifier to get StorageID + StorageID storage_id(table_identifier); + + // Fill database for unqualified identifiers using current database (or the target table database). 
+ if (storage_id.database_name.empty()) + { + String default_database = local_context->getCurrentDatabase(); + if (default_database.empty()) + default_database = args.table_id.database_name; + + if (default_database.empty()) + { + throw Exception(ErrorCodes::UNKNOWN_DATABASE, + "Argument #{}: table identifier '{}' does not specify database and no default database is selected", + i, ast_identifier->name()); + } + + storage_id.database_name = default_database; + + // Update AST so the table definition stores a fully qualified name. + auto qualified_identifier = std::make_shared(storage_id.database_name, storage_id.table_name); + qualified_identifier->alias = ast_identifier->alias; + qualified_identifier->prefer_alias_to_column_name = ast_identifier->prefer_alias_to_column_name; + table_function_ast = qualified_identifier; + engine_args[i] = table_function_ast; + } + + // Sanity check: verify the table exists + try + { + auto database = DatabaseCatalog::instance().getDatabase(storage_id.database_name, local_context); + if (!database) + { + throw Exception(ErrorCodes::UNKNOWN_DATABASE, + "Database '{}' does not exist", storage_id.database_name); + } + + auto table = database->tryGetTable(storage_id.table_name, local_context); + if (!table) + { + throw Exception(ErrorCodes::UNKNOWN_TABLE, + "Table '{}.{}' does not exist", storage_id.database_name, storage_id.table_name); + } + validated_table = table; + } + catch (const Exception & e) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{}: table '{}' validation failed: {}", i, ast_identifier->name(), e.message()); + } + + ColumnsDescription segment_columns; + + if (validated_table) + segment_columns = validated_table->getInMemoryMetadataPtr()->getColumns(); + + validate_segment_schema(segment_columns, storage_id.getNameForLogs()); + + segment_definitions.emplace_back(table_function_ast, predicate_ast, storage_id); + } + catch (const Exception & e) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{}: invalid table identifier '{}': {}", i, ast_identifier->name(), e.message()); + } + } + else + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Argument #{}: additional argument must be either a table function or table identifier, got: {}", i, table_function_ast->getID()); + } + + } + + // Fix the database and table names - this is the same pattern used in InterpreterCreateQuery + // The TableFunctionRemote creates a StorageDistributed with "_table_function" database, + // but we need to rename it to the correct database and table names + distributed_storage->renameInMemory({args.table_id.database_name, args.table_id.table_name, args.table_id.uuid}); + + // Store segment definitions for later use + distributed_storage->setHybridLayout(std::move(segment_definitions)); + if (!columns_to_cast_names.empty()) + { + NamesAndTypesList cast_cols; + for (const auto & col : columns_to_use.getAllPhysical()) + { + if (columns_to_cast_names.contains(col.name)) + cast_cols.emplace_back(col.name, col.type); + } + distributed_storage->setCachedColumnsToCast(ColumnsDescription(cast_cols)); + } + + return distributed_storage; + }, + { + .supports_settings = false, + .supports_parallel_insert = true, + .supports_schema_inference = true, + .source_access_type = AccessTypeObjects::Source::REMOTE, + }); +} + bool StorageDistributed::initializeDiskOnConfigChange(const std::set & new_added_disks) { if (!storage_policy || !data_volume) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 75354edb7ffc..9526fc647fbc 
100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -50,6 +50,27 @@ class StorageDistributed final : public IStorage, WithContext friend class StorageSystemDistributionQueue; public: + /// Structure to hold table function AST, predicate, optional StorageID, and cached physical columns for the segment. + /// Cached columns let us detect schema mismatches and enable features like hybrid_table_auto_cast_columns without + /// re-fetching remote headers on every query. + struct HybridSegment + { + ASTPtr table_function_ast; + ASTPtr predicate_ast; + std::optional storage_id; // For table identifiers instead of table functions + + HybridSegment(ASTPtr table_function_ast_, ASTPtr predicate_ast_) + : table_function_ast(std::move(table_function_ast_)) + , predicate_ast(std::move(predicate_ast_)) + {} + + HybridSegment(ASTPtr table_function_ast_, ASTPtr predicate_ast_, StorageID storage_id_) + : table_function_ast(std::move(table_function_ast_)) + , predicate_ast(std::move(predicate_ast_)) + , storage_id(std::move(storage_id_)) + {} + }; + StorageDistributed( const StorageID & id_, const ColumnsDescription & columns_, @@ -70,7 +91,12 @@ class StorageDistributed final : public IStorage, WithContext ~StorageDistributed() override; - std::string getName() const override { return "Distributed"; } + std::string getName() const override + { + return (segments.empty() && !base_segment_predicate) + ? "Distributed" + : "Hybrid"; + } bool supportsSampling() const override { return true; } bool supportsFinal() const override { return true; } @@ -149,6 +175,20 @@ class StorageDistributed final : public IStorage, WithContext size_t getShardCount() const; + /// Set optional predicate applied to the base segment + void setBaseSegmentPredicate(ASTPtr predicate) { base_segment_predicate = std::move(predicate); } + + /// Set segment definitions for Hybrid engine along with cached schema info + void setHybridLayout(std::vector segments_); + void setCachedColumnsToCast(ColumnsDescription columns); + + /// Getter methods for ClusterProxy::executeQuery + StorageID getRemoteStorageID() const { return remote_storage; } + ColumnsDescription getColumnsToCast() const; + ExpressionActionsPtr getShardingKeyExpression() const { return sharding_key_expr; } + const DistributedSettings * getDistributedSettings() const { return distributed_settings.get(); } + bool isRemoteFunction() const { return is_remote_function; } + bool initializeDiskOnConfigChange(const std::set & new_added_disks) override; private: @@ -283,6 +323,21 @@ class StorageDistributed final : public IStorage, WithContext pcg64 rng; bool is_remote_function; + + /// Additional filter expression for Hybrid engine + ASTPtr base_segment_predicate; + + /// Additional segments for Hybrid engine + std::vector segments; + + /// Hybrid builds the list of columns which need to be cast once during CREATE/ATTACH; + /// those are the columns whose type differs from the expected one on at least one segment. + /// It is used by HybridCastsPass and the hybrid_table_auto_cast_columns feature. + /// Without this cache we would have to read the segment headers before every query, + /// which may trigger extra DESCRIBE calls in the case of remote queries. + /// Subsequent segment DDL changes are not auto-detected; + /// reattach/recreate the Hybrid table to refresh. 
+ ColumnsDescription cached_columns_to_cast; }; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index d57ca4f996a6..f0270cd78dba 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1029,8 +1029,9 @@ bool StorageFile::canMoveConditionsToPrewhere() const std::optional StorageFile::supportedPrewhereColumns() const { - /// Currently don't support prewhere for virtual columns and columns with default expressions. - return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(); + /// Currently don't support prewhere for virtual columns, columns with default expressions, + /// and columns taken from file path (hive partitioning). + return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(/*exclude=*/ hive_partition_columns_to_read_from_file_path); } IStorage::ColumnSizeByName StorageFile::getColumnSizes() const @@ -1989,7 +1990,7 @@ class StorageFileSink final : public SinkToStorage, WithContext std::unique_lock lock; }; -class PartitionedStorageFileSink : public PartitionedSink +class PartitionedStorageFileSink : public PartitionedSink::SinkCreator { public: PartitionedStorageFileSink( @@ -2004,7 +2005,7 @@ class PartitionedStorageFileSink : public PartitionedSink const String format_name_, ContextPtr context_, int flags_) - : PartitionedSink(partition_strategy_, context_, std::make_shared(metadata_snapshot_->getSampleBlock())) + : partition_strategy(partition_strategy_) , path(path_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) @@ -2020,11 +2021,12 @@ class PartitionedStorageFileSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string filepath = partition_strategy->getPathForWrite(path, partition_id); + const auto file_path_generator = std::make_shared(path); + std::string filepath = file_path_generator->getPathForWrite(partition_id); fs::create_directories(fs::path(filepath).parent_path()); - validatePartitionKey(filepath, true); + PartitionedSink::validatePartitionKey(filepath, true); checkCreationIsAllowed(context, context->getUserFilesPath(), filepath, /*can_be_directory=*/ true); return std::make_shared( metadata_snapshot, @@ -2041,6 +2043,7 @@ class PartitionedStorageFileSink : public PartitionedSink } private: + std::shared_ptr partition_strategy; const String path; StorageMetadataPtr metadata_snapshot; String table_name_for_log; @@ -2092,7 +2095,7 @@ SinkToStoragePtr StorageFile::write( has_wildcards, /* partition_columns_in_data_file */true); - return std::make_shared( + auto sink_creator = std::make_shared( partition_strategy, metadata_snapshot, getStorageID().getNameForLogs(), @@ -2104,6 +2107,13 @@ SinkToStoragePtr StorageFile::write( format_name, context, flags); + + return std::make_shared( + partition_strategy, + sink_creator, + context, + std::make_shared(metadata_snapshot->getSampleBlock()) + ); } String path; @@ -2129,6 +2139,7 @@ SinkToStoragePtr StorageFile::write( String new_path; do { + new_path = path.substr(0, pos) + "." + std::to_string(index) + (pos == std::string::npos ? 
"" : path.substr(pos)); ++index; } diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 0dde974ce9e8..34d5474869af 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -95,21 +95,52 @@ void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const Sto ); } +class FileTaskIterator : public TaskIterator +{ +public: + FileTaskIterator(const Strings & files, + std::optional archive_info, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context, + bool distributed_processing = false) + : iterator(files + , archive_info + , predicate + , virtual_columns + , hive_partition_columns_to_read_from_file_path + , context + , distributed_processing) {} + + ~FileTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override + { + auto file = iterator.next(); + if (file.empty()) + return std::make_shared(); + return std::make_shared(std::move(file)); + } + +private: + mutable StorageFileSource::FilesIterator iterator; +}; + RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr) const { - auto iterator = std::make_shared(paths, std::nullopt, predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr - { - auto file = iter->next(); - if (file.empty()) - return std::make_shared(); - return std::make_shared(std::move(file)); - }; - auto callback = std::make_shared(std::move(next_callback)); + auto callback = std::make_shared( + paths, + std::nullopt, + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 5e9ec5ae98e4..cd010a280a68 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -788,11 +788,12 @@ std::unordered_map StorageInMemoryMetadata::getFakeColu return sizes; } -NameSet StorageInMemoryMetadata::getColumnsWithoutDefaultExpressions() const +NameSet StorageInMemoryMetadata::getColumnsWithoutDefaultExpressions(const NamesAndTypesList & exclude) const { + auto exclude_map = exclude.getNameToTypeMap(); NameSet names; for (const auto & col : columns) - if (!col.default_desc.expression) + if (!col.default_desc.expression && !exclude_map.contains(col.name)) names.insert(col.name); return names; } diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index 43af71ed7713..8fcb419fd376 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -289,7 +289,7 @@ struct StorageInMemoryMetadata std::unordered_map getFakeColumnSizes() const; /// Elements of `columns` that have `default_desc.expression == nullptr`. 
- NameSet getColumnsWithoutDefaultExpressions() const; + NameSet getColumnsWithoutDefaultExpressions(const NamesAndTypesList & exclude) const; }; using StorageMetadataPtr = std::shared_ptr; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 45331190bc1d..1cc616bd8521 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -120,6 +120,7 @@ namespace ErrorCodes extern const int TABLE_IS_READ_ONLY; extern const int TOO_MANY_PARTS; extern const int PART_IS_LOCKED; + extern const int INCOMPATIBLE_COLUMNS; } namespace ActionLocks @@ -209,7 +210,7 @@ void StorageMergeTree::startup() try { background_operations_assignee.start(); - startBackgroundMovesIfNeeded(); + startBackgroundMoves(); startOutdatedAndUnexpectedDataPartsLoadingTask(); } catch (...) @@ -255,6 +256,11 @@ void StorageMergeTree::shutdown(bool) if (deduplication_log) deduplication_log->shutdown(); + + { + std::lock_guard lock(export_manifests_mutex); + export_manifests.clear(); + } } @@ -2812,12 +2818,6 @@ MutationCounters StorageMergeTree::getMutationCounters() const return mutation_counters; } -void StorageMergeTree::startBackgroundMovesIfNeeded() -{ - if (areBackgroundMovesNeeded()) - background_moves_assignee.start(); -} - std::unique_ptr StorageMergeTree::getDefaultSettings() const { return std::make_unique(getContext()->getMergeTreeSettings()); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 00cbc7acdad8..0bffa6ead7d3 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -286,8 +286,6 @@ class StorageMergeTree final : public MergeTreeData std::unique_ptr fillNewPartName(MutableDataPartPtr & part, DataPartsLock & lock); std::unique_ptr fillNewPartNameAndResetLevel(MutableDataPartPtr & part, DataPartsLock & lock); - void startBackgroundMovesIfNeeded() override; - BackupEntries backupMutations(UInt64 version, const String & data_path_in_backup) const; /// Attaches restored parts to the storage. 
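The `registerStorageHybrid` block above defines the DDL surface of the experimental engine: the first argument must be one of the `remote`, `remoteSecure`, `cluster`, or `clusterAllReplicas` table functions, the second argument is the predicate for that base segment, and any further arguments come in pairs of a table function (or a table identifier) plus its predicate, each validated against the Hybrid table's own schema. A minimal sketch of such a declaration, assuming placeholder host, database, and table names that are not part of this patch:

```sql
-- Hedged sketch of the Hybrid DDL implied by registerStorageHybrid.
-- 'hot-node:9000', default.events_recent and events_archive are illustrative placeholders.
SET allow_experimental_hybrid_table = 1;

CREATE TABLE events_hybrid
(
    id UInt64,
    ts DateTime
)
ENGINE = Hybrid(
    remote('hot-node:9000', default.events_recent),  -- base segment: must be remote/remoteSecure/cluster/clusterAllReplicas
    ts >= now() - INTERVAL 7 DAY,                     -- predicate applied to the base segment
    events_archive,                                   -- additional segment: a plain table identifier is also accepted
    ts < now() - INTERVAL 7 DAY                       -- predicate for the additional segment
);
```

Per the validation code, every segment must expose each column of the Hybrid schema; columns whose types differ from the declared ones are recorded via `setCachedColumnsToCast` so they can be auto-cast later.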
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 3a4374a6c9dc..ec6c43ff775e 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7,6 +7,7 @@ #include #include +#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -67,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +118,13 @@ #include #include +#include "Functions/generateSnowflakeID.h" +#include "Interpreters/StorageID.h" +#include "QueryPipeline/QueryPlanResourceHolder.h" +#include "Storages/ExportReplicatedMergeTreePartitionManifest.h" +#include "Storages/ExportReplicatedMergeTreePartitionTaskEntry.h" +#include +#include #include #include @@ -183,6 +192,14 @@ namespace Setting extern const SettingsInt64 replication_wait_for_inactive_replica_timeout; extern const SettingsUInt64 select_sequential_consistency; extern const SettingsBool update_sequential_consistency; + extern const SettingsBool allow_experimental_export_merge_tree_part; + extern const SettingsBool export_merge_tree_partition_force_export; + extern const SettingsUInt64 export_merge_tree_partition_max_retries; + extern const SettingsUInt64 export_merge_tree_partition_manifest_ttl; + extern const SettingsBool output_format_parallel_formatting; + extern const SettingsBool output_format_parquet_parallel_encoding; + extern const SettingsMaxThreads max_threads; + extern const SettingsMergeTreePartExportFileAlreadyExistsPolicy export_merge_tree_part_file_already_exists_policy; } namespace MergeTreeSetting @@ -285,6 +302,12 @@ namespace ErrorCodes extern const int FAULT_INJECTED; extern const int CANNOT_FORGET_PARTITION; extern const int TIMEOUT_EXCEEDED; + extern const int INVALID_SETTING_VALUE; +} + +namespace ServerSetting +{ + extern const ServerSettingsBool enable_experimental_export_merge_tree_partition_feature; } namespace ActionLocks @@ -414,6 +437,9 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , merge_strategy_picker(*this) , queue(*this, merge_strategy_picker) , fetcher(*this) + , export_merge_tree_partition_task_entries_by_key(export_merge_tree_partition_task_entries.get()) + , export_merge_tree_partition_task_entries_by_transaction_id(export_merge_tree_partition_task_entries.get()) + , export_merge_tree_partition_task_entries_by_create_time(export_merge_tree_partition_task_entries.get()) , cleanup_thread(*this) , async_block_ids_cache(*this) , part_check_thread(*this) @@ -460,6 +486,31 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( /// Will be activated by restarting thread. 
mutations_finalizing_task->deactivate(); + if (getContext()->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + export_merge_tree_partition_manifest_updater = std::make_shared(*this); + + export_merge_tree_partition_task_scheduler = std::make_shared(*this); + + export_merge_tree_partition_updating_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_updating_task)", [this] { exportMergeTreePartitionUpdatingTask(); }); + + export_merge_tree_partition_updating_task->deactivate(); + + export_merge_tree_partition_status_handling_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_status_handling_task)", [this] { exportMergeTreePartitionStatusHandlingTask(); }); + + export_merge_tree_partition_status_handling_task->deactivate(); + + export_merge_tree_partition_watch_callback = std::make_shared(export_merge_tree_partition_updating_task->getWatchCallback()); + + export_merge_tree_partition_select_task = getContext()->getSchedulePool().createTask( + getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::export_merge_tree_partition_select_task)", [this] { selectPartsToExport(); }); + + export_merge_tree_partition_select_task->deactivate(); + } + + bool has_zookeeper = getContext()->hasZooKeeper() || getContext()->hasAuxiliaryZooKeeper(zookeeper_info.zookeeper_name); if (has_zookeeper) { @@ -884,6 +935,7 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodesAttempt() const futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/last_part", String(), zkutil::CreateMode::Persistent)); futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/failed_parts", String(), zkutil::CreateMode::Persistent)); futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/mutations", String(), zkutil::CreateMode::Persistent)); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/exports", String(), zkutil::CreateMode::Persistent)); futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/parallel", String(), zkutil::CreateMode::Persistent)); @@ -1046,6 +1098,8 @@ bool StorageReplicatedMergeTree::createTableIfNotExistsAttempt(const StorageMeta zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/mutations", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/exports", "", + zkutil::CreateMode::Persistent)); /// And create first replica atomically. See also "createReplica" method that is used to create not the first replicas. @@ -4354,6 +4408,159 @@ void StorageReplicatedMergeTree::mutationsFinalizingTask() } } +void StorageReplicatedMergeTree::exportMergeTreePartitionUpdatingTask() +{ + try + { + export_merge_tree_partition_manifest_updater->poll(); + } + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + } + + + export_merge_tree_partition_updating_task->scheduleAfter(30 * 1000); +} + +void StorageReplicatedMergeTree::selectPartsToExport() +{ + try + { + export_merge_tree_partition_task_scheduler->run(); + } + catch (...) 
+ { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + } + + export_merge_tree_partition_select_task->scheduleAfter(1000 * 5); +} + +void StorageReplicatedMergeTree::exportMergeTreePartitionStatusHandlingTask() +{ + try + { + export_merge_tree_partition_manifest_updater->handleStatusChanges(); + } + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + } +} + +std::vector StorageReplicatedMergeTree::getPartitionExportsInfo() const +{ + std::vector infos; + + const auto zk = getZooKeeper(); + const auto exports_path = fs::path(zookeeper_path) / "exports"; + std::vector children; + if (Coordination::Error::ZOK != zk->tryGetChildren(exports_path, children)) + { + LOG_INFO(log, "Failed to get children from exports path, returning empty export info list"); + return infos; + } + + for (const auto & child : children) + { + ReplicatedPartitionExportInfo info; + + const auto export_partition_path = fs::path(exports_path) / child; + std::string metadata_json; + if (!zk->tryGet(export_partition_path / "metadata.json", metadata_json)) + { + LOG_INFO(log, "Skipping {}: missing metadata.json", child); + continue; + } + + std::string status; + if (!zk->tryGet(export_partition_path / "status", status)) + { + LOG_INFO(log, "Skipping {}: missing status", child); + continue; + } + + std::vector processing_parts; + if (Coordination::Error::ZOK != zk->tryGetChildren(export_partition_path / "processing", processing_parts)) + { + LOG_INFO(log, "Skipping {}: missing processing parts", child); + continue; + } + + const auto parts_to_do = processing_parts.size(); + + std::string exception_replica; + std::string last_exception; + std::string exception_part; + std::size_t exception_count = 0; + + const auto exceptions_per_replica_path = export_partition_path / "exceptions_per_replica"; + + Strings exception_replicas; + if (Coordination::Error::ZOK != zk->tryGetChildren(exceptions_per_replica_path, exception_replicas)) + { + LOG_INFO(log, "Skipping {}: missing exceptions_per_replica", export_partition_path); + continue; + } + + for (const auto & replica : exception_replicas) + { + std::string exception_count_string; + if (!zk->tryGet(exceptions_per_replica_path / replica / "count", exception_count_string)) + { + LOG_INFO(log, "Skipping {}: missing count", replica); + continue; + } + + exception_count += std::stoull(exception_count_string.c_str()); + + if (last_exception.empty()) + { + const auto last_exception_path = exceptions_per_replica_path / replica / "last_exception"; + std::string last_exception_string; + if (!zk->tryGet(last_exception_path / "exception", last_exception_string)) + { + LOG_INFO(log, "Skipping {}: missing last_exception/exception", last_exception_path); + continue; + } + + std::string exception_part_zk; + if (!zk->tryGet(last_exception_path / "part", exception_part_zk)) + { + LOG_INFO(log, "Skipping {}: missing exception part", last_exception_path); + continue; + } + + exception_replica = replica; + last_exception = last_exception_string; + exception_part = exception_part_zk; + } + } + + const auto metadata = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + info.destination_database = metadata.destination_database; + info.destination_table = metadata.destination_table; + info.partition_id = metadata.partition_id; + info.transaction_id = metadata.transaction_id; + info.create_time = metadata.create_time; + info.source_replica = metadata.source_replica; + info.parts_count = metadata.number_of_parts; + info.parts_to_do = parts_to_do; + info.parts = 
metadata.parts; + info.status = status; + info.exception_replica = exception_replica; + info.last_exception = last_exception; + info.exception_part = exception_part; + info.exception_count = exception_count; + + infos.emplace_back(std::move(info)); + } + + return infos; +} + StorageReplicatedMergeTree::CreateMergeEntryResult StorageReplicatedMergeTree::createLogEntryToMergeParts( zkutil::ZooKeeperPtr & zookeeper, @@ -5634,7 +5841,7 @@ void StorageReplicatedMergeTree::startupImpl(bool from_attach_thread, const ZooK restarting_thread.start(true); }); - startBackgroundMovesIfNeeded(); + startBackgroundMoves(); part_moves_between_shards_orchestrator.start(); @@ -5733,6 +5940,13 @@ void StorageReplicatedMergeTree::partialShutdown() mutations_updating_task->deactivate(); mutations_finalizing_task->deactivate(); + if (getContext()->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + export_merge_tree_partition_updating_task->deactivate(); + export_merge_tree_partition_select_task->deactivate(); + export_merge_tree_partition_status_handling_task->deactivate(); + } + cleanup_thread.stop(); async_block_ids_cache.stop(); part_check_thread.stop(); @@ -5800,6 +6014,17 @@ void StorageReplicatedMergeTree::shutdown(bool) /// Wait for all of them std::lock_guard lock(data_parts_exchange_ptr->rwlock); } + + { + std::lock_guard lock(export_merge_tree_partition_mutex); + export_merge_tree_partition_task_entries.clear(); + } + + { + std::lock_guard lock(export_manifests_mutex); + export_manifests.clear(); + } + LOG_TRACE(log, "Shutdown finished"); } @@ -7877,6 +8102,180 @@ void StorageReplicatedMergeTree::fetchPartition( LOG_TRACE(log, "Fetch took {} sec. ({} tries)", watch.elapsedSeconds(), try_no); } +void StorageReplicatedMergeTree::exportPartitionToTable(const PartitionCommand & command, ContextPtr query_context) +{ + if (!query_context->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, + "Exporting merge tree partition is experimental. Set the server setting `enable_experimental_export_merge_tree_partition_feature` to enable it"); + } + + const auto dest_database = query_context->resolveDatabase(command.to_database); + const auto dest_table = command.to_table; + const auto dest_storage_id = StorageID(dest_database, dest_table); + auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, dest_table}, query_context); + + if (dest_storage->getStorageID() == this->getStorageID()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Exporting to the same table is not allowed"); + } + + if (!dest_storage->supportsImport()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Destination storage {} does not support MergeTree parts or uses unsupported partitioning", dest_storage->getName()); + + auto query_to_string = [] (const ASTPtr & ast) + { + return ast ? 
ast->formatWithSecretsOneLine() : ""; + }; + + auto src_snapshot = getInMemoryMetadataPtr(); + auto destination_snapshot = dest_storage->getInMemoryMetadataPtr(); + + if (destination_snapshot->getColumns().getAllPhysical().sizeOfDifference(src_snapshot->getColumns().getAllPhysical())) + throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); + + if (query_to_string(src_snapshot->getPartitionKeyAST()) != query_to_string(destination_snapshot->getPartitionKeyAST())) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); + + zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); + + const String partition_id = getPartitionIDFromQuery(command.partition, query_context); + + const auto exports_path = fs::path(zookeeper_path) / "exports"; + + const auto export_key = partition_id + "_" + dest_storage_id.getQualifiedName().getFullName(); + + const auto partition_exports_path = fs::path(exports_path) / export_key; + + /// check if entry already exists + if (zookeeper->exists(partition_exports_path)) + { + LOG_INFO(log, "Export with key {} is already exported or it is being exported. Checking if it has expired so that we can overwrite it", export_key); + + bool has_expired = false; + + if (zookeeper->exists(fs::path(partition_exports_path) / "metadata.json")) + { + std::string metadata_json; + if (zookeeper->tryGet(fs::path(partition_exports_path) / "metadata.json", metadata_json)) + { + const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + + const auto now = time(nullptr); + const auto expiration_time = manifest.create_time + manifest.ttl_seconds; + + LOG_INFO(log, "Export with key {} has expiration time {}, now is {}", export_key, expiration_time, now); + + if (static_cast(expiration_time) < now) + { + has_expired = true; + } + } + } + + if (!has_expired && !query_context->getSettingsRef()[Setting::export_merge_tree_partition_force_export]) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Export with key {} already exported or it is being exported, and it has not expired. Set `export_merge_tree_partition_force_export` to overwrite it.", export_key); + } + + LOG_INFO(log, "Overwriting export with key {}", export_key); + + /// Not putting in ops (same transaction) because we can't construct a "tryRemoveRecursive" request. + /// It is possible that the zk being used does not support RemoveRecursive requests. + /// It is ok for this to be non transactional. Worst case scenario an on-going export is going to be killed and a new task won't be scheduled. 
+ zookeeper->tryRemoveRecursive(partition_exports_path); + } + + Coordination::Requests ops; + + ops.emplace_back(zkutil::makeCreateRequest(partition_exports_path, "", zkutil::CreateMode::Persistent)); + + auto data_parts_lock = lockParts(); + + const auto parts = getDataPartsVectorInPartitionForInternalUsage(MergeTreeDataPartState::Active, partition_id, &data_parts_lock); + + if (parts.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition {} doesn't exist", partition_id); + } + + std::vector part_names; + for (const auto & part : parts) + { + part_names.push_back(part->name); + } + + /// TODO arthur somehow check if the list of parts is updated "enough" + + ExportReplicatedMergeTreePartitionManifest manifest; + + manifest.transaction_id = generateSnowflakeIDString(); + manifest.partition_id = partition_id; + manifest.destination_database = dest_database; + manifest.destination_table = dest_table; + manifest.source_replica = replica_name; + manifest.number_of_parts = part_names.size(); + manifest.parts = part_names; + manifest.create_time = time(nullptr); + manifest.max_retries = query_context->getSettingsRef()[Setting::export_merge_tree_partition_max_retries]; + manifest.ttl_seconds = query_context->getSettingsRef()[Setting::export_merge_tree_partition_manifest_ttl]; + manifest.max_threads = query_context->getSettingsRef()[Setting::max_threads]; + manifest.parallel_formatting = query_context->getSettingsRef()[Setting::output_format_parallel_formatting]; + manifest.parquet_parallel_encoding = query_context->getSettingsRef()[Setting::output_format_parquet_parallel_encoding]; + + manifest.file_already_exists_policy = query_context->getSettingsRef()[Setting::export_merge_tree_part_file_already_exists_policy].value; + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "metadata.json", + manifest.toJsonString(), + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "exceptions_per_replica", + "", + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "processing", + "", + zkutil::CreateMode::Persistent)); + + for (const auto & part : part_names) + { + ExportReplicatedMergeTreePartitionProcessingPartEntry entry; + entry.status = ExportReplicatedMergeTreePartitionProcessingPartEntry::Status::PENDING; + entry.part_name = part; + entry.retry_count = 0; + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "processing" / part, + entry.toJsonString(), + zkutil::CreateMode::Persistent)); + } + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "processed", + "", + zkutil::CreateMode::Persistent)); + + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "locks", + "", + zkutil::CreateMode::Persistent)); + + /// status: IN_PROGRESS, COMPLETED, FAILED + ops.emplace_back(zkutil::makeCreateRequest( + fs::path(partition_exports_path) / "status", + "PENDING", + zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + Coordination::Error code = zookeeper->tryMulti(ops, responses); + + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException::fromPath(code, partition_exports_path); +} + void StorageReplicatedMergeTree::forgetPartition(const ASTPtr & partition, ContextPtr query_context) { @@ -9284,6 +9683,89 @@ CancellationCode StorageReplicatedMergeTree::killPartMoveToShard(const UUID & ta return 
part_moves_between_shards_orchestrator.killPartMoveToShard(task_uuid); } +CancellationCode StorageReplicatedMergeTree::killExportPartition(const String & transaction_id) +{ + auto try_set_status_to_killed = [this](const zkutil::ZooKeeperPtr & zk, const std::string & status_path) + { + Coordination::Stat stat; + std::string status_from_zk_string; + + if (!zk->tryGet(status_path, status_from_zk_string, &stat)) + { + /// found entry locally, but not in zk. It might have been deleted by another replica and we did not have time to update the local entry. + LOG_INFO(log, "Export partition task not found in zk, can not cancel it"); + return CancellationCode::CancelCannotBeSent; + } + + const auto status_from_zk = magic_enum::enum_cast(status_from_zk_string); + + if (!status_from_zk) + { + LOG_INFO(log, "Export partition task status is invalid, can not cancel it"); + return CancellationCode::CancelCannotBeSent; + } + + if (status_from_zk.value() != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + LOG_INFO(log, "Export partition task is {}, can not cancel it", String(magic_enum::enum_name(status_from_zk.value()))); + return CancellationCode::CancelCannotBeSent; + } + + if (zk->trySet(status_path, String(magic_enum::enum_name(ExportReplicatedMergeTreePartitionTaskEntry::Status::KILLED)), stat.version) != Coordination::Error::ZOK) + { + LOG_INFO(log, "Status has been updated while trying to kill the export partition task, can not cancel it"); + return CancellationCode::CancelCannotBeSent; + } + + return CancellationCode::CancelSent; + }; + + std::lock_guard lock(export_merge_tree_partition_mutex); + + const auto zk = getZooKeeper(); + + /// if we have the entry locally, no need to list from zk. we can save some requests. + const auto & entry = export_merge_tree_partition_task_entries_by_transaction_id.find(transaction_id); + if (entry != export_merge_tree_partition_task_entries_by_transaction_id.end()) + { + LOG_INFO(log, "Export partition task found locally, trying to cancel it"); + /// found locally, no need to get children on zk + if (entry->status != ExportReplicatedMergeTreePartitionTaskEntry::Status::PENDING) + { + LOG_INFO(log, "Export partition task is not pending, can not cancel it"); + return CancellationCode::CancelCannotBeSent; + } + + return try_set_status_to_killed(zk, fs::path(zookeeper_path) / "exports" / entry->getCompositeKey() / "status"); + } + else + { + LOG_INFO(log, "Export partition task not found locally, trying to find it on zk"); + /// for some reason, we don't have the entry locally. 
ls on zk to find the entry + const auto exports_path = fs::path(zookeeper_path) / "exports"; + + const auto export_keys = zk->getChildren(exports_path); + String export_key_to_be_cancelled; + + for (const auto & export_key : export_keys) + { + std::string metadata_json; + if (!zk->tryGet(fs::path(exports_path) / export_key / "metadata.json", metadata_json)) + continue; + const auto manifest = ExportReplicatedMergeTreePartitionManifest::fromJsonString(metadata_json); + if (manifest.transaction_id == transaction_id) + { + LOG_INFO(log, "Export partition task found on zk, trying to cancel it"); + return try_set_status_to_killed(zk, fs::path(exports_path) / export_key / "status"); + } + } + } + + LOG_INFO(log, "Export partition task not found, can not cancel it"); + + return CancellationCode::NotFound; +} + void StorageReplicatedMergeTree::getCommitPartOps( Coordination::Requests & ops, const DataPartPtr & part, @@ -9842,13 +10324,6 @@ MutationCounters StorageReplicatedMergeTree::getMutationCounters() const return queue.getMutationCounters(); } -void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() -{ - if (areBackgroundMovesNeeded()) - background_moves_assignee.start(); -} - - std::unique_ptr StorageReplicatedMergeTree::getDefaultSettings() const { return std::make_unique(getContext()->getReplicatedMergeTreeSettings()); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 487c3a3f44c0..712ba0ba4183 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -13,6 +13,10 @@ #include #include #include +#include "Interpreters/CancellationCode.h" +#include "Storages/MergeTree/ExportPartitionManifestUpdatingTask.h" +#include "Storages/MergeTree/ExportPartitionTaskScheduler.h" +#include #include #include #include @@ -97,6 +101,8 @@ namespace DB class ZooKeeperWithFaultInjection; using ZooKeeperWithFaultInjectionPtr = std::shared_ptr; +struct ReplicatedPartitionExportInfo; + class StorageReplicatedMergeTree final : public MergeTreeData { public: @@ -371,6 +377,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData using ShutdownDeadline = std::chrono::time_point; void waitForUniquePartsToBeFetchedByOtherReplicas(ShutdownDeadline shutdown_deadline); + std::vector getPartitionExportsInfo() const; + private: std::atomic_bool are_restoring_replica {false}; @@ -396,6 +404,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData friend class MergeFromLogEntryTask; friend class MutateFromLogEntryTask; friend class ReplicatedMergeMutateTaskBase; + friend class ExportPartitionManifestUpdatingTask; + friend class ExportPartitionTaskScheduler; using MergeStrategyPicker = ReplicatedMergeTreeMergeStrategyPicker; using LogEntry = ReplicatedMergeTreeLogEntry; @@ -510,6 +520,26 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// A task that marks finished mutations as done. 
BackgroundSchedulePoolTaskHolder mutations_finalizing_task; + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_updating_task; + + /// mostly handle kill operations + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_status_handling_task; + std::shared_ptr export_merge_tree_partition_manifest_updater; + + std::shared_ptr export_merge_tree_partition_task_scheduler; + + Coordination::WatchCallbackPtr export_merge_tree_partition_watch_callback; + + std::mutex export_merge_tree_partition_mutex; + + BackgroundSchedulePoolTaskHolder export_merge_tree_partition_select_task; + + ExportPartitionTaskEntriesContainer export_merge_tree_partition_task_entries; + + // Convenience references to indexes + ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_key; + ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_transaction_id; + ExportPartitionTaskEntriesContainer::index::type & export_merge_tree_partition_task_entries_by_create_time; /// A thread that removes old parts, log entries, and blocks. ReplicatedMergeTreeCleanupThread cleanup_thread; @@ -737,6 +767,14 @@ class StorageReplicatedMergeTree final : public MergeTreeData /// Checks if some mutations are done and marks them as done. void mutationsFinalizingTask(); + void selectPartsToExport(); + + /// update in-memory list of partition exports + void exportMergeTreePartitionUpdatingTask(); + + /// handle status changes for export partition tasks + void exportMergeTreePartitionStatusHandlingTask(); + /** Write the selected parts to merge into the log, * Call when merge_selecting_mutex is locked. * Returns false if any part is not in ZK. @@ -916,6 +954,7 @@ class StorageReplicatedMergeTree final : public MergeTreeData void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr query_context) override; void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context) override; CancellationCode killPartMoveToShard(const UUID & task_uuid) override; + CancellationCode killExportPartition(const String & transaction_id) override; void fetchPartition( const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, @@ -923,7 +962,8 @@ class StorageReplicatedMergeTree final : public MergeTreeData bool fetch_part, ContextPtr query_context) override; void forgetPartition(const ASTPtr & partition, ContextPtr query_context) override; - + + void exportPartitionToTable(const PartitionCommand &, ContextPtr) override; /// NOTE: there are no guarantees for concurrent merges. Dropping part can /// be concurrently merged into some covering part and dropPart will do @@ -955,8 +995,6 @@ class StorageReplicatedMergeTree final : public MergeTreeData MutationsSnapshotPtr getMutationsSnapshot(const IMutationsSnapshot::Params & params) const override; - void startBackgroundMovesIfNeeded() override; - /// Attaches restored parts to the storage. 
void attachRestoredParts(MutableDataPartsVector && parts) override; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index c3cd88dd6014..e0d7158ed44b 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -727,7 +727,7 @@ void StorageURLSink::cancelBuffers() write_buf->cancel(); } -class PartitionedStorageURLSink : public PartitionedSink +class PartitionedStorageURLSink : public PartitionedSink::SinkCreator { public: PartitionedStorageURLSink( @@ -741,7 +741,7 @@ class PartitionedStorageURLSink : public PartitionedSink const CompressionMethod compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_) - : PartitionedSink(partition_strategy_, context_, std::make_shared(sample_block_)) + : partition_strategy(partition_strategy_) , uri(uri_) , format(format_) , format_settings(format_settings_) @@ -756,7 +756,8 @@ class PartitionedStorageURLSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string partition_path = partition_strategy->getPathForWrite(uri, partition_id); + const auto file_path_generator = std::make_shared(uri); + std::string partition_path = file_path_generator->getPathForWrite(partition_id); context->getRemoteHostFilter().checkURL(Poco::URI(partition_path)); return std::make_shared( @@ -764,6 +765,7 @@ class PartitionedStorageURLSink : public PartitionedSink } private: + std::shared_ptr partition_strategy; const String uri; const String format; const std::optional format_settings; @@ -1096,7 +1098,7 @@ bool IStorageURLBase::canMoveConditionsToPrewhere() const std::optional IStorageURLBase::supportedPrewhereColumns() const { - return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(); + return getInMemoryMetadataPtr()->getColumnsWithoutDefaultExpressions(/*exclude=*/ hive_partition_columns_to_read_from_file_path); } IStorage::ColumnSizeByName IStorageURLBase::getColumnSizes() const @@ -1445,7 +1447,7 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad has_wildcards, /* partition_columns_in_data_file */true); - return std::make_shared( + auto sink_creator = std::make_shared( partition_strategy, uri, format_name, @@ -1456,6 +1458,8 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad compression_method, headers, http_method); + + return std::make_shared(partition_strategy, sink_creator, context, std::make_shared(metadata_snapshot->getSampleBlock())); } return std::make_shared( diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index e8154c934197..695cd0c21b92 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -128,23 +128,45 @@ void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageS ); } +class UrlTaskIterator : public TaskIterator +{ +public: + UrlTaskIterator(const String & uri, + size_t max_addresses, + const ActionsDAG::Node * predicate, + const NamesAndTypesList & virtual_columns, + const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, + const ContextPtr & context) + : iterator(uri, max_addresses, predicate, virtual_columns, hive_partition_columns_to_read_from_file_path, context) {} + + ~UrlTaskIterator() override = default; + + ClusterFunctionReadTaskResponsePtr operator()(size_t /* number_of_current_replica */) const override + { + auto url = iterator.next(); + if (url.empty()) + return std::make_shared(); + return std::make_shared(std::move(url)); + } + 
+private: + mutable StorageURLSource::DisclosedGlobIterator iterator; +}; + RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ActionsDAG * /* filter */, const ContextPtr & context, ClusterPtr) const { - auto iterator = std::make_shared( - uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); - - auto next_callback = [iter = std::move(iterator)](size_t) mutable -> ClusterFunctionReadTaskResponsePtr - { - auto url = iter->next(); - if (url.empty()) - return std::make_shared(); - return std::make_shared(std::move(url)); - }; - auto callback = std::make_shared(std::move(next_callback)); + auto callback = std::make_shared( + uri, + context->getSettingsRef()[Setting::glob_expansion_max_elements], + predicate, + getVirtualsList(), + hive_partition_columns_to_read_from_file_path, + context + ); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/System/StorageSystemExports.cpp b/src/Storages/System/StorageSystemExports.cpp new file mode 100644 index 000000000000..bd56a40c3a68 --- /dev/null +++ b/src/Storages/System/StorageSystemExports.cpp @@ -0,0 +1,66 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +ColumnsDescription StorageSystemExports::getColumnsDescription() +{ + return ColumnsDescription + { + {"source_database", std::make_shared(), "Name of the source database."}, + {"source_table", std::make_shared(), "Name of the source table."}, + {"destination_database", std::make_shared(), "Name of the destination database."}, + {"destination_table", std::make_shared(), "Name of the destination table."}, + {"create_time", std::make_shared(), "Date and time when the export command was received by the server."}, + {"part_name", std::make_shared(), "Name of the part."}, + {"destination_file_path", std::make_shared(), "File path where the part is being exported."}, + {"elapsed", std::make_shared(), "The time elapsed (in seconds) since the export started."}, + {"rows_read", std::make_shared(), "The number of rows read from the exported part."}, + {"total_rows_to_read", std::make_shared(), "The total number of rows to read from the exported part."}, + {"total_size_bytes_compressed", std::make_shared(), "The total size of the compressed data in the exported part."}, + {"total_size_bytes_uncompressed", std::make_shared(), "The total size of the uncompressed data in the exported part."}, + {"bytes_read_uncompressed", std::make_shared(), "The number of uncompressed bytes read from the exported part."}, + {"memory_usage", std::make_shared(), "Current memory usage in bytes for the export operation."}, + {"peak_memory_usage", std::make_shared(), "Peak memory usage in bytes during the export operation."}, + }; +} + +void StorageSystemExports::fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node *, std::vector) const +{ + const auto access = context->getAccess(); + const bool check_access_for_tables = !access->isGranted(AccessType::SHOW_TABLES); + + for (const auto & export_info : context->getExportsList().get()) + { + if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, export_info.source_database, export_info.source_table)) + continue; + + size_t i = 0; + res_columns[i++]->insert(export_info.source_database); + res_columns[i++]->insert(export_info.source_table); + 
res_columns[i++]->insert(export_info.destination_database); + res_columns[i++]->insert(export_info.destination_table); + res_columns[i++]->insert(export_info.create_time); + res_columns[i++]->insert(export_info.part_name); + res_columns[i++]->insert(export_info.destination_file_path); + res_columns[i++]->insert(export_info.elapsed); + res_columns[i++]->insert(export_info.rows_read); + res_columns[i++]->insert(export_info.total_rows_to_read); + res_columns[i++]->insert(export_info.total_size_bytes_compressed); + res_columns[i++]->insert(export_info.total_size_bytes_uncompressed); + res_columns[i++]->insert(export_info.bytes_read_uncompressed); + res_columns[i++]->insert(export_info.memory_usage); + res_columns[i++]->insert(export_info.peak_memory_usage); + } +} + +} diff --git a/src/Storages/System/StorageSystemExports.h b/src/Storages/System/StorageSystemExports.h new file mode 100644 index 000000000000..e13fbfa26aaa --- /dev/null +++ b/src/Storages/System/StorageSystemExports.h @@ -0,0 +1,25 @@ +#pragma once + +#include + + +namespace DB +{ + +class Context; + + +class StorageSystemExports final : public IStorageSystemOneBlock +{ +public: + std::string getName() const override { return "SystemExports"; } + + static ColumnsDescription getColumnsDescription(); + +protected: + using IStorageSystemOneBlock::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node *, std::vector) const override; +}; + +} diff --git a/src/Storages/System/StorageSystemIcebergHistory.cpp b/src/Storages/System/StorageSystemIcebergHistory.cpp index f2148a753bb5..db1fb68b0e5c 100644 --- a/src/Storages/System/StorageSystemIcebergHistory.cpp +++ b/src/Storages/System/StorageSystemIcebergHistory.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -57,7 +57,7 @@ void StorageSystemIcebergHistory::fillData([[maybe_unused]] MutableColumns & res const auto access = context_copy->getAccess(); - auto add_history_record = [&](const DatabaseTablesIteratorPtr & it, StorageObjectStorage * object_storage) + auto add_history_record = [&](const DatabaseTablesIteratorPtr & it, StorageObjectStorageCluster * object_storage) { if (!access->isGranted(AccessType::SHOW_TABLES, it->databaseName(), it->name())) return; @@ -106,7 +106,7 @@ void StorageSystemIcebergHistory::fillData([[maybe_unused]] MutableColumns & res // Table was dropped while acquiring the lock, skipping table continue; - if (auto * object_storage_table = dynamic_cast(storage.get())) + if (auto * object_storage_table = dynamic_cast(storage.get())) { add_history_record(iterator, object_storage_table); } diff --git a/src/Storages/System/StorageSystemMerges.cpp b/src/Storages/System/StorageSystemMerges.cpp index 0fca5dc84a2b..c8c569ff4696 100644 --- a/src/Storages/System/StorageSystemMerges.cpp +++ b/src/Storages/System/StorageSystemMerges.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp new file mode 100644 index 000000000000..018f0c8ffac7 --- /dev/null +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Columns/ColumnString.h" +#include "Storages/VirtualColumnUtils.h" + + +namespace DB +{ + +ColumnsDescription 
StorageSystemReplicatedPartitionExports::getColumnsDescription() +{ + return ColumnsDescription + { + {"source_database", std::make_shared(), "Name of the source database."}, + {"source_table", std::make_shared(), "Name of the source table."}, + {"destination_database", std::make_shared(), "Name of the destination database."}, + {"destination_table", std::make_shared(), "Name of the destination table."}, + {"create_time", std::make_shared(), "Date and time when the export command was submitted."}, + {"partition_id", std::make_shared(), "ID of the partition."}, + {"transaction_id", std::make_shared(), "ID of the transaction."}, + {"source_replica", std::make_shared(), "Name of the source replica."}, + {"parts", std::make_shared(std::make_shared()), "List of part names to be exported."}, + {"parts_count", std::make_shared(), "Number of parts in the export."}, + {"parts_to_do", std::make_shared(), "Number of parts pending to be exported."}, + {"status", std::make_shared(), "Status of the export."}, + {"exception_replica", std::make_shared(), "Replica that caused the last exception."}, + {"last_exception", std::make_shared(), "Last exception message of any part (not necessarily the last global exception)."}, + {"exception_part", std::make_shared(), "Part that caused the last exception."}, + {"exception_count", std::make_shared(), "Number of global exceptions."}, + }; +} + +void StorageSystemReplicatedPartitionExports::fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node * predicate, std::vector) const +{ + const auto access = context->getAccess(); + const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); + + std::map> replicated_merge_tree_tables; + for (const auto & db : DatabaseCatalog::instance().getDatabases()) + { + /// Check if database can contain MergeTree tables + if (!db.second->canContainMergeTreeTables()) + continue; + + const bool check_access_for_tables = check_access_for_databases && !access->isGranted(AccessType::SHOW_TABLES, db.first); + + for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next()) + { + const auto & table = iterator->table(); + if (!table) + continue; + + StorageReplicatedMergeTree * table_replicated = dynamic_cast(table.get()); + if (!table_replicated) + continue; + + if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, db.first, iterator->name())) + continue; + + replicated_merge_tree_tables[db.first][iterator->name()] = table; + } + } + + MutableColumnPtr col_database_mut = ColumnString::create(); + MutableColumnPtr col_table_mut = ColumnString::create(); + + for (auto & db : replicated_merge_tree_tables) + { + for (auto & table : db.second) + { + col_database_mut->insert(db.first); + col_table_mut->insert(table.first); + } + } + + ColumnPtr col_database = std::move(col_database_mut); + ColumnPtr col_table = std::move(col_table_mut); + + /// Determine what tables are needed by the conditions in the query. 
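+    /// The candidate (database, table) pairs collected above are placed into a block and filtered with the query predicate, so only tables that can match the query's conditions are asked for their export info below.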
+ { + Block filtered_block + { + { col_database, std::make_shared(), "database" }, + { col_table, std::make_shared(), "table" }, + }; + + VirtualColumnUtils::filterBlockWithPredicate(predicate, filtered_block, context); + + if (!filtered_block.rows()) + return; + + col_database = filtered_block.getByName("database").column; + col_table = filtered_block.getByName("table").column; + } + + for (size_t i_storage = 0; i_storage < col_database->size(); ++i_storage) + { + const auto database = (*col_database)[i_storage].safeGet(); + const auto table = (*col_table)[i_storage].safeGet(); + + std::vector partition_exports_info; + { + const IStorage * storage = replicated_merge_tree_tables[database][table].get(); + if (const auto * replicated_merge_tree = dynamic_cast(storage)) + partition_exports_info = replicated_merge_tree->getPartitionExportsInfo(); + } + + for (const ReplicatedPartitionExportInfo & info : partition_exports_info) + { + std::size_t i = 0; + res_columns[i++]->insert(database); + res_columns[i++]->insert(table); + res_columns[i++]->insert(info.destination_database); + res_columns[i++]->insert(info.destination_table); + res_columns[i++]->insert(info.create_time); + res_columns[i++]->insert(info.partition_id); + res_columns[i++]->insert(info.transaction_id); + res_columns[i++]->insert(info.source_replica); + Array parts_array; + parts_array.reserve(info.parts.size()); + for (const auto & part : info.parts) + parts_array.push_back(part); + res_columns[i++]->insert(parts_array); + res_columns[i++]->insert(info.parts_count); + res_columns[i++]->insert(info.parts_to_do); + res_columns[i++]->insert(info.status); + res_columns[i++]->insert(info.exception_replica); + res_columns[i++]->insert(info.last_exception); + res_columns[i++]->insert(info.exception_part); + res_columns[i++]->insert(info.exception_count); + } + } +} + +} diff --git a/src/Storages/System/StorageSystemReplicatedPartitionExports.h b/src/Storages/System/StorageSystemReplicatedPartitionExports.h new file mode 100644 index 000000000000..de2547437c21 --- /dev/null +++ b/src/Storages/System/StorageSystemReplicatedPartitionExports.h @@ -0,0 +1,42 @@ +#pragma once + +#include + +namespace DB +{ + +class Context; + +struct ReplicatedPartitionExportInfo +{ + String destination_database; + String destination_table; + String partition_id; + String transaction_id; + time_t create_time; + String source_replica; + size_t parts_count; + size_t parts_to_do; + std::vector parts; + String status; + std::string exception_replica; + std::string last_exception; + std::string exception_part; + size_t exception_count; +}; + +class StorageSystemReplicatedPartitionExports final : public IStorageSystemOneBlock +{ +public: + + std::string getName() const override { return "SystemReplicatedPartitionExports"; } + + static ColumnsDescription getColumnsDescription(); + +protected: + using IStorageSystemOneBlock::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const ActionsDAG::Node *, std::vector) const override; +}; + +} diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index ff1d570a8024..c3e72231079c 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -604,18 +606,69 @@ class TablesBlockSource : public ISource ASTPtr expression_ptr; if (columns_mask[src_index++]) { - if (metadata_snapshot && (expression_ptr 
= metadata_snapshot->getPartitionKeyAST())) - res_columns[res_index++]->insert(format({context, *expression_ptr})); - else - res_columns[res_index++]->insertDefault(); + bool inserted = false; + try + { + // Extract from specific DataLake metadata if suitable + if (auto * obj = dynamic_cast(table.get())) + { + if (auto * dl_meta = obj->getExternalMetadata(context)) + { + if (auto p = dl_meta->partitionKey(context); p.has_value()) + { + res_columns[res_index++]->insert(*p); + inserted = true; + } + } + } + } + catch (const Exception &) + { + /// Failed to get info from Iceberg. It's not critical, just log it. + tryLogCurrentException("StorageSystemTables"); + } + + if (!inserted) + { + if (metadata_snapshot && (expression_ptr = metadata_snapshot->getPartitionKeyAST())) + res_columns[res_index++]->insert(format({context, *expression_ptr})); + else + res_columns[res_index++]->insertDefault(); + } } if (columns_mask[src_index++]) { - if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSortingKey().expression_list_ast)) - res_columns[res_index++]->insert(format({context, *expression_ptr})); - else - res_columns[res_index++]->insertDefault(); + bool inserted = false; + + try + { + // Extract from specific DataLake metadata if suitable + if (auto * obj = dynamic_cast(table.get())) + { + if (auto * dl_meta = obj->getExternalMetadata(context)) + { + if (auto p = dl_meta->sortingKey(context); p.has_value()) + { + res_columns[res_index++]->insert(*p); + inserted = true; + } + } + } + } + catch (const Exception &) + { + /// Failed to get info from Iceberg. It's not critical, just log it. + tryLogCurrentException("StorageSystemTables"); + } + + if (!inserted) + { + if (metadata_snapshot && (expression_ptr = metadata_snapshot->getSortingKey().expression_list_ast)) + res_columns[res_index++]->insert(format({context, *expression_ptr})); + else + res_columns[res_index++]->insertDefault(); + } } if (columns_mask[src_index++]) diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 9e996d5ca2b9..d6ce54003aea 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -1,10 +1,11 @@ #include +#include #include "config.h" #include #include #include - +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +105,7 @@ #include #include #include +#include #include #include @@ -127,6 +130,11 @@ namespace DB { +namespace ServerSetting +{ + extern const ServerSettingsBool enable_experimental_export_merge_tree_partition_feature; +} + void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, bool has_zookeeper) { attachNoDescription(context, system_database, "one", "This table contains a single row with a single dummy UInt8 column containing the value 0. Used when the table is not specified explicitly, for example in queries like `SELECT 1`."); @@ -208,6 +216,11 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "dimensional_metrics", "Contains dimensional metrics, which have multiple dimensions (labels) to provide more granular information. For example, counting failed merges by their error code. This table is always up to date."); attach(context, system_database, "merges", "Contains a list of merges currently executing merges of MergeTree tables and their progress. 
Each merge operation is represented by a single row."); attach(context, system_database, "moves", "Contains information about in-progress data part moves of MergeTree tables. Each data part movement is represented by a single row."); + attach(context, system_database, "exports", "Contains a list of exports currently executing exports of MergeTree tables and their progress. Each export operation is represented by a single row."); + if (context->getServerSettings()[ServerSetting::enable_experimental_export_merge_tree_partition_feature]) + { + attach(context, system_database, "replicated_partition_exports", "Contains a list of partition exports of ReplicatedMergeTree tables and their progress. Each export operation is represented by a single row."); + } attach(context, system_database, "mutations", "Contains a list of mutations and their progress. Each mutation command is represented by a single row."); attachNoDescription(context, system_database, "replicas", "Contains information and status of all table replicas on current server. Each replica is represented by a single row."); attach(context, system_database, "replication_queue", "Contains information about tasks from replication queues stored in ClickHouse Keeper, or ZooKeeper, for each table replica."); diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 462a56bb1e92..bd0db1f52248 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -312,18 +313,23 @@ void addRequestedFileLikeStorageVirtualsToChunk( else if (virtual_column.name == "_row_number") { #if USE_PARQUET - auto chunk_info = chunk.getChunkInfos().get(); + auto chunk_info = chunk.getChunkInfos().get(); if (chunk_info) { size_t row_num_offset = chunk_info->row_num_offset; + const auto & applied_filter = chunk_info->applied_filter; + size_t num_indices = applied_filter.has_value() ? applied_filter->size() : chunk.getNumRows(); auto column = ColumnInt64::create(); - for (size_t i = 0; i < chunk.getNumRows(); ++i) - column->insertValue(i + row_num_offset); - chunk.addColumn(std::move(column)); + for (size_t i = 0; i < num_indices; ++i) + if (!applied_filter.has_value() || applied_filter.value()[i]) + column->insertValue(i + row_num_offset); + auto null_map = ColumnUInt8::create(chunk.getNumRows(), 0); + chunk.addColumn(ColumnNullable::create(std::move(column), std::move(null_map))); return; } #endif - chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), -1)->convertToFullColumnIfConst()); + /// Row numbers not known, _row_number = NULL. 
+ chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } } } diff --git a/src/Storages/extractTableFunctionFromSelectQuery.cpp b/src/Storages/extractTableFunctionFromSelectQuery.cpp index 57302036c889..064f538eeae7 100644 --- a/src/Storages/extractTableFunctionFromSelectQuery.cpp +++ b/src/Storages/extractTableFunctionFromSelectQuery.cpp @@ -9,7 +9,7 @@ namespace DB { -ASTFunction * extractTableFunctionFromSelectQuery(ASTPtr & query) +ASTTableExpression * extractTableExpressionASTPtrFromSelectQuery(ASTPtr & query) { auto * select_query = query->as(); if (!select_query || !select_query->tables()) @@ -17,10 +17,36 @@ ASTFunction * extractTableFunctionFromSelectQuery(ASTPtr & query) auto * tables = select_query->tables()->as(); auto * table_expression = tables->children[0]->as()->table_expression->as(); - if (!table_expression->table_function) + return table_expression; +} + +ASTPtr extractTableFunctionASTPtrFromSelectQuery(ASTPtr & query) +{ + auto table_expression = extractTableExpressionASTPtrFromSelectQuery(query); + return table_expression ? table_expression->table_function : nullptr; +} + +ASTPtr extractTableASTPtrFromSelectQuery(ASTPtr & query) +{ + auto table_expression = extractTableExpressionASTPtrFromSelectQuery(query); + return table_expression ? table_expression->database_and_table_name : nullptr; +} + +ASTFunction * extractTableFunctionFromSelectQuery(ASTPtr & query) +{ + auto table_function_ast = extractTableFunctionASTPtrFromSelectQuery(query); + if (!table_function_ast) return nullptr; - return table_expression->table_function->as(); + return table_function_ast->as(); +} + +ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query) +{ + auto * table_function = extractTableFunctionFromSelectQuery(query); + if (!table_function) + return nullptr; + return table_function->arguments->as(); } } diff --git a/src/Storages/extractTableFunctionFromSelectQuery.h b/src/Storages/extractTableFunctionFromSelectQuery.h index c69cc7ce6c52..2a845477df82 100644 --- a/src/Storages/extractTableFunctionFromSelectQuery.h +++ b/src/Storages/extractTableFunctionFromSelectQuery.h @@ -1,12 +1,17 @@ #pragma once #include -#include #include +#include namespace DB { +struct ASTTableExpression; +ASTTableExpression * extractTableExpressionASTPtrFromSelectQuery(ASTPtr & query); +ASTPtr extractTableFunctionASTPtrFromSelectQuery(ASTPtr & query); +ASTPtr extractTableASTPtrFromSelectQuery(ASTPtr & query); ASTFunction * extractTableFunctionFromSelectQuery(ASTPtr & query); +ASTExpressionList * extractTableFunctionArgumentsFromSelectQuery(ASTPtr & query); } diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 3b31a9b0d2ae..e84005511bb4 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -234,7 +234,12 @@ ReadFromFormatInfo prepareReadingFromFormat( } /// Create header for InputFormat with columns that will be read from the data. - info.format_header = storage_snapshot->getSampleBlockForColumns(info.columns_description.getNamesOfPhysical()); + for (const auto & column : info.columns_description) + { + /// Never read hive partition columns from the data file. 
This fixes https://github.com/ClickHouse/ClickHouse/issues/87515 + if (!hive_parameters.hive_partition_columns_to_read_from_file_path_map.contains(column.name)) + info.format_header.insert(ColumnWithTypeAndName{column.type, column.name}); + } info.serialization_hints = getSerializationHintsForFileLikeStorage(storage_snapshot->metadata, context); diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 3d050fb9633b..74cb706cccec 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -13,6 +13,7 @@ void registerStorageNull(StorageFactory & factory); void registerStorageMerge(StorageFactory & factory); void registerStorageBuffer(StorageFactory & factory); void registerStorageDistributed(StorageFactory & factory); +void registerStorageHybrid(StorageFactory & factory); void registerStorageMemory(StorageFactory & factory); void registerStorageFile(StorageFactory & factory); void registerStorageURL(StorageFactory & factory); @@ -122,6 +123,7 @@ void registerStorages() registerStorageMerge(factory); registerStorageBuffer(factory); registerStorageDistributed(factory); + registerStorageHybrid(factory); registerStorageMemory(factory); registerStorageFile(factory); registerStorageURL(factory); diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index c4e7534c1e49..f60ffd005c8b 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -78,7 +78,7 @@ class ITableFunction : public std::enable_shared_from_this virtual bool supportsReadingSubsetOfColumns(const ContextPtr &) { return true; } - virtual bool canBeUsedToCreateTable() const { return true; } + virtual void validateUseToCreateTable() const {} // INSERT INTO TABLE FUNCTION ... PARTITION BY // Set partition by expression so `ITableFunctionObjectStorage` can construct a proper representation diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index 5345e1a0f0db..975322d054b3 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int CLUSTER_DOESNT_EXIST; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } /// Base class for *Cluster table functions that require cluster_name for the first argument. 
@@ -46,9 +47,13 @@ class ITableFunctionCluster : public Base throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected table function name: {}", table_function->name); } - bool canBeUsedToCreateTable() const override { return false; } bool isClusterFunction() const override { return true; } + void validateUseToCreateTable() const override + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table function '{}' cannot be used to create a table", getName()); + } + protected: void parseArguments(const ASTPtr & ast, ContextPtr context) override { diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 959ac829a8e1..145581a35989 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -125,7 +125,7 @@ template ColumnsDescription TableFunctionObjectStorage< Definition, Configuration, is_data_lake>::getActualTableStructure(ContextPtr context, bool is_insert_query) const { - if (configuration->structure == "auto") + if (configuration->getStructure() == "auto") { if (const auto access_object = getSourceAccessObject()) context->checkAccess(AccessType::READ, toStringSource(*access_object)); @@ -141,7 +141,6 @@ ColumnsDescription TableFunctionObjectStorage< ColumnsDescription columns; resolveSchemaAndFormat( columns, - configuration->format, std::move(storage), configuration, /* format_settings */std::nullopt, @@ -158,7 +157,7 @@ ColumnsDescription TableFunctionObjectStorage< return columns; } - return parseColumnsListFromString(configuration->structure, context); + return parseColumnsListFromString(configuration->getStructure(), context); } template @@ -172,8 +171,8 @@ StoragePtr TableFunctionObjectStorage:: chassert(configuration); ColumnsDescription columns; - if (configuration->structure != "auto") - columns = parseColumnsListFromString(configuration->structure, context); + if (configuration->getStructure() != "auto") + columns = parseColumnsListFromString(configuration->getStructure(), context); else if (!structure_hint.empty()) columns = structure_hint; else if (!cached_columns.empty()) @@ -201,7 +200,15 @@ StoragePtr TableFunctionObjectStorage:: columns, ConstraintsDescription{}, partition_by, - context); + context, + /* comment */ String{}, + /* format_settings */ std::nullopt, /// No format_settings + /* mode */ LoadingStrictnessLevel::CREATE, + configuration->getCatalog(context, /* attach */ false), + /* if_not_exists */ false, + /* is_datalake_query */ false, + /* is_table_function */ true, + /* lazy_init */ false); storage->startup(); return storage; @@ -236,18 +243,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) { UNUSED(factory); #if USE_AWS_S3 - factory.registerFunction>( - { - .documentation = - { - .description=R"(The table function can be used to read the data stored on AWS S3.)", - .examples{{S3Definition::name, "SELECT * FROM s3(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction - }, - .allow_readonly = false - }); - - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -258,7 +254,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .allow_readonly = false }); - factory.registerFunction>( + factory.registerFunction>( { .documentation = { @@ -269,7 +265,7 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .allow_readonly = false }); - factory.registerFunction>( + factory.registerFunction>( { 
.documentation = { @@ -280,58 +276,29 @@ void registerTableFunctionObjectStorage(TableFunctionFactory & factory) .allow_readonly = false }); #endif - -#if USE_AZURE_BLOB_STORAGE - factory.registerFunction>( - { - .documentation = - { - .description=R"(The table function can be used to read the data stored on Azure Blob Storage.)", - .examples{ - { - AzureDefinition::name, - "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, " - "[account_name, account_key, format, compression, structure])", "" - }}, - .category = FunctionDocumentation::Category::TableFunction - }, - .allow_readonly = false - }); -#endif -#if USE_HDFS - factory.registerFunction>( - { - .documentation = - { - .description=R"(The table function can be used to read the data stored on HDFS virtual filesystem.)", - .examples{ - { - HDFSDefinition::name, - "SELECT * FROM hdfs(url, format, compression, structure])", "" - }}, - .category = FunctionDocumentation::Category::TableFunction - }, - .allow_readonly = false - }); -#endif } #if USE_AZURE_BLOB_STORAGE -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_AWS_S3 -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_HDFS -template class TableFunctionObjectStorage; -template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +#endif + +#if USE_AVRO +template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; #endif #if USE_AVRO && USE_AWS_S3 @@ -348,6 +315,11 @@ template class TableFunctionObjectStorage; +template class TableFunctionObjectStorage; +#endif + +#if USE_PARQUET && USE_AWS_S3 && USE_DELTA_KERNEL_RS +template class TableFunctionObjectStorage; #endif #if USE_AWS_S3 @@ -357,37 +329,6 @@ template class TableFunctionObjectStorage( - {.documentation - = {.description = R"(The table function can be used to read the Iceberg table stored on S3 object store. 
Alias to icebergS3)", - .examples{{IcebergDefinition::name, "SELECT * FROM iceberg(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the Iceberg table stored on S3 object store.)", - .examples{{IcebergS3Definition::name, "SELECT * FROM icebergS3(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); - -#endif -#if USE_AZURE_BLOB_STORAGE - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the Iceberg table stored on Azure object store.)", - .examples{{IcebergAzureDefinition::name, "SELECT * FROM icebergAzure(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -#endif -#if USE_HDFS - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the Iceberg table stored on HDFS virtual filesystem.)", - .examples{{IcebergHDFSDefinition::name, "SELECT * FROM icebergHDFS(url)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -#endif factory.registerFunction( {.documentation = {.description = R"(The table function can be used to read the Iceberg table stored locally.)", @@ -396,70 +337,4 @@ void registerTableFunctionIceberg(TableFunctionFactory & factory) .allow_readonly = false}); } #endif - - -#if USE_PARQUET && USE_DELTA_KERNEL_RS -void registerTableFunctionDeltaLake(TableFunctionFactory & factory) -{ -#if USE_AWS_S3 - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the DeltaLake table stored on S3, alias of deltaLakeS3.)", - .examples{{DeltaLakeDefinition::name, "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); - - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the DeltaLake table stored on S3.)", - .examples{{DeltaLakeS3Definition::name, "SELECT * FROM deltaLakeS3(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -#endif - -#if USE_AZURE_BLOB_STORAGE - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the DeltaLake table stored on Azure object store.)", - .examples{{DeltaLakeAzureDefinition::name, "SELECT * FROM deltaLakeAzure(connection_string|storage_account_url, container_name, blobpath, \"\n" - " \"[account_name, account_key, format, compression, structure])", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -#endif - // Register the new local Delta Lake table function - factory.registerFunction( - {.documentation - = {.description = R"(The table function can be used to read the DeltaLake table stored locally.)", - .examples{{DeltaLakeLocalDefinition::name, "SELECT * FROM deltaLakeLocal(path)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -} -#endif - -#if USE_AWS_S3 -void registerTableFunctionHudi(TableFunctionFactory & factory) -{ - factory.registerFunction( - {.documentation - = {.description = R"(The table 
function can be used to read the Hudi table stored on object store.)", - .examples{{HudiDefinition::name, "SELECT * FROM hudi(url, access_key_id, secret_access_key)", ""}}, - .category = FunctionDocumentation::Category::TableFunction}, - .allow_readonly = false}); -} -#endif - -void registerDataLakeTableFunctions(TableFunctionFactory & factory) -{ - UNUSED(factory); -#if USE_AVRO - registerTableFunctionIceberg(factory); -#endif - -#if USE_PARQUET && USE_DELTA_KERNEL_RS - registerTableFunctionDeltaLake(factory); -#endif -#if USE_AWS_S3 - registerTableFunctionHudi(factory); -#endif -} } diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index 0b56a747ffce..05b8b293d5d4 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -37,15 +37,16 @@ class TableFunctionObjectStorage : public ITableFunction String getName() const override { return name; } - bool hasStaticStructure() const override { return configuration->structure != "auto"; } + bool hasStaticStructure() const override { return configuration->getStructure() != "auto"; } - bool needStructureHint() const override { return configuration->structure == "auto"; } + bool needStructureHint() const override { return configuration->getStructure() == "auto"; } void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } bool supportsReadingSubsetOfColumns(const ContextPtr & context) override { - return configuration->format != "auto" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->format, context); + return configuration->getFormat() != "auto" + && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration->getFormat(), context); } std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override @@ -55,7 +56,7 @@ class TableFunctionObjectStorage : public ITableFunction virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context) { - StorageObjectStorageConfiguration::initialize(*getConfiguration(), args, context, true); + getConfiguration()->initialize(args, context, true); } static void updateStructureAndFormatArgumentsIfNeeded( @@ -67,8 +68,8 @@ class TableFunctionObjectStorage : public ITableFunction if constexpr (is_data_lake) { Configuration configuration(createEmptySettings()); - if (configuration.format == "auto") - configuration.format = "Parquet"; /// Default format of data lakes. + if (configuration.getFormat() == "auto") + configuration.setFormat("Parquet"); /// Default format of data lakes. 
configuration.addStructureAndFormatToArgsIfNeeded(args, structure, format, context, /*with_structure=*/true); } @@ -110,21 +111,22 @@ class TableFunctionObjectStorage : public ITableFunction }; #if USE_AWS_S3 -using TableFunctionS3 = TableFunctionObjectStorage; +using TableFunctionS3 = TableFunctionObjectStorage; #endif #if USE_AZURE_BLOB_STORAGE -using TableFunctionAzureBlob = TableFunctionObjectStorage; +using TableFunctionAzureBlob = TableFunctionObjectStorage; #endif #if USE_HDFS -using TableFunctionHDFS = TableFunctionObjectStorage; +using TableFunctionHDFS = TableFunctionObjectStorage; #endif #if USE_AVRO +using TableFunctionIceberg = TableFunctionObjectStorage; + # if USE_AWS_S3 -using TableFunctionIceberg = TableFunctionObjectStorage; using TableFunctionIcebergS3 = TableFunctionObjectStorage; # endif # if USE_AZURE_BLOB_STORAGE @@ -136,13 +138,13 @@ using TableFunctionIcebergHDFS = TableFunctionObjectStorage; #endif #if USE_PARQUET && USE_DELTA_KERNEL_RS -#if USE_AWS_S3 +# if USE_AWS_S3 using TableFunctionDeltaLake = TableFunctionObjectStorage; using TableFunctionDeltaLakeS3 = TableFunctionObjectStorage; -#endif -#if USE_AZURE_BLOB_STORAGE +# endif +# if USE_AZURE_BLOB_STORAGE using TableFunctionDeltaLakeAzure = TableFunctionObjectStorage; -#endif +# endif // New alias for local Delta Lake table function using TableFunctionDeltaLakeLocal = TableFunctionObjectStorage; #endif diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index c636ad4a934c..b6567ce3d760 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -22,8 +22,9 @@ StoragePtr TableFunctionObjectStorageClusterstructure != "auto") - columns = parseColumnsListFromString(configuration->structure, context); + + if (configuration->getStructure() != "auto") + columns = parseColumnsListFromString(configuration->getStructure(), context); else if (!Base::structure_hint.empty()) columns = Base::structure_hint; else if (!cached_columns.empty()) @@ -69,7 +70,15 @@ StoragePtr TableFunctionObjectStorageClusterstartup(); @@ -130,48 +139,90 @@ void registerTableFunctionIcebergCluster(TableFunctionFactory & factory) { UNUSED(factory); -#if USE_AWS_S3 + factory.registerFunction( + {.documentation + = {.description = R"(The table function can be used to read the Iceberg table stored on any object store in parallel for many nodes in a specified cluster.)", + .examples{ +# if USE_AWS_S3 + {"icebergCluster", "SELECT * FROM icebergCluster(cluster, url, [, NOSIGN | access_key_id, secret_access_key, [session_token]], format, [,compression], storage_type='s3')", ""}, +# endif +# if USE_AZURE_BLOB_STORAGE + {"icebergCluster", "SELECT * FROM icebergCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression], storage_type='azure')", ""}, +# endif +# if USE_HDFS + {"icebergCluster", "SELECT * FROM icebergCluster(cluster, uri, [format], [structure], [compression_method], storage_type='hdfs')", ""}, +# endif + }, + .category = FunctionDocumentation::Category::TableFunction}, + .allow_readonly = false}); + + factory.registerFunction( + {.documentation + = {.description = R"(The table function can be used to read the Iceberg table stored on shared storage in parallel for many nodes in a specified cluster.)", + .examples{{IcebergLocalClusterDefinition::name, "SELECT * FROM icebergLocalCluster(cluster, filename, format, 
[,compression])", ""}}, + .category = FunctionDocumentation::Category::TableFunction}, + .allow_readonly = false}); + +# if USE_AWS_S3 factory.registerFunction( {.documentation = {.description = R"(The table function can be used to read the Iceberg table stored on S3 object store in parallel for many nodes in a specified cluster.)", .examples{{IcebergS3ClusterDefinition::name, "SELECT * FROM icebergS3Cluster(cluster, url, [, NOSIGN | access_key_id, secret_access_key, [session_token]], format, [,compression])", ""}}, .category = FunctionDocumentation::Category::TableFunction}, .allow_readonly = false}); -#endif +# endif -#if USE_AZURE_BLOB_STORAGE +# if USE_AZURE_BLOB_STORAGE factory.registerFunction( {.documentation = {.description = R"(The table function can be used to read the Iceberg table stored on Azure object store in parallel for many nodes in a specified cluster.)", .examples{{IcebergAzureClusterDefinition::name, "SELECT * FROM icebergAzureCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression])", ""}}, .category = FunctionDocumentation::Category::TableFunction}, .allow_readonly = false}); -#endif +# endif -#if USE_HDFS +# if USE_HDFS factory.registerFunction( {.documentation = {.description = R"(The table function can be used to read the Iceberg table stored on HDFS virtual filesystem in parallel for many nodes in a specified cluster.)", .examples{{IcebergHDFSClusterDefinition::name, "SELECT * FROM icebergHDFSCluster(cluster, uri, [format], [structure], [compression_method])", ""}}, .category = FunctionDocumentation::Category::TableFunction}, .allow_readonly = false}); -#endif +# endif } #endif -#if USE_AWS_S3 #if USE_PARQUET && USE_DELTA_KERNEL_RS void registerTableFunctionDeltaLakeCluster(TableFunctionFactory & factory) { +# if USE_AWS_S3 factory.registerFunction( {.documentation = {.description = R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster.)", .examples{{DeltaLakeClusterDefinition::name, "SELECT * FROM deltaLakeCluster(cluster, url, access_key_id, secret_access_key)", ""}}, .category = FunctionDocumentation::Category::TableFunction}, .allow_readonly = false}); + + factory.registerFunction( + {.documentation + = {.description = R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster.)", + .examples{{DeltaLakeS3ClusterDefinition::name, "SELECT * FROM deltaLakeS3Cluster(cluster, url, access_key_id, secret_access_key)", ""}}, + .category = FunctionDocumentation::Category::TableFunction}, + .allow_readonly = false}); +# endif + +# if USE_AZURE_BLOB_STORAGE + factory.registerFunction( + {.documentation + = {.description = R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster.)", + .examples{{DeltaLakeAzureClusterDefinition::name, "SELECT * FROM deltaLakeAzureCluster(cluster, url, access_key_id, secret_access_key)", ""}}, + .category = FunctionDocumentation::Category::TableFunction}, + .allow_readonly = false}); +# endif } #endif +#if USE_AWS_S3 void registerTableFunctionHudiCluster(TableFunctionFactory & factory) { factory.registerFunction( diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.h b/src/TableFunctions/TableFunctionObjectStorageCluster.h index 7cafeb05b230..06044e480590 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.h 
+++ b/src/TableFunctions/TableFunctionObjectStorageCluster.h @@ -12,8 +12,6 @@ namespace DB class Context; -class StorageS3Settings; -class StorageAzureBlobSettings; class StorageS3Configuration; class StorageAzureConfiguration; @@ -45,21 +43,26 @@ class TableFunctionObjectStorageCluster : public ITableFunctionClusterstructure != "auto"; } - bool needStructureHint() const override { return Base::getConfiguration()->structure == "auto"; } + bool hasStaticStructure() const override { return Base::getConfiguration()->getStructure() != "auto"; } + bool needStructureHint() const override { return Base::getConfiguration()->getStructure() == "auto"; } void setStructureHint(const ColumnsDescription & structure_hint_) override { Base::structure_hint = structure_hint_; } }; #if USE_AWS_S3 -using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; +using TableFunctionS3Cluster = TableFunctionObjectStorageCluster; #endif #if USE_AZURE_BLOB_STORAGE -using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; +using TableFunctionAzureBlobCluster = TableFunctionObjectStorageCluster; #endif #if USE_HDFS -using TableFunctionHDFSCluster = TableFunctionObjectStorageCluster; +using TableFunctionHDFSCluster = TableFunctionObjectStorageCluster; +#endif + +#if USE_AVRO +using TableFunctionIcebergCluster = TableFunctionObjectStorageCluster; +using TableFunctionIcebergLocalCluster = TableFunctionObjectStorageCluster; #endif #if USE_AVRO && USE_AWS_S3 @@ -76,6 +79,11 @@ using TableFunctionIcebergHDFSCluster = TableFunctionObjectStorageCluster; +using TableFunctionDeltaLakeS3Cluster = TableFunctionObjectStorageCluster; +#endif + +#if USE_AZURE_BLOB_STORAGE && USE_PARQUET && USE_DELTA_KERNEL_RS +using TableFunctionDeltaLakeAzureCluster = TableFunctionObjectStorageCluster; #endif #if USE_AWS_S3 diff --git a/src/TableFunctions/TableFunctionObjectStorageClusterFallback.cpp b/src/TableFunctions/TableFunctionObjectStorageClusterFallback.cpp new file mode 100644 index 000000000000..0ec8b178f6a3 --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorageClusterFallback.cpp @@ -0,0 +1,478 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace Setting +{ + extern const SettingsString object_storage_cluster; +} + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +struct S3ClusterFallbackDefinition +{ + static constexpr auto name = "s3"; + static constexpr auto storage_engine_name = "S3"; + static constexpr auto storage_engine_cluster_name = "S3Cluster"; +}; + +struct AzureClusterFallbackDefinition +{ + static constexpr auto name = "azureBlobStorage"; + static constexpr auto storage_engine_name = "Azure"; + static constexpr auto storage_engine_cluster_name = "AzureBlobStorageCluster"; +}; + +struct HDFSClusterFallbackDefinition +{ + static constexpr auto name = "hdfs"; + static constexpr auto storage_engine_name = "HDFS"; + static constexpr auto storage_engine_cluster_name = "HDFSCluster"; +}; + +struct IcebergClusterFallbackDefinition +{ + static constexpr auto name = "iceberg"; + static constexpr auto storage_engine_name = "UNDEFINED"; + static constexpr auto storage_engine_cluster_name = "IcebergCluster"; +}; + +struct IcebergS3ClusterFallbackDefinition +{ + static constexpr auto name = "icebergS3"; + static constexpr auto storage_engine_name = "S3"; + static constexpr auto storage_engine_cluster_name = "IcebergS3Cluster"; +}; + +struct IcebergAzureClusterFallbackDefinition +{ + static constexpr auto 
name = "icebergAzure"; + static constexpr auto storage_engine_name = "Azure"; + static constexpr auto storage_engine_cluster_name = "IcebergAzureCluster"; +}; + +struct IcebergHDFSClusterFallbackDefinition +{ + static constexpr auto name = "icebergHDFS"; + static constexpr auto storage_engine_name = "HDFS"; + static constexpr auto storage_engine_cluster_name = "IcebergHDFSCluster"; +}; + +struct IcebergLocalClusterFallbackDefinition +{ + static constexpr auto name = "icebergLocal"; + static constexpr auto storage_engine_name = "Local"; + static constexpr auto storage_engine_cluster_name = "IcebergLocalCluster"; +}; + +struct DeltaLakeClusterFallbackDefinition +{ + static constexpr auto name = "deltaLake"; + static constexpr auto storage_engine_name = "S3"; + static constexpr auto storage_engine_cluster_name = "DeltaLakeS3Cluster"; +}; + +struct DeltaLakeS3ClusterFallbackDefinition +{ + static constexpr auto name = "deltaLakeS3"; + static constexpr auto storage_engine_name = "S3"; + static constexpr auto storage_engine_cluster_name = "DeltaLakeS3Cluster"; +}; + +struct DeltaLakeAzureClusterFallbackDefinition +{ + static constexpr auto name = "deltaLakeAzure"; + static constexpr auto storage_engine_name = "Azure"; + static constexpr auto storage_engine_cluster_name = "DeltaLakeAzureCluster"; +}; + +struct HudiClusterFallbackDefinition +{ + static constexpr auto name = "hudi"; + static constexpr auto storage_engine_name = "S3"; + static constexpr auto storage_engine_cluster_name = "HudiS3Cluster"; +}; + +template +void TableFunctionObjectStorageClusterFallback::parseArgumentsImpl(ASTs & args, const ContextPtr & context) +{ + if (args.empty()) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "The function {} should have arguments. The first argument must be the cluster name and the rest are the arguments of " + "corresponding table function", + getName()); + + const auto & settings = context->getSettingsRef(); + + is_cluster_function = !settings[Setting::object_storage_cluster].value.empty(); + + if (is_cluster_function) + { + ASTPtr cluster_name_arg = std::make_shared(settings[Setting::object_storage_cluster].value); + args.insert(args.begin(), cluster_name_arg); + BaseCluster::parseArgumentsImpl(args, context); + args.erase(args.begin()); + } + else + BaseSimple::parseArgumentsImpl(args, context); // NOLINT(bugprone-parent-virtual-call) +} + +template +StoragePtr TableFunctionObjectStorageClusterFallback::executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const +{ + if (is_cluster_function) + { + auto result = BaseCluster::executeImpl(ast_function, context, table_name, cached_columns, is_insert_query); + if (auto storage = typeid_cast>(result)) + storage->setClusterNameInSettings(true); + return result; + } + else + return BaseSimple::executeImpl(ast_function, context, table_name, cached_columns, is_insert_query); // NOLINT(bugprone-parent-virtual-call) +} + +template +void TableFunctionObjectStorageClusterFallback::validateUseToCreateTable() const +{ + if (is_cluster_function) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Table function '{}' cannot be used to create a table in cluster mode", + getName()); +} + +#if USE_AWS_S3 +using TableFunctionS3ClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AZURE_BLOB_STORAGE +using TableFunctionAzureClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_HDFS 
+using TableFunctionHDFSClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AVRO +using TableFunctionIcebergClusterFallback = TableFunctionObjectStorageClusterFallback; +using TableFunctionIcebergLocalClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AVRO && USE_AWS_S3 +using TableFunctionIcebergS3ClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AVRO && USE_AZURE_BLOB_STORAGE +using TableFunctionIcebergAzureClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AVRO && USE_HDFS +using TableFunctionIcebergHDFSClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AWS_S3 && USE_PARQUET && USE_DELTA_KERNEL_RS +using TableFunctionDeltaLakeClusterFallback = TableFunctionObjectStorageClusterFallback; +using TableFunctionDeltaLakeS3ClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AZURE_BLOB_STORAGE && USE_PARQUET && USE_DELTA_KERNEL_RS +using TableFunctionDeltaLakeAzureClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +#if USE_AWS_S3 +using TableFunctionHudiClusterFallback = TableFunctionObjectStorageClusterFallback; +#endif + +void registerTableFunctionObjectStorageClusterFallback(TableFunctionFactory & factory) +{ + UNUSED(factory); +#if USE_AWS_S3 + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on S3 in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + {"s3", "SELECT * FROM s3(url, format, structure)", ""}, + {"s3", "SELECT * FROM s3(url, format, structure) SETTINGS object_storage_cluster='cluster'", ""} + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_AZURE_BLOB_STORAGE + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on Azure Blob Storage in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "azureBlobStorage", + "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, " + "[account_name, account_key, format, compression, structure])", "" + }, + { + "azureBlobStorage", + "SELECT * FROM azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, " + "[account_name, account_key, format, compression, structure]) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_HDFS + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the data stored on HDFS virtual filesystem in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "hdfs", + "SELECT * FROM hdfs(url, format, compression, structure])", "" + }, + { + "hdfs", + "SELECT * FROM hdfs(url, format, compression, structure]) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_AVRO + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Iceberg table stored on different object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "iceberg", + "SELECT * FROM 
iceberg(url, access_key_id, secret_access_key, storage_type='s3')", "" + }, + { + "iceberg", + "SELECT * FROM iceberg(url, access_key_id, secret_access_key, storage_type='s3') " + "SETTINGS object_storage_cluster='cluster'", "" + }, + { + "iceberg", + "SELECT * FROM iceberg(url, access_key_id, secret_access_key, storage_type='azure')", "" + }, + { + "iceberg", + "SELECT * FROM iceberg(url, storage_type='hdfs') SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); + + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Iceberg table stored on shared disk in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "icebergLocal", + "SELECT * FROM icebergLocal(filename)", "" + }, + { + "icebergLocal", + "SELECT * FROM icebergLocal(filename) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_AVRO && USE_AWS_S3 + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Iceberg table stored on S3 object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "icebergS3", + "SELECT * FROM icebergS3(url, access_key_id, secret_access_key)", "" + }, + { + "icebergS3", + "SELECT * FROM icebergS3(url, access_key_id, secret_access_key) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_AVRO && USE_AZURE_BLOB_STORAGE + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Iceberg table stored on Azure object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "icebergAzure", + "SELECT * FROM icebergAzure(url, access_key_id, secret_access_key)", "" + }, + { + "icebergAzure", + "SELECT * FROM icebergAzure(url, access_key_id, secret_access_key) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_AVRO && USE_HDFS + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Iceberg table stored on HDFS virtual filesystem in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "icebergHDFS", + "SELECT * FROM icebergHDFS(url)", "" + }, + { + "icebergHDFS", + "SELECT * FROM icebergHDFS(url) SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif + +#if USE_PARQUET && USE_DELTA_KERNEL_RS +# if USE_AWS_S3 + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "deltaLake", + "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)", "" + }, + { + "deltaLake", + "SELECT * FROM deltaLake(url, access_key_id, secret_access_key) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + 
}, + .allow_readonly = false + } + ); + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "deltaLakeS3", + "SELECT * FROM deltaLakeS3(url, access_key_id, secret_access_key)", "" + }, + { + "deltaLakeS3", + "SELECT * FROM deltaLakeS3(url, access_key_id, secret_access_key) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +# endif +# if USE_AZURE_BLOB_STORAGE + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the DeltaLake table stored on object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "deltaLakeAzure", + "SELECT * FROM deltaLakeAzure(url, access_key_id, secret_access_key)", "" + }, + { + "deltaLakeAzure", + "SELECT * FROM deltaLakeAzure(url, access_key_id, secret_access_key) " + "SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +# endif +#endif + +#if USE_AWS_S3 + factory.registerFunction( + { + .documentation = { + .description=R"(The table function can be used to read the Hudi table stored on object store in parallel for many nodes in a specified cluster or from single node.)", + .examples{ + { + "hudi", + "SELECT * FROM hudi(url, access_key_id, secret_access_key)", "" + }, + { + "hudi", + "SELECT * FROM hudi(url, access_key_id, secret_access_key) SETTINGS object_storage_cluster='cluster'", "" + }, + }, + .category = FunctionDocumentation::Category::TableFunction + }, + .allow_readonly = false + } + ); +#endif +} + +} diff --git a/src/TableFunctions/TableFunctionObjectStorageClusterFallback.h b/src/TableFunctions/TableFunctionObjectStorageClusterFallback.h new file mode 100644 index 000000000000..d81acac2be50 --- /dev/null +++ b/src/TableFunctions/TableFunctionObjectStorageClusterFallback.h @@ -0,0 +1,49 @@ +#pragma once +#include "config.h" +#include + +namespace DB +{ + +/** +* Class implementing the s3/hdfs/azureBlobStorage(...) table functions, +* which allow using either the simple or the distributed variant of the function, based on settings. +* If the setting `object_storage_cluster` is empty, +* the simple single-host variant is used; if the setting is not empty, the cluster variant is used. +* `SELECT * FROM s3('s3://...', ...) SETTINGS object_storage_cluster='cluster'` +* is equivalent to +* `SELECT * FROM s3Cluster('cluster', 's3://...', ...)` +*/ + +template +class TableFunctionObjectStorageClusterFallback : public Base +{ +public: + using BaseCluster = Base; + using BaseSimple = BaseCluster::Base; + + static constexpr auto name = Definition::name; + + String getName() const override { return name; } + + void validateUseToCreateTable() const override; + +private: + const char * getStorageEngineName() const override + { + return is_cluster_function ? 
Definition::storage_engine_cluster_name : Definition::storage_engine_name; + } + + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; + + void parseArgumentsImpl(ASTs & args, const ContextPtr & context) override; + + bool is_cluster_function = false; +}; + +} diff --git a/src/TableFunctions/TableFunctionRemote.h b/src/TableFunctions/TableFunctionRemote.h index e58d30cf48df..498339231153 100644 --- a/src/TableFunctions/TableFunctionRemote.h +++ b/src/TableFunctions/TableFunctionRemote.h @@ -26,6 +26,8 @@ class TableFunctionRemote : public ITableFunction bool needStructureConversion() const override { return false; } + void setRemoteTableFunction(ASTPtr remote_table_function_ptr_) { remote_table_function_ptr = remote_table_function_ptr_; } + private: StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 643fb9fc94a6..c8497fbfdda2 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -68,7 +68,7 @@ void registerTableFunctions() registerTableFunctionObjectStorage(factory); registerTableFunctionObjectStorageCluster(factory); - registerDataLakeTableFunctions(factory); + registerTableFunctionObjectStorageClusterFallback(factory); registerDataLakeClusterTableFunctions(factory); #if USE_YTSAURUS diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index 4ee5c17d262d..84f0418bc4e1 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -69,7 +69,7 @@ void registerTableFunctionExplain(TableFunctionFactory & factory); void registerTableFunctionObjectStorage(TableFunctionFactory & factory); void registerTableFunctionObjectStorageCluster(TableFunctionFactory & factory); -void registerDataLakeTableFunctions(TableFunctionFactory & factory); +void registerTableFunctionObjectStorageClusterFallback(TableFunctionFactory & factory); void registerDataLakeClusterTableFunctions(TableFunctionFactory & factory); void registerTableFunctionTimeSeries(TableFunctionFactory & factory); diff --git a/tests/clickhouse-test b/tests/clickhouse-test index b0003e1403e8..077abaa0cdb6 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1146,6 +1146,8 @@ class SettingsRandomizer: "use_query_condition_cache": lambda: random.randint(0, 1), "secondary_indices_enable_bulk_filtering": lambda: random.randint(0, 1), "use_skip_indexes_if_final": lambda: random.randint(0, 1), + # Use the new reader most of the time. 
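+ # (min(1, random.randint(0, 5)) below evaluates to 1 unless randint returns 0, i.e. in roughly 5 out of 6 runs.)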
+ "input_format_parquet_use_native_reader_v3": lambda: min(1, random.randint(0, 5)), **randomize_external_sort_group_by(), } diff --git a/tests/config/config.d/enable_experimental_export_merge_tree_partition.xml b/tests/config/config.d/enable_experimental_export_merge_tree_partition.xml new file mode 100644 index 000000000000..72014c9de4db --- /dev/null +++ b/tests/config/config.d/enable_experimental_export_merge_tree_partition.xml @@ -0,0 +1,3 @@ + + 1 + diff --git a/tests/config/install.sh b/tests/config/install.sh index b8874097c940..90b70f3ce5cc 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -85,6 +85,7 @@ ln -sf $SRC_PATH/config.d/blob_storage_log.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/custom_settings_prefixes.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/database_catalog_drop_table_concurrency.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/enable_access_control_improvements.xml $DEST_SERVER_PATH/config.d/ +ln -sf $SRC_PATH/config.d/enable_experimental_export_merge_tree_partition.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/macros.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/secure_ports.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/clusters.xml $DEST_SERVER_PATH/config.d/ diff --git a/tests/integration/test_database_iceberg/configs/cluster.xml b/tests/integration/test_database_iceberg/configs/cluster.xml new file mode 100644 index 000000000000..b9638e40bc1e --- /dev/null +++ b/tests/integration/test_database_iceberg/configs/cluster.xml @@ -0,0 +1,12 @@ + + + + + + node1 + 9000 + + + + + \ No newline at end of file diff --git a/tests/integration/test_database_iceberg/test.py b/tests/integration/test_database_iceberg/test.py index 662b51e0f687..a92a1581dc48 100644 --- a/tests/integration/test_database_iceberg/test.py +++ b/tests/integration/test_database_iceberg/test.py @@ -14,12 +14,13 @@ import pytz from minio import Minio from pyiceberg.catalog import load_catalog -from pyiceberg.partitioning import PartitionField, PartitionSpec +from pyiceberg.partitioning import PartitionField, PartitionSpec, UNPARTITIONED_PARTITION_SPEC from pyiceberg.schema import Schema from pyiceberg.table.sorting import SortField, SortOrder from pyiceberg.transforms import DayTransform, IdentityTransform from pyiceberg.types import ( DoubleType, + LongType, FloatType, NestedField, StringType, @@ -27,6 +28,7 @@ TimestampType, TimestamptzType ) +from pyiceberg.table.sorting import UNSORTED_SORT_ORDER from helpers.cluster import ClickHouseCluster, ClickHouseInstance, is_arm from helpers.config_cluster import minio_secret_key, minio_access_key @@ -72,6 +74,8 @@ DEFAULT_SORT_ORDER = SortOrder(SortField(source_id=2, transform=IdentityTransform())) +AVAILABLE_ENGINES = ["DataLakeCatalog", "Iceberg"] + def list_namespaces(): response = requests.get(f"{BASE_URL_LOCAL}/namespaces") @@ -122,7 +126,7 @@ def generate_record(): def create_clickhouse_iceberg_database( - started_cluster, node, name, additional_settings={} + started_cluster, node, name, additional_settings={}, engine='DataLakeCatalog' ): settings = { "catalog_type": "rest", @@ -137,7 +141,7 @@ def create_clickhouse_iceberg_database( DROP DATABASE IF EXISTS {name}; SET allow_database_iceberg=true; SET write_full_path_in_iceberg_metadata=1; -CREATE DATABASE {name} ENGINE = DataLakeCatalog('{BASE_URL}', 'minio', '{minio_secret_key}') +CREATE DATABASE {name} ENGINE = {engine}('{BASE_URL}', 'minio', '{minio_secret_key}') SETTINGS {",".join((k+"="+repr(v) for k, v 
in settings.items()))} """ ) @@ -184,10 +188,11 @@ def started_cluster(): cluster = ClickHouseCluster(__file__) cluster.add_instance( "node1", - main_configs=["configs/backups.xml"], + main_configs=["configs/backups.xml", "configs/cluster.xml"], user_configs=[], stay_alive=True, with_iceberg_catalog=True, + with_zookeeper=True, ) logging.info("Starting cluster...") @@ -202,7 +207,8 @@ def started_cluster(): cluster.shutdown() -def test_list_tables(started_cluster): +@pytest.mark.parametrize("engine", AVAILABLE_ENGINES) +def test_list_tables(started_cluster, engine): node = started_cluster.instances["node1"] root_namespace = f"clickhouse_{uuid.uuid4()}" @@ -233,7 +239,7 @@ def test_list_tables(started_cluster): for namespace in [namespace_1, namespace_2]: assert len(catalog.list_tables(namespace)) == 0 - create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME, engine=engine) tables_list = "" for table in namespace_1_tables: @@ -268,7 +274,8 @@ def test_list_tables(started_cluster): ) -def test_many_namespaces(started_cluster): +@pytest.mark.parametrize("engine", AVAILABLE_ENGINES) +def test_many_namespaces(started_cluster, engine): node = started_cluster.instances["node1"] root_namespace_1 = f"A_{uuid.uuid4()}" root_namespace_2 = f"B_{uuid.uuid4()}" @@ -289,7 +296,7 @@ def test_many_namespaces(started_cluster): for table in tables: create_table(catalog, namespace, table) - create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME, engine=engine) for namespace in namespaces: for table in tables: @@ -301,7 +308,8 @@ def test_many_namespaces(started_cluster): ) -def test_select(started_cluster): +@pytest.mark.parametrize("engine", AVAILABLE_ENGINES) +def test_select(started_cluster, engine): node = started_cluster.instances["node1"] test_ref = f"test_list_tables_{uuid.uuid4()}" @@ -329,7 +337,7 @@ def test_select(started_cluster): df = pa.Table.from_pylist(data) table.append(df) - create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME, engine=engine) expected = DEFAULT_CREATE_TABLE.format(CATALOG_NAME, namespace, table_name) assert expected == node.query( @@ -343,7 +351,8 @@ def test_select(started_cluster): assert int(node.query(f"SELECT count() FROM system.iceberg_history WHERE table = '{namespace}.{table_name}' and database = '{CATALOG_NAME}'").strip()) == 1 -def test_hide_sensitive_info(started_cluster): +@pytest.mark.parametrize("engine", AVAILABLE_ENGINES) +def test_hide_sensitive_info(started_cluster, engine): node = started_cluster.instances["node1"] test_ref = f"test_hide_sensitive_info_{uuid.uuid4()}" @@ -361,6 +370,7 @@ def test_hide_sensitive_info(started_cluster): node, CATALOG_NAME, additional_settings={"catalog_credential": "SECRET_1"}, + engine=engine, ) assert "SECRET_1" not in node.query(f"SHOW CREATE DATABASE {CATALOG_NAME}") @@ -369,11 +379,13 @@ def test_hide_sensitive_info(started_cluster): node, CATALOG_NAME, additional_settings={"auth_header": "SECRET_2"}, + engine=engine, ) assert "SECRET_2" not in node.query(f"SHOW CREATE DATABASE {CATALOG_NAME}") -def test_tables_with_same_location(started_cluster): +@pytest.mark.parametrize("engine", AVAILABLE_ENGINES) +def test_tables_with_same_location(started_cluster, engine): node = started_cluster.instances["node1"] test_ref = 
f"test_tables_with_same_location_{uuid.uuid4()}" @@ -404,7 +416,7 @@ def record(key): df = pa.Table.from_pylist(data) table_2.append(df) - create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME, engine=engine) assert 'aaa\naaa\naaa' == node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{namespace}.{table_name}`").strip() assert 'bbb\nbbb\nbbb' == node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{namespace}.{table_name_2}`").strip() @@ -526,6 +538,52 @@ def test_timestamps(started_cluster): assert node.query(f"SHOW CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}`") == f"CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}`\\n(\\n `timestamp` Nullable(DateTime64(6)),\\n `timestamptz` Nullable(DateTime64(6, \\'UTC\\'))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse-rest/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n" assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`") == "2024-01-01 12:00:00.000000\t2024-01-01 12:00:00.000000\n" + # Berlin - UTC+1 at winter + # Istanbul - UTC+3 at winter + + # 'UTC' is default value, responce is equal to query above + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS iceberg_timezone_for_timestamptz='UTC' + """) == "2024-01-01 12:00:00.000000\t2024-01-01 12:00:00.000000\n" + # Timezone from setting + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS iceberg_timezone_for_timestamptz='Europe/Berlin' + """) == "2024-01-01 12:00:00.000000\t2024-01-01 13:00:00.000000\n" + # Empty value means session timezone, by default it is 'UTC' too + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS iceberg_timezone_for_timestamptz='' + """) == "2024-01-01 12:00:00.000000\t2024-01-01 12:00:00.000000\n" + # If session timezone is used, `timestamptz` does not changed, 'UTC' by default + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS session_timezone='Asia/Istanbul' + """) == "2024-01-01 15:00:00.000000\t2024-01-01 12:00:00.000000\n" + # Setiing `iceberg_timezone_for_timestamptz` does not affect `timestamp` column + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS session_timezone='Asia/Istanbul', iceberg_timezone_for_timestamptz='Europe/Berlin' + """) == "2024-01-01 15:00:00.000000\t2024-01-01 13:00:00.000000\n" + # Empty value, used non-default session timezone + assert node.query(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS session_timezone='Asia/Istanbul', iceberg_timezone_for_timestamptz='' + """) == "2024-01-01 15:00:00.000000\t2024-01-01 15:00:00.000000\n" + # Invalid timezone + assert "Invalid time zone: Foo/Bar" in node.query_and_get_error(f""" + SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + SETTINGS iceberg_timezone_for_timestamptz='Foo/Bar' + """) + + assert node.query(f"SHOW CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}` SETTINGS iceberg_timezone_for_timestamptz='UTC'") == f"CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}`\\n(\\n `timestamp` Nullable(DateTime64(6)),\\n `timestamptz` Nullable(DateTime64(6, \\'UTC\\'))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse-rest/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n" + assert node.query(f"SHOW CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}` SETTINGS 
iceberg_timezone_for_timestamptz='Europe/Berlin'") == f"CREATE TABLE {CATALOG_NAME}.`{root_namespace}.{table_name}`\\n(\\n `timestamp` Nullable(DateTime64(6)),\\n `timestamptz` Nullable(DateTime64(6, \\'Europe/Berlin\\'))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse-rest/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n" + + assert node.query(f"SELECT timezoneOf(timestamptz) FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` LIMIT 1") == "UTC\n" + assert node.query(f"SELECT timezoneOf(timestamptz) FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` LIMIT 1 SETTINGS iceberg_timezone_for_timestamptz='UTC'") == "UTC\n" + assert node.query(f"SELECT timezoneOf(timestamptz) FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` LIMIT 1 SETTINGS iceberg_timezone_for_timestamptz='Europe/Berlin'") == "Europe/Berlin\n" + def test_insert(started_cluster): node = started_cluster.instances["node1"] @@ -560,6 +618,7 @@ def test_create(started_cluster): node.query(f"INSERT INTO {CATALOG_NAME}.`{root_namespace}.{table_name}` VALUES ('AAPL');", settings={"allow_experimental_insert_into_iceberg": 1, 'write_full_path_in_iceberg_metadata': 1}) assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_name}`") == "AAPL\n" + def test_drop_table(started_cluster): node = started_cluster.instances["node1"] @@ -575,3 +634,167 @@ def test_drop_table(started_cluster): drop_clickhouse_iceberg_table(node, root_namespace, table_name) assert len(catalog.list_tables(root_namespace)) == 0 + + +def test_table_with_slash(started_cluster): + node = started_cluster.instances["node1"] + + # pyiceberg at current moment (version 0.9.1) has a bug with table names with slashes + # see https://github.com/apache/iceberg-python/issues/2462 + # so we need to encode it manually + table_raw_suffix = "table/foo" + table_encoded_suffix = "table%2Ffoo" + + test_ref = f"test_list_tables_{uuid.uuid4()}" + table_name = f"{test_ref}_{table_raw_suffix}" + table_encoded_name = f"{test_ref}_{table_encoded_suffix}" + root_namespace = f"{test_ref}_namespace" + + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(root_namespace) + + create_table(catalog, root_namespace, table_name, DEFAULT_SCHEMA, PartitionSpec(), DEFAULT_SORT_ORDER) + + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + node.query(f"INSERT INTO {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}` VALUES (NULL, 'AAPL', 193.24, 193.31, tuple('bot'));", settings={"allow_experimental_insert_into_iceberg": 1, 'write_full_path_in_iceberg_metadata': 1}) + assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n" + + +def test_cluster_joins(started_cluster): + node = started_cluster.instances["node1"] + + test_ref = f"test_join_tables_{uuid.uuid4()}" + table_name = f"{test_ref}_table" + table_name_2 = f"{test_ref}_table_2" + table_name_local = f"{test_ref}_table_local" + + root_namespace = f"{test_ref}_namespace" + + catalog = load_catalog_impl(started_cluster) + catalog.create_namespace(root_namespace) + + schema = Schema( + NestedField( + field_id=1, + name="tag", + field_type=LongType(), + required=False + ), + NestedField( + field_id=2, + name="name", + field_type=StringType(), + required=False, + ), + ) + table = create_table(catalog, root_namespace, table_name, schema, + partition_spec=UNPARTITIONED_PARTITION_SPEC, sort_order=UNSORTED_SORT_ORDER) + data = [{"tag": 1, "name": "John"}, {"tag": 2, "name": "Jack"}] + df = pa.Table.from_pylist(data) + 
table.append(df) + + schema2 = Schema( + NestedField( + field_id=1, + name="id", + field_type=LongType(), + required=False + ), + NestedField( + field_id=2, + name="second_name", + field_type=StringType(), + required=False, + ), + ) + table2 = create_table(catalog, root_namespace, table_name_2, schema2, + partition_spec=UNPARTITIONED_PARTITION_SPEC, sort_order=UNSORTED_SORT_ORDER) + data = [{"id": 1, "second_name": "Dow"}, {"id": 2, "second_name": "Sparrow"}] + df = pa.Table.from_pylist(data) + table2.append(df) + + node.query(f"CREATE TABLE `{table_name_local}` (id Int64, second_name String) ENGINE = Memory()") + node.query(f"INSERT INTO `{table_name_local}` VALUES (1, 'Silver'), (2, 'Black')") + + create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME) + + res = node.query( + f""" + SELECT t1.name,t2.second_name + FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` AS t1 + JOIN {CATALOG_NAME}.`{root_namespace}.{table_name_2}` AS t2 + ON t1.tag=t2.id + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "Jack\tSparrow\nJohn\tDow\n" + + #res = node.query( + # f""" + # SELECT name + # FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + # WHERE tag in ( + # SELECT id + # FROM {CATALOG_NAME}.`{root_namespace}.{table_name_2}` + # ) + # ORDER BY ALL + # SETTINGS + # object_storage_cluster='cluster_simple', + # object_storage_cluster_join_mode='local' + # """ + #) + + #assert res == "Jack\nJohn\n" + + res = node.query( + f""" + SELECT t1.name,t2.second_name + FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` AS t1 + JOIN `{table_name_local}` AS t2 + ON t1.tag=t2.id + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "Jack\tBlack\nJohn\tSilver\n" + + #res = node.query( + # f""" + # SELECT name + # FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` + # WHERE tag in ( + # SELECT id + # FROM `{table_name_local}` + # ) + # ORDER BY ALL + # SETTINGS + # object_storage_cluster='cluster_simple', + # object_storage_cluster_join_mode='local' + # """ + #) + + #assert res == "Jack\nJohn\n" + + res = node.query( + f""" + SELECT t1.name,t2.second_name + FROM {CATALOG_NAME}.`{root_namespace}.{table_name}` AS t1 + CROSS JOIN `{table_name_local}` AS t2 + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "Jack\tBlack\nJack\tSilver\nJohn\tBlack\nJohn\tSilver\n" diff --git a/tests/integration/test_export_merge_tree_part_to_object_storage/__init__.py b/tests/integration/test_export_merge_tree_part_to_object_storage/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_export_merge_tree_part_to_object_storage/configs/named_collections.xml b/tests/integration/test_export_merge_tree_part_to_object_storage/configs/named_collections.xml new file mode 100644 index 000000000000..d46920b7ba88 --- /dev/null +++ b/tests/integration/test_export_merge_tree_part_to_object_storage/configs/named_collections.xml @@ -0,0 +1,9 @@ + + + + http://minio1:9001/root/data + minio + ClickHouse_Minio_P@ssw0rd + + + \ No newline at end of file diff --git a/tests/integration/test_export_merge_tree_part_to_object_storage/test.py b/tests/integration/test_export_merge_tree_part_to_object_storage/test.py new file mode 
100644 index 000000000000..ce6b23bf4231 --- /dev/null +++ b/tests/integration/test_export_merge_tree_part_to_object_storage/test.py @@ -0,0 +1,131 @@ +import logging +import pytest +import random +import string +import time +from typing import Optional +import uuid + +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node1", + main_configs=["configs/named_collections.xml"], + with_minio=True, + ) + logging.info("Starting cluster...") + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def create_s3_table(node, s3_table): + node.query(f"CREATE TABLE {s3_table} (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='{s3_table}', format=Parquet, partition_strategy='hive') PARTITION BY year") + + +def create_tables_and_insert_data(node, mt_table, s3_table): + node.query(f"CREATE TABLE {mt_table} (id UInt64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple()") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021)") + + create_s3_table(node, s3_table) + + +def test_drop_column_during_export_snapshot(cluster): + node = cluster.instances["node1"] + + mt_table = "mutations_snapshot_mt_table" + s3_table = "mutations_snapshot_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table) + + # Block traffic to/from MinIO to force upload errors and retries, following existing S3 tests style + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + # Ensure export sees a consistent snapshot at start time even if we mutate the source later + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Block requests to MinIO (destination: MinIO, destination_port: minio_port) + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + # Start export of 2020 + node.query( + f"ALTER TABLE {mt_table} EXPORT PART '2020_1_1_0' TO TABLE {s3_table};" + ) + + # Drop a column that is required for the export + node.query(f"ALTER TABLE {mt_table} DROP COLUMN id") + + time.sleep(3) + # assert the mutation has been applied AND the data has not been exported yet + assert "Unknown expression identifier `id`" in node.query_and_get_error(f"SELECT id FROM {mt_table}"), "Column id is not removed" + + # Wait for export to finish and then verify destination still reflects the original snapshot (3 rows) + time.sleep(5) + assert node.query(f"SELECT count() FROM {s3_table} WHERE id >= 0") == '3\n', "Export did not preserve snapshot at start time after source mutation" + + +def test_add_column_during_export(cluster): + node = cluster.instances["node1"] + + mt_table = "add_column_during_export_mt_table" + s3_table = "add_column_during_export_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table) + + # Block traffic to/from MinIO to force upload errors and retries, following existing S3 tests style + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + # Ensure export sees a consistent snapshot at start time even if we mutate the source later + with PartitionManager() as pm: + # Block responses from MinIO 
(source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Block requests to MinIO (destination: MinIO, destination_port: minio_port) + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + # Start export of 2020 + node.query( + f"ALTER TABLE {mt_table} EXPORT PART '2020_1_1_0' TO TABLE {s3_table};" + ) + + node.query(f"ALTER TABLE {mt_table} ADD COLUMN id2 UInt64") + + time.sleep(3) + + # assert the mutation has been applied AND the data has not been exported yet + assert node.query(f"SELECT count(id2) FROM {mt_table}") == '4\n', "Column id2 is not added" + + # Wait for export to finish and then verify destination still reflects the original snapshot (3 rows) + time.sleep(5) + assert node.query(f"SELECT count() FROM {s3_table} WHERE id >= 0") == '3\n', "Export did not preserve snapshot at start time after source mutation" + assert "Unknown expression identifier `id2`" in node.query_and_get_error(f"SELECT id2 FROM {s3_table}"), "Column id2 is present in the exported data" diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/__init__.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/allow_experimental_export_partition.xml b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/allow_experimental_export_partition.xml new file mode 100644 index 000000000000..f8c5fab1a3be --- /dev/null +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/allow_experimental_export_partition.xml @@ -0,0 +1,3 @@ + + 1 + \ No newline at end of file diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/disable_experimental_export_partition.xml b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/disable_experimental_export_partition.xml new file mode 100644 index 000000000000..ba6508ebd660 --- /dev/null +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/disable_experimental_export_partition.xml @@ -0,0 +1,3 @@ + + 0 + \ No newline at end of file diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/named_collections.xml b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/named_collections.xml new file mode 100644 index 000000000000..d46920b7ba88 --- /dev/null +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/named_collections.xml @@ -0,0 +1,9 @@ + + + + http://minio1:9001/root/data + minio + ClickHouse_Minio_P@ssw0rd + + + \ No newline at end of file diff --git a/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/users.d/profile.xml b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/users.d/profile.xml new file mode 100644 index 000000000000..518f29708929 --- /dev/null +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/configs/users.d/profile.xml @@ -0,0 +1,8 @@ + + + + 3 + + + + diff --git 
a/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py new file mode 100644 index 000000000000..a4cb0807d6ee --- /dev/null +++ b/tests/integration/test_export_replicated_mt_partition_to_object_storage/test.py @@ -0,0 +1,749 @@ +import logging +import pytest +import random +import string +import time +from typing import Optional +import uuid + +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager + + +def wait_for_export_status( + node, + mt_table: str, + s3_table: str, + partition_id: str, + expected_status: str = "COMPLETED", + timeout: int = 30, + poll_interval: float = 0.5, +): + start_time = time.time() + while time.time() - start_time < timeout: + status = node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '{partition_id}' + """ + ).strip() + + if status and status == expected_status: + return status + + time.sleep(poll_interval) + + raise TimeoutError( + f"Export status did not reach '{expected_status}' within {timeout}s. ") + + +def wait_for_export_to_start( + node, + mt_table: str, + s3_table: str, + partition_id: str, + timeout: int = 10, + poll_interval: float = 0.2, +): + start_time = time.time() + while time.time() - start_time < timeout: + count = node.query( + f""" + SELECT count() FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '{partition_id}' + """ + ).strip() + + if count != '0': + return True + + time.sleep(poll_interval) + + raise TimeoutError(f"Export did not start within {timeout}s. 
") + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "replica1", + main_configs=["configs/named_collections.xml", "configs/allow_experimental_export_partition.xml"], + user_configs=["configs/users.d/profile.xml"], + with_minio=True, + stay_alive=True, + with_zookeeper=True, + ) + cluster.add_instance( + "replica2", + main_configs=["configs/named_collections.xml", "configs/allow_experimental_export_partition.xml"], + user_configs=["configs/users.d/profile.xml"], + with_minio=True, + stay_alive=True, + with_zookeeper=True, + ) + # node that does not participate in the export, but will have visibility over the s3 table + cluster.add_instance( + "watcher_node", + main_configs=["configs/named_collections.xml"], + user_configs=[], + with_minio=True, + ) + cluster.add_instance( + "replica_with_export_disabled", + main_configs=["configs/named_collections.xml", "configs/disable_experimental_export_partition.xml"], + user_configs=["configs/users.d/profile.xml"], + with_minio=True, + stay_alive=True, + with_zookeeper=True, + ) + logging.info("Starting cluster...") + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def create_s3_table(node, s3_table): + node.query(f"CREATE TABLE {s3_table} (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='{s3_table}', format=Parquet, partition_strategy='hive') PARTITION BY year") + + +def create_tables_and_insert_data(node, mt_table, s3_table, replica_name): + node.query(f"CREATE TABLE {mt_table} (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{mt_table}', '{replica_name}') PARTITION BY year ORDER BY tuple()") + node.query(f"INSERT INTO {mt_table} VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021)") + + create_s3_table(node, s3_table) + + +def test_restart_nodes_during_export(cluster): + node = cluster.instances["replica1"] + node2 = cluster.instances["replica2"] + watcher_node = cluster.instances["watcher_node"] + + mt_table = "disaster_mt_table" + s3_table = "disaster_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + create_tables_and_insert_data(node2, mt_table, s3_table, "replica2") + create_s3_table(watcher_node, s3_table) + + # Block S3/MinIO requests to keep exports alive via retry mechanism + # This allows ZooKeeper operations to proceed quickly + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses_node1 = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses_node1) + + pm_rule_reject_responses_node2 = { + "destination": node2.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses_node2) + + # Block requests to MinIO (destination: MinIO, destination_port: minio_port) + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + export_queries = f""" + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2020' TO TABLE {s3_table} + SETTINGS export_merge_tree_partition_max_retries = 50; + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2021' TO TABLE {s3_table} + SETTINGS export_merge_tree_partition_max_retries = 50; + """ + + node.query(export_queries) + + # wait for the exports to 
start + wait_for_export_to_start(node, mt_table, s3_table, "2020") + wait_for_export_to_start(node, mt_table, s3_table, "2021") + + node.stop_clickhouse(kill=True) + node2.stop_clickhouse(kill=True) + + assert watcher_node.query(f"SELECT count() FROM {s3_table} where year = 2020") == '0\n', "Partition 2020 was written to S3 during network delay crash" + + assert watcher_node.query(f"SELECT count() FROM {s3_table} where year = 2021") == '0\n', "Partition 2021 was written to S3 during network delay crash" + + # start the nodes, they should finish the export + node.start_clickhouse() + node2.start_clickhouse() + + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + wait_for_export_status(node, mt_table, s3_table, "2021", "COMPLETED") + + assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") != f'0\n', "Export of partition 2020 did not resume after crash" + + assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2021") != f'0\n', "Export of partition 2021 did not resume after crash" + + +def test_kill_export(cluster): + node = cluster.instances["replica1"] + node2 = cluster.instances["replica2"] + watcher_node = cluster.instances["watcher_node"] + + mt_table = "kill_export_mt_table" + s3_table = "kill_export_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + create_tables_and_insert_data(node2, mt_table, s3_table, "replica2") + + # Block S3/MinIO requests to keep exports alive via retry mechanism + # This allows ZooKeeper operations (KILL) to proceed quickly + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Block requests to MinIO (destination: MinIO, destination_port: minio_port) + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + export_queries = f""" + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2020' TO TABLE {s3_table} + SETTINGS export_merge_tree_partition_max_retries = 50; + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2021' TO TABLE {s3_table} + SETTINGS export_merge_tree_partition_max_retries = 50; + """ + + node.query(export_queries) + + # Kill only 2020 while S3 is blocked - retry mechanism keeps exports alive + # ZooKeeper operations (KILL) proceed quickly since only S3 is blocked + node.query(f"KILL EXPORT PARTITION WHERE partition_id = '2020' and source_table = '{mt_table}' and destination_table = '{s3_table}'") + + # wait for 2021 to finish + wait_for_export_status(node, mt_table, s3_table, "2021", "COMPLETED") + + # checking for the commit file because maybe the data file was too fast? 
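+ # (the other tests in this file treat a commit_<partition>_* marker as the sign of a finished export, so its absence shows the killed export never committed)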
+ assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '0\n', "Partition 2020 was written to S3, it was not killed as expected" + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2021_*', format=LineAsString)") != f'0\n', "Partition 2021 was not written to S3, but it should have been" + + # check system.replicated_partition_exports for the export, status should be KILLED + assert node.query(f"SELECT status FROM system.replicated_partition_exports WHERE partition_id = '2020' and source_table = '{mt_table}' and destination_table = '{s3_table}'") == 'KILLED\n', "Partition 2020 was not killed as expected" + assert node.query(f"SELECT status FROM system.replicated_partition_exports WHERE partition_id = '2021' and source_table = '{mt_table}' and destination_table = '{s3_table}'") == 'COMPLETED\n', "Partition 2021 was not completed, this is unexpected" + + +def test_drop_source_table_during_export(cluster): + node = cluster.instances["replica1"] + # node2 = cluster.instances["replica2"] + watcher_node = cluster.instances["watcher_node"] + + mt_table = "drop_source_table_during_export_mt_table" + s3_table = "drop_source_table_during_export_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + # create_tables_and_insert_data(node2, mt_table, s3_table, "replica2") + create_s3_table(watcher_node, s3_table) + + # Block S3/MinIO requests to keep exports alive via retry mechanism + # This allows ZooKeeper operations (KILL) to proceed quickly + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Block requests to MinIO (destination: MinIO, destination_port: minio_port) + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + export_queries = f""" + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2020' TO TABLE {s3_table}; + ALTER TABLE {mt_table} + EXPORT PARTITION ID '2021' TO TABLE {s3_table}; + """ + + node.query(export_queries) + + # This should kill the background operations and drop the table + node.query(f"DROP TABLE {mt_table}") + + # Sleep some time to let the export finish (assuming it was not properly cancelled) + time.sleep(10) + + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_*', format=LineAsString)") == '0\n', "Background operations completed even with the table dropped" + + +def test_concurrent_exports_to_different_targets(cluster): + node = cluster.instances["replica1"] + + mt_table = "concurrent_diff_targets_mt_table" + s3_table_a = "concurrent_diff_targets_s3_a" + s3_table_b = "concurrent_diff_targets_s3_b" + + create_tables_and_insert_data(node, mt_table, s3_table_a, "replica1") + create_s3_table(node, s3_table_b) + + # Launch two exports of the same partition to two different S3 tables concurrently + with PartitionManager() as pm: + pm.add_network_delay(node, delay_ms=1000) + + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table_a}" + ) + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table_b}" + ) + + wait_for_export_status(node, mt_table, 
s3_table_a, "2020", "COMPLETED") + wait_for_export_status(node, mt_table, s3_table_b, "2020", "COMPLETED") + + # Both targets should receive the same data independently + assert node.query(f"SELECT count() FROM {s3_table_a} WHERE year = 2020") == '3\n', "First target did not receive expected rows" + assert node.query(f"SELECT count() FROM {s3_table_b} WHERE year = 2020") == '3\n', "Second target did not receive expected rows" + + # And both should have a commit marker + assert node.query( + f"SELECT count() FROM s3(s3_conn, filename='{s3_table_a}/commit_2020_*', format=LineAsString)" + ) != '0\n', "Commit file missing for first target" + assert node.query( + f"SELECT count() FROM s3(s3_conn, filename='{s3_table_b}/commit_2020_*', format=LineAsString)" + ) != '0\n', "Commit file missing for second target" + + +def test_failure_is_logged_in_system_table(cluster): + node = cluster.instances["replica1"] + + mt_table = "failure_is_logged_in_system_table_mt_table" + s3_table = "failure_is_logged_in_system_table_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + # Block traffic to/from MinIO to force upload errors and retries, following existing S3 tests style + minio_ip = cluster.minio_ip + minio_port = cluster.minio_port + + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Also block requests to MinIO (destination: MinIO, destination_port: 9001) with REJECT to fail fast + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_max_retries=1;" + ) + + # Wait so that the export fails + wait_for_export_status(node, mt_table, s3_table, "2020", "FAILED", timeout=10) + + # Network restored; verify the export is marked as FAILED in the system table + # Also verify we captured at least one exception and no commit file exists + status = node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) + + assert status.strip() == "FAILED", f"Expected FAILED status, got: {status!r}" + + exception_count = node.query( + f""" + SELECT any(exception_count) FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) + assert int(exception_count.strip()) > 0, "Expected non-zero exception_count in system.replicated_partition_exports" + + # No commit should have been produced for this partition + assert node.query( + f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)" + ) == '0\n', "Commit file exists despite forced S3 failures" + + +def test_inject_short_living_failures(cluster): + node = cluster.instances["replica1"] + + mt_table = "inject_short_living_failures_mt_table" + s3_table = "inject_short_living_failures_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + # Block traffic to/from MinIO to force upload errors and retries, following existing S3 tests style + minio_ip = cluster.minio_ip + minio_port = 
cluster.minio_port + + with PartitionManager() as pm: + # Block responses from MinIO (source_port matches MinIO service) + pm_rule_reject_responses = { + "destination": node.ip_address, + "source_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_responses) + + # Also block requests to MinIO (destination: MinIO, destination_port: 9001) with REJECT to fail fast + pm_rule_reject_requests = { + "destination": minio_ip, + "destination_port": minio_port, + "action": "REJECT --reject-with tcp-reset", + } + pm._add_rule(pm_rule_reject_requests) + + # set big max_retries so that the export does not fail completely + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_max_retries=100;" + ) + + # wait only for a second to get at least one failure, but not enough to finish the export + time.sleep(5) + + # wait for the export to finish + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + # Assert the export succeeded + assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") == '3\n', "Export did not succeed" + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed" + + # check system.replicated_partition_exports for the export + assert node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) == "COMPLETED\n", "Export should be marked as COMPLETED" + + exception_count = node.query( + f""" + SELECT exception_count FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) + assert int(exception_count.strip()) >= 1, "Expected at least one exception" + + +def test_export_ttl(cluster): + node = cluster.instances["replica1"] + + mt_table = "export_ttl_mt_table" + s3_table = "export_ttl_s3_table" + + expiration_time = 3 + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + # start export + node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_manifest_ttl={expiration_time};") + + # assert that I get an error when trying to export the same partition again, query_and_get_error + error = node.query_and_get_error(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};") + assert "Export with key" in error, "Expected error about expired export" + + # wait for the export to finish and for the manifest to expire + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + time.sleep(expiration_time * 2) + + # assert that the export succeeded, check the commit file + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '1\n', "Export did not succeed" + + # start export again + node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}") + + # wait for the export to finish + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + # assert that the export succeeded, check the commit file + # there should be two commit files now, one for the first export and one for the second export + assert node.query(f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_2020_*', format=LineAsString)") == '2\n', "Export did not succeed" + + 
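+# Hypothetical helper (illustration only; not used by the tests in this file): the commit-file
+# assertions above all issue the same raw s3() query by hand. A sketch of wrapping that intent,
+# assuming the existing `s3_conn` named collection from configs/named_collections.xml and the
+# commit_<partition>_* marker naming convention used throughout this file:
+def count_commit_files(node, s3_table, partition_id):
+    """Return the number of commit_<partition>_* markers written for an exported partition."""
+    return int(node.query(
+        f"SELECT count() FROM s3(s3_conn, filename='{s3_table}/commit_{partition_id}_*', format=LineAsString)"
+    ).strip())
+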
+def test_export_partition_file_already_exists_policy(cluster): + node = cluster.instances["replica1"] + + mt_table = "export_partition_file_already_exists_policy_mt_table" + s3_table = "export_partition_file_already_exists_policy_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + # stop merges so part names remain stable. it is important for the test. + node.query(f"SYSTEM STOP MERGES {mt_table}") + + # Export all parts + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}", + ) + + # check system.replicated_partition_exports for the export + assert node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) == "COMPLETED\n", "Export should be marked as COMPLETED" + + # wait for the exports to finish + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + # try to export the partition + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export=1" + ) + + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + assert node.query( + f""" + SELECT count() FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + AND status = 'COMPLETED' + """ + ) == '1\n', "Expected the export to be marked as COMPLETED" + + # overwrite policy + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export=1, export_merge_tree_part_file_already_exists_policy='overwrite'" + ) + + # wait for the export to finish + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + # check system.replicated_partition_exports for the export + # ideally we would make sure the transaction id is different, but I do not have the time to do that now + assert node.query( + f""" + SELECT count() FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + AND status = 'COMPLETED' + """ + ) == '1\n', "Expected the export to be marked as COMPLETED" + + # last but not least, let's try with the error policy + # max retries = 1 so it fails fast + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table} SETTINGS export_merge_tree_partition_force_export=1, export_merge_tree_part_file_already_exists_policy='error', export_merge_tree_partition_max_retries=1", + ) + + # wait for the export to finish + wait_for_export_status(node, mt_table, s3_table, "2020", "FAILED") + + # check system.replicated_partition_exports for the export + assert node.query( + f""" + SELECT count() FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + AND status = 'FAILED' + """ + ) == '1\n', "Expected the export to be marked as FAILED" + + +def test_export_partition_feature_is_disabled(cluster): + replica_with_export_disabled = cluster.instances["replica_with_export_disabled"] + + mt_table = "export_partition_feature_is_disabled_mt_table" + s3_table = "export_partition_feature_is_disabled_s3_table" + + create_tables_and_insert_data(replica_with_export_disabled, mt_table, s3_table, "replica1") + + error = replica_with_export_disabled.query_and_get_error(f"ALTER TABLE {mt_table} 
EXPORT PARTITION ID '2020' TO TABLE {s3_table};") + assert "experimental" in error, "Expected error about disabled feature" + + # make sure kill operation also throws + error = replica_with_export_disabled.query_and_get_error(f"KILL EXPORT PARTITION WHERE partition_id = '2020' and source_table = '{mt_table}' and destination_table = '{s3_table}'") + assert "experimental" in error, "Expected error about disabled feature" + + +def test_export_partition_permissions(cluster): + """Test that export partition validates permissions correctly: + - User needs ALTER permission on source table + - User needs INSERT permission on destination table + """ + node = cluster.instances["replica1"] + + mt_table = "permissions_mt_table" + s3_table = "permissions_s3_table" + + # Create tables as default user + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + # Create test users with specific permissions + node.query("CREATE USER IF NOT EXISTS user_no_alter IDENTIFIED WITH no_password") + node.query("CREATE USER IF NOT EXISTS user_no_insert IDENTIFIED WITH no_password") + node.query("CREATE USER IF NOT EXISTS user_with_permissions IDENTIFIED WITH no_password") + + # Grant basic access to all users + node.query(f"GRANT SELECT ON {mt_table} TO user_no_alter") + node.query(f"GRANT SELECT ON {s3_table} TO user_no_alter") + + # user_no_insert has ALTER on source but no INSERT on destination + node.query(f"GRANT ALTER ON {mt_table} TO user_no_insert") + node.query(f"GRANT SELECT ON {s3_table} TO user_no_insert") + + # user_with_permissions has both ALTER and INSERT + node.query(f"GRANT ALTER ON {mt_table} TO user_with_permissions") + node.query(f"GRANT INSERT ON {s3_table} TO user_with_permissions") + + # Test 1: User without ALTER permission should fail + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}", + user="user_no_alter" + ) + assert "ACCESS_DENIED" in error or "Not enough privileges" in error, \ + f"Expected ACCESS_DENIED error for user without ALTER, got: {error}" + + # Test 2: User with ALTER but without INSERT permission should fail + error = node.query_and_get_error( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}", + user="user_no_insert" + ) + assert "ACCESS_DENIED" in error or "Not enough privileges" in error, \ + f"Expected ACCESS_DENIED error for user without INSERT, got: {error}" + + # Test 3: User with both ALTER and INSERT should succeed + node.query( + f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}", + user="user_with_permissions" + ) + + # Wait for export to complete + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + + # Verify the export succeeded + result = node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") + assert result.strip() == "3", f"Expected 3 rows exported, got: {result}" + + # Verify system table shows COMPLETED status + status = node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) + assert status.strip() == "COMPLETED", f"Expected COMPLETED status, got: {status}" + + +# assert multiple exports within a single query are executed. 
They all share the same query id +# and previously the transaction id was the query id, which would cause problems +def test_multiple_exports_within_a_single_query(cluster): + node = cluster.instances["replica1"] + + mt_table = "multiple_exports_within_a_single_query_mt_table" + s3_table = "multiple_exports_within_a_single_query_s3_table" + + create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + + node.query(f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table}, EXPORT PARTITION ID '2021' TO TABLE {s3_table};") + + wait_for_export_status(node, mt_table, s3_table, "2020", "COMPLETED") + wait_for_export_status(node, mt_table, s3_table, "2021", "COMPLETED") + + # assert the exports have been executed + assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") == '3\n', "Export did not succeed" + assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2021") == '1\n', "Export did not succeed" + + # check system.replicated_partition_exports for the exports + assert node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2020' + """ + ) == "COMPLETED\n", "Export should be marked as COMPLETED" + + assert node.query( + f""" + SELECT status FROM system.replicated_partition_exports + WHERE source_table = '{mt_table}' + AND destination_table = '{s3_table}' + AND partition_id = '2021' + """ + ) == "COMPLETED\n", "Export should be marked as COMPLETED" + +# def test_source_mutations_during_export_snapshot(cluster): +# node = cluster.instances["replica1"] + +# mt_table = "mutations_snapshot_mt_table" +# s3_table = "mutations_snapshot_s3_table" + +# create_tables_and_insert_data(node, mt_table, s3_table, "replica1") + +# # Ensure export sees a consistent snapshot at start time even if we mutate the source later +# with PartitionManager() as pm: +# pm.add_network_delay(node, delay_ms=5000) + +# # Start export of 2020 +# node.query( +# f"ALTER TABLE {mt_table} EXPORT PARTITION ID '2020' TO TABLE {s3_table};" +# ) + +# # Mutate the source after export started (delete the same partition) +# node.query(f"ALTER TABLE {mt_table} DROP COLUMN id") + +# # assert the mutation has been applied AND the data has not been exported yet +# assert node.query(f"SELECT count() FROM {mt_table} WHERE year = 2020") == '0\n', "Mutation has not been applied" +# assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") == '0\n', "Data has been exported" + +# # Wait for export to finish and then verify destination still reflects the original snapshot (3 rows) +# time.sleep(5) +# assert node.query(f"SELECT count() FROM {s3_table} WHERE year = 2020") == '3\n', "Export did not preserve snapshot at start time after source mutation" diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 453613060b91..08d6ba193f25 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -273,11 +273,13 @@ def test_create_table(): f"IcebergS3('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", "DNS_ERROR", ), + ( + f"Iceberg(storage_type='s3', 'http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", + "DNS_ERROR", + ), f"AzureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", f"AzureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')", - 
f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '{azure_account_key}')", - f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", - f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", + f"AzureQueue('{azure_conn_string}', 'cont', '*', 'CSV') SETTINGS mode = 'unordered'", f"AzureQueue('{azure_conn_string}', 'cont', '*', 'CSV', 'none') SETTINGS mode = 'unordered'", f"AzureQueue('{azure_storage_account_url}', 'cont', '*', '{azure_account_name}', '{azure_account_key}', 'CSV') SETTINGS mode = 'unordered'", @@ -286,6 +288,45 @@ def test_create_table(): f"AzureBlobStorage('BlobEndpoint=https://my-endpoint/;SharedAccessSignature=sp=r&st=2025-09-29T14:58:11Z&se=2025-09-29T00:00:00Z&spr=https&sv=2022-11-02&sr=c&sig=SECRET%SECRET%SECRET%SECRET', 'exampledatasets', 'example.csv')", "STD_EXCEPTION", ), + + f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '{azure_account_key}')", + f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", + f"AzureBlobStorage(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"AzureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + ( + f"IcebergAzure('{azure_conn_string}', 'cont', 'test_simple.csv')", + "FILE_DOESNT_EXIST", + ), + ( + f"IcebergAzure('{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '{azure_account_key}')", + "FILE_DOESNT_EXIST", + ), + ( + f"IcebergAzure(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + "FILE_DOESNT_EXIST", + ), + ( + f"IcebergAzure(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + "FILE_DOESNT_EXIST", + ), + ( + f"Iceberg(storage_type='azure', '{azure_conn_string}', 'cont', 'test_simple.csv')", + "FILE_DOESNT_EXIST", + ), + ( + f"Iceberg(storage_type='azure', '{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '{azure_account_key}')", + "FILE_DOESNT_EXIST", + ), + ( + f"Iceberg(storage_type='azure', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + "FILE_DOESNT_EXIST", + ), + ( + f"Iceberg(storage_type='azure', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + "FILE_DOESNT_EXIST", + ), + f"Kafka() SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '{password}', 
format_avro_schema_registry_url = 'http://schema_user:{password}@'", f"Kafka() SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '{password}', format_avro_schema_registry_url = 'http://schema_user:{password}@domain.com'", f"S3('http://minio1:9001/root/data/test5.csv.gz', 'CSV', access_key_id = 'minio', secret_access_key = '{password}', compression_method = 'gzip')", @@ -355,19 +396,31 @@ def make_test_case(i): "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') SETTINGS mode = 'ordered'", "CREATE TABLE table21 (`x` int) ENGINE = Iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE table22 (`x` int) ENGINE = IcebergS3('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", - f"CREATE TABLE table23 (`x` int) ENGINE = AzureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", - f"CREATE TABLE table24 (`x` int) ENGINE = AzureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')", - f"CREATE TABLE table25 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '[HIDDEN]')", - f"CREATE TABLE table26 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", - f"CREATE TABLE table27 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", - f"CREATE TABLE table28 (`x` int) ENGINE = AzureQueue('{masked_azure_conn_string}', 'cont', '*', 'CSV') SETTINGS mode = 'unordered'", - f"CREATE TABLE table29 (`x` int) ENGINE = AzureQueue('{masked_azure_conn_string}', 'cont', '*', 'CSV', 'none') SETTINGS mode = 'unordered'", - f"CREATE TABLE table30 (`x` int) ENGINE = AzureQueue('{azure_storage_account_url}', 'cont', '*', '{azure_account_name}', '[HIDDEN]', 'CSV') SETTINGS mode = 'unordered'", - f"CREATE TABLE table31 (`x` int) ENGINE = AzureQueue('{azure_storage_account_url}', 'cont', '*', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none') SETTINGS mode = 'unordered'", - f"CREATE TABLE table32 (`x` int) ENGINE = AzureBlobStorage('{masked_sas_conn_string}', 'exampledatasets', 'example.csv')", - "CREATE TABLE table33 (`x` int) ENGINE = Kafka SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '[HIDDEN]', format_avro_schema_registry_url = 'http://schema_user:[HIDDEN]@'", - "CREATE TABLE table34 (`x` int) ENGINE = Kafka SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '[HIDDEN]', format_avro_schema_registry_url = 'http://schema_user:[HIDDEN]@domain.com'", - "CREATE TABLE table35 (`x` int) ENGINE = S3('http://minio1:9001/root/data/test5.csv.gz', 'CSV', access_key_id = 'minio', secret_access_key = '[HIDDEN]', compression_method = 'gzip')", + "CREATE TABLE table23 (`x` int) ENGINE = Iceberg(storage_type = 's3', 
'http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", + f"CREATE TABLE table24 (`x` int) ENGINE = AzureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", + f"CREATE TABLE table25 (`x` int) ENGINE = AzureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')", + f"CREATE TABLE table26 (`x` int) ENGINE = AzureQueue('{masked_azure_conn_string}', 'cont', '*', 'CSV') SETTINGS mode = 'unordered'", + f"CREATE TABLE table27 (`x` int) ENGINE = AzureQueue('{masked_azure_conn_string}', 'cont', '*', 'CSV', 'none') SETTINGS mode = 'unordered'", + f"CREATE TABLE table28 (`x` int) ENGINE = AzureQueue('{azure_storage_account_url}', 'cont', '*', '{azure_account_name}', '[HIDDEN]', 'CSV') SETTINGS mode = 'unordered'", + f"CREATE TABLE table29 (`x` int) ENGINE = AzureQueue('{azure_storage_account_url}', 'cont', '*', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none') SETTINGS mode = 'unordered'", + f"CREATE TABLE table30 (`x` int) ENGINE = AzureBlobStorage('{masked_sas_conn_string}', 'exampledatasets', 'example.csv')", + f"CREATE TABLE table31 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE table32 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE table33 (`x` int) ENGINE = AzureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", + f"CREATE TABLE table34 (`x` int) ENGINE = AzureBlobStorage(named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE table35 (`x` int) ENGINE = AzureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + f"CREATE TABLE table36 (`x` int) ENGINE = IcebergAzure('{masked_azure_conn_string}', 'cont', 'test_simple.csv')", + f"CREATE TABLE table37 (`x` int) ENGINE = IcebergAzure('{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE table38 (`x` int) ENGINE = IcebergAzure(named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE table39 (`x` int) ENGINE = IcebergAzure(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + f"CREATE TABLE table40 (`x` int) ENGINE = Iceberg(storage_type = 'azure', '{masked_azure_conn_string}', 'cont', 'test_simple.csv')", + f"CREATE TABLE table41 (`x` int) ENGINE = Iceberg(storage_type = 'azure', '{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE table42 (`x` int) ENGINE = Iceberg(storage_type = 'azure', named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE table43 (`x` int) ENGINE = Iceberg(storage_type = 'azure', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + 
+ "CREATE TABLE table44 (`x` int) ENGINE = Kafka SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '[HIDDEN]', format_avro_schema_registry_url = 'http://schema_user:[HIDDEN]@'", + "CREATE TABLE table45 (`x` int) ENGINE = Kafka SETTINGS kafka_broker_list = '127.0.0.1', kafka_topic_list = 'topic', kafka_group_name = 'group', kafka_format = 'JSONEachRow', kafka_security_protocol = 'sasl_ssl', kafka_sasl_mechanism = 'PLAIN', kafka_sasl_username = 'user', kafka_sasl_password = '[HIDDEN]', format_avro_schema_registry_url = 'http://schema_user:[HIDDEN]@domain.com'", + "CREATE TABLE table46 (`x` int) ENGINE = S3('http://minio1:9001/root/data/test5.csv.gz', 'CSV', access_key_id = 'minio', secret_access_key = '[HIDDEN]', compression_method = 'gzip')", ], must_not_contain=[password], ) @@ -482,9 +535,22 @@ def test_table_functions(): f"azureBlobStorage(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", f"azureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", f"iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", - f"gcs('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", + f"iceberg(named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '{password}')", f"icebergS3('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", + f"icebergS3(named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '{password}')", + f"icebergAzure('{azure_conn_string}', 'cont', 'test_simple.csv')", + f"icebergAzure('{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '{azure_account_key}')", f"icebergAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", + f"icebergAzure(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"icebergAzure(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + f"iceberg(storage_type='s3', 'http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", + f"iceberg(storage_type='s3', named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '{password}')", + f"iceberg(storage_type='azure', '{azure_conn_string}', 'cont', 'test_simple.csv')", + f"iceberg(storage_type='azure', '{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '{azure_account_key}')", + f"iceberg(storage_type='azure', '{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", + f"iceberg(storage_type='azure', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"iceberg(storage_type='azure', named_collection_2, storage_account_url = 
'{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + f"gcs('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", f"deltaLakeAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", f"hudi('http://minio1:9001/root/data/test7.csv', 'minio', '{password}')", ] @@ -567,11 +633,24 @@ def make_test_case(i): f"CREATE TABLE tablefunc37 (`x` int) AS azureBlobStorage(named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", "CREATE TABLE tablefunc39 (`x` int) AS iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", - "CREATE TABLE tablefunc40 (`x` int) AS gcs('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", + "CREATE TABLE tablefunc40 (`x` int) AS iceberg(named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '[HIDDEN]')", "CREATE TABLE tablefunc41 (`x` int) AS icebergS3('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", - f"CREATE TABLE tablefunc42 (`x` int) AS icebergAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", - f"CREATE TABLE tablefunc43 (`x` int) AS deltaLakeAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", - "CREATE TABLE tablefunc44 (`x` int) AS hudi('http://minio1:9001/root/data/test7.csv', 'minio', '[HIDDEN]')", + "CREATE TABLE tablefunc42 (`x` int) AS icebergS3(named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '[HIDDEN]')", + f"CREATE TABLE tablefunc43 (`x` int) AS icebergAzure('{masked_azure_conn_string}', 'cont', 'test_simple.csv')", + f"CREATE TABLE tablefunc44 (`x` int) AS icebergAzure('{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc45 (`x` int) AS icebergAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc46 (`x` int) AS icebergAzure(named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE tablefunc47 (`x` int) AS icebergAzure(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + "CREATE TABLE tablefunc48 (`x` int) AS iceberg(storage_type = 's3', 'http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", + "CREATE TABLE tablefunc49 (`x` int) AS iceberg(storage_type = 's3', named_collection_2, url = 'http://minio1:9001/root/data/test4.csv', access_key_id = 'minio', secret_access_key = '[HIDDEN]')", + f"CREATE TABLE tablefunc50 (`x` int) AS iceberg(storage_type = 'azure', '{masked_azure_conn_string}', 'cont', 'test_simple.csv')", + f"CREATE TABLE tablefunc51 (`x` int) AS iceberg(storage_type = 'azure', 
'{azure_storage_account_url}', 'cont', 'test_simple.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc52 (`x` int) AS iceberg(storage_type = 'azure', '{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc53 (`x` int) AS iceberg(storage_type = 'azure', named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE tablefunc54 (`x` int) AS iceberg(storage_type = 'azure', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + "CREATE TABLE tablefunc55 (`x` int) AS gcs('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", + f"CREATE TABLE tablefunc56 (`x` int) AS deltaLakeAzure('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + "CREATE TABLE tablefunc57 (`x` int) AS hudi('http://minio1:9001/root/data/test7.csv', 'minio', '[HIDDEN]')", ], must_not_contain=[password], ) diff --git a/tests/integration/test_s3_cache_locality/__init__.py b/tests/integration/test_s3_cache_locality/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/integration/test_s3_cache_locality/configs/cluster.xml b/tests/integration/test_s3_cache_locality/configs/cluster.xml new file mode 100644 index 000000000000..db54c35374b9 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/cluster.xml @@ -0,0 +1,126 @@ + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + + + + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + + + + + + clickhouse3 + 9000 + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse1 + 9000 + + + clickhouse2 + 9000 + + + + + + + + clickhouse4 + 9000 + + + clickhouse5 + 9000 + + + clickhouse2 + 9000 + + + clickhouse3 + 9000 + + + + + + + + + /var/lib/clickhouse/raw_s3_cache + 10Gi + + + diff --git a/tests/integration/test_s3_cache_locality/configs/named_collections.xml b/tests/integration/test_s3_cache_locality/configs/named_collections.xml new file mode 100644 index 000000000000..6994aa3f5e77 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/named_collections.xml @@ -0,0 +1,10 @@ + + + + http://minio1:9001/root/data/* + minio + ClickHouse_Minio_P@ssw0rd + CSV> + + + diff --git a/tests/integration/test_s3_cache_locality/configs/users.xml b/tests/integration/test_s3_cache_locality/configs/users.xml new file mode 100644 index 000000000000..4b6ba057ecb1 --- /dev/null +++ b/tests/integration/test_s3_cache_locality/configs/users.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_s3_cache_locality/test.py b/tests/integration/test_s3_cache_locality/test.py new file mode 100644 index 000000000000..68993d85aeed --- /dev/null +++ b/tests/integration/test_s3_cache_locality/test.py @@ -0,0 +1,262 @@ +import csv +import logging +import os +import shutil +import uuid + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.config_cluster import minio_secret_key + + +logging.getLogger().setLevel(logging.INFO) 
+logging.getLogger().addHandler(logging.StreamHandler()) + +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) + + +def create_buckets_s3(cluster, files=1000): + minio = cluster.minio_client + + s3_data = [] + + for file_number in range(files): + file_name = f"data/generated_{files}/file_{file_number}.csv" + os.makedirs(os.path.join(SCRIPT_DIR, f"data/generated_{files}/"), exist_ok=True) + s3_data.append(file_name) + with open(os.path.join(SCRIPT_DIR, file_name), "w+", encoding="utf-8") as f: + # a String, b UInt64 + data = [] + + # Make all files a bit different + data.append( + ["str_" + str(file_number), file_number] + ) + + writer = csv.writer(f) + writer.writerows(data) + + for file in s3_data: + minio.fput_object( + bucket_name=cluster.minio_bucket, + object_name=file, + file_path=os.path.join(SCRIPT_DIR, file), + ) + + for obj in minio.list_objects(cluster.minio_bucket, recursive=True): + print(obj.object_name) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster = ClickHouseCluster(__file__) + # clickhouse0 not a member of cluster_XXX + for i in range(6): + cluster.add_instance( + f"clickhouse{i}", + main_configs=["configs/cluster.xml", "configs/named_collections.xml"], + user_configs=["configs/users.xml"], + macros={"replica": f"clickhouse{i}"}, + with_minio=True, + with_zookeeper=True, + stay_alive=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + create_buckets_s3(cluster) + create_buckets_s3(cluster, files=3) + + yield cluster + finally: + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_1000/"), ignore_errors=True) + shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated_3/"), ignore_errors=True) + cluster.shutdown() + + +def check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms, files=1000): + for host in list(cluster.instances.values()): + host.query("SYSTEM DROP FILESYSTEM CACHE 'raw_s3_cache'", ignore_error=True) + + settings = { + "enable_filesystem_cache": enable_filesystem_cache, + "filesystem_cache_name": "'raw_s3_cache'", + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + + query_id_first = str(uuid.uuid4()) + result_first = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_first}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_first, + ) + assert result_first == expected_result + query_id_second = str(uuid.uuid4()) + result_second = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster_second}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id_second, + ) + assert result_second == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_first}") + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster_second}") + + s3_get_first = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_first}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_first}' + """, + ) + s3_get_second = node.query( + f""" + SELECT sum(ProfileEvents['S3GetObject']) + FROM clusterAllReplicas('{cluster_second}', 
system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id_second}' + """, + ) + + return int(s3_get_first), int(s3_get_second) + + +def check_s3_gets_by_hosts(cluster, node, expected_result, + lock_object_storage_task_distribution_ms, files=1000): + settings = { + "enable_filesystem_cache": False, + } + + settings["lock_object_storage_task_distribution_ms"] = lock_object_storage_task_distribution_ms + query_id = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count(*) + FROM s3Cluster('{cluster}', 'http://minio1:9001/root/data/generated_{files}/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + SETTINGS {",".join(f"{k}={v}" for k, v in settings.items())} + """, + query_id=query_id, + ) + assert result == expected_result + + node.query(f"SYSTEM FLUSH LOGS ON CLUSTER {cluster}") + + s3_get = node.query( + f""" + SELECT ProfileEvents['S3GetObject'] + FROM clusterAllReplicas('{cluster}', system.query_log) + WHERE type='QueryFinish' + AND initial_query_id='{query_id}' + ORDER BY hostname + """, + ) + + return [int(events) for events in s3_get.strip().split("\n")] + + +def check_s3_gets_repeat(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, + lock_object_storage_task_distribution_ms): + # Repeat test several times to get average result + iterations = 1 if lock_object_storage_task_distribution_ms > 0 else 10 + s3_get_first_sum = 0 + s3_get_second_sum = 0 + for _ in range(iterations): + (s3_get_first, s3_get_second) = check_s3_gets(cluster, node, expected_result, cluster_first, cluster_second, enable_filesystem_cache, lock_object_storage_task_distribution_ms) + s3_get_first_sum += s3_get_first + s3_get_second_sum += s3_get_second + return s3_get_first_sum, s3_get_second_sum + + +@pytest.mark.parametrize("lock_object_storage_task_distribution_ms ", [0, 30000]) +def test_cache_locality(started_cluster, lock_object_storage_task_distribution_ms): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_1000/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Algorithm does not give 100% guarantee, so add 10% on dispersion + dispersion = 0.0 if lock_object_storage_task_distribution_ms > 0 else 0.1 + + # No cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 0, lock_object_storage_task_distribution_ms) + assert s3_get_second == s3_get_first + + # With cache + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # Different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_34512', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * dispersion + + # No last replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_1234', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.179 + dispersion) # actual value - 179 of 1000 files changed replica + + # No first replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_2345', 1, 
lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) # actual value - 189 of 1000 files changed replica + + # No first replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_12345', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # Add new replica, different replicas order + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_4523', 'cluster_12345', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.189 + dispersion) + + # New replica and old replica, different replicas order + # All files from removed replica changed replica + # Some files from existed replicas changed replica on the new replica + (s3_get_first, s3_get_second) = check_s3_gets_repeat(started_cluster, node, expected_result, 'cluster_1234', 'cluster_4523', 1, lock_object_storage_task_distribution_ms) + assert s3_get_second <= s3_get_first * (0.368 + dispersion) # actual value - 368 of 1000 changed replica + + if (lock_object_storage_task_distribution_ms > 0): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [189,210,220,202,179] + s3_get = check_s3_gets_by_hosts('cluster_1234', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [247,243,264,246] + s3_get = check_s3_gets_by_hosts('cluster_2345', node, expected_result, lock_object_storage_task_distribution_ms, files=1000) + assert s3_get == [251,280,248,221] + + +def test_cache_locality_few_files(started_cluster): + node = started_cluster.instances["clickhouse0"] + + expected_result = node.query( + f""" + SELECT count(*) + FROM s3('http://minio1:9001/root/data/generated_3/*', 'minio', '{minio_secret_key}', 'CSV', 'a String, b UInt64') + WHERE b=42 + """ + ) + + # Rendezvous hash makes the next distribution: + # file_0 - clickhouse1 + # file_1 - clickhouse4 + # file_2 - clickhouse3 + # The same distribution must be in each query + for _ in range(10): + s3_get = check_s3_gets_by_hosts('cluster_12345', node, expected_result, lock_object_storage_task_distribution_ms=30000, files=3) + assert s3_get == [1,0,1,1,0] diff --git a/tests/integration/test_s3_cluster/data/graceful/part0.csv b/tests/integration/test_s3_cluster/data/graceful/part0.csv new file mode 100644 index 000000000000..2a8ceabbea58 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part0.csv @@ -0,0 +1 @@ +0,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part1.csv b/tests/integration/test_s3_cluster/data/graceful/part1.csv new file mode 100644 index 000000000000..1950012fffd2 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part1.csv @@ -0,0 +1 @@ +1,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part2.csv b/tests/integration/test_s3_cluster/data/graceful/part2.csv new file mode 100644 index 000000000000..dc782d5adf9b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part2.csv @@ -0,0 +1 @@ +2,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part3.csv b/tests/integration/test_s3_cluster/data/graceful/part3.csv new file mode 100644 index 000000000000..6e581549d23c --- /dev/null +++ 
b/tests/integration/test_s3_cluster/data/graceful/part3.csv @@ -0,0 +1 @@ +3,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part4.csv b/tests/integration/test_s3_cluster/data/graceful/part4.csv new file mode 100644 index 000000000000..bb5a4d956c51 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part4.csv @@ -0,0 +1 @@ +4,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part5.csv b/tests/integration/test_s3_cluster/data/graceful/part5.csv new file mode 100644 index 000000000000..5cb2c6be144b --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part5.csv @@ -0,0 +1 @@ +5,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part6.csv b/tests/integration/test_s3_cluster/data/graceful/part6.csv new file mode 100644 index 000000000000..e2e2428d100d --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part6.csv @@ -0,0 +1 @@ +6,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part7.csv b/tests/integration/test_s3_cluster/data/graceful/part7.csv new file mode 100644 index 000000000000..3c819a315c20 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part7.csv @@ -0,0 +1 @@ +7,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part8.csv b/tests/integration/test_s3_cluster/data/graceful/part8.csv new file mode 100644 index 000000000000..72f39e512be3 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part8.csv @@ -0,0 +1 @@ +8,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/part9.csv b/tests/integration/test_s3_cluster/data/graceful/part9.csv new file mode 100644 index 000000000000..f288cb2051dd --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/part9.csv @@ -0,0 +1 @@ +9,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partA.csv b/tests/integration/test_s3_cluster/data/graceful/partA.csv new file mode 100644 index 000000000000..da99f68ba784 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partA.csv @@ -0,0 +1 @@ +10,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partB.csv b/tests/integration/test_s3_cluster/data/graceful/partB.csv new file mode 100644 index 000000000000..46591e0be815 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partB.csv @@ -0,0 +1 @@ +11,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partC.csv b/tests/integration/test_s3_cluster/data/graceful/partC.csv new file mode 100644 index 000000000000..24af8010b5c6 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partC.csv @@ -0,0 +1 @@ +12,"Foo" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partD.csv b/tests/integration/test_s3_cluster/data/graceful/partD.csv new file mode 100644 index 000000000000..0365a5024871 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partD.csv @@ -0,0 +1 @@ +13,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partE.csv b/tests/integration/test_s3_cluster/data/graceful/partE.csv new file mode 100644 index 000000000000..3143c0eed915 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partE.csv @@ -0,0 +1 @@ +14,"Foo" \ No newline 
at end of file diff --git a/tests/integration/test_s3_cluster/data/graceful/partF.csv b/tests/integration/test_s3_cluster/data/graceful/partF.csv new file mode 100644 index 000000000000..d0306b9bb806 --- /dev/null +++ b/tests/integration/test_s3_cluster/data/graceful/partF.csv @@ -0,0 +1 @@ +15,"Bar" \ No newline at end of file diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 76b8f0df2881..5cecf58f37b0 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -3,11 +3,18 @@ import os import shutil import time +import uuid from email.errors import HeaderParseError + +import time +from email.errors import HeaderParseError +import threading + import pytest from helpers.cluster import ClickHouseCluster +from helpers.client import QueryRuntimeException from helpers.config_cluster import minio_secret_key from helpers.mock_servers import start_mock_servers from helpers.test_tools import TSV @@ -21,6 +28,22 @@ "data/clickhouse/part123.csv", "data/database/part2.csv", "data/database/partition675.csv", + "data/graceful/part0.csv", + "data/graceful/part1.csv", + "data/graceful/part2.csv", + "data/graceful/part3.csv", + "data/graceful/part4.csv", + "data/graceful/part5.csv", + "data/graceful/part6.csv", + "data/graceful/part7.csv", + "data/graceful/part8.csv", + "data/graceful/part9.csv", + "data/graceful/partA.csv", + "data/graceful/partB.csv", + "data/graceful/partC.csv", + "data/graceful/partD.csv", + "data/graceful/partE.csv", + "data/graceful/partF.csv", ] @@ -76,6 +99,7 @@ def started_cluster(): macros={"replica": "node1", "shard": "shard1"}, with_minio=True, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_0_1", @@ -83,6 +107,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica2", "shard": "shard1"}, with_zookeeper=True, + stay_alive=True, ) cluster.add_instance( "s0_1_0", @@ -90,6 +115,7 @@ def started_cluster(): user_configs=["configs/users.xml"], macros={"replica": "replica1", "shard": "shard2"}, with_zookeeper=True, + stay_alive=True, ) logging.info("Starting cluster...") @@ -234,6 +260,21 @@ def test_wrong_cluster(started_cluster): assert "not found" in error + error = node.query_and_get_error( + f""" + SELECT count(*) from s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', + 'minio', '{minio_secret_key}', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + UNION ALL + SELECT count(*) from s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', + 'minio', '{minio_secret_key}', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + SETTINGS object_storage_cluster = 'non_existing_cluster' + """ + ) + + assert "not found" in error + def test_ambiguous_join(started_cluster): node = started_cluster.instances["s0_0_0"] @@ -252,6 +293,20 @@ def test_ambiguous_join(started_cluster): ) assert "AMBIGUOUS_COLUMN_NAME" not in result + result = node.query( + f""" + SELECT l.name, r.value from s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') as l + JOIN s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') as r + ON l.name = r.name + SETTINGS object_storage_cluster = 'cluster_simple' + """ + ) + assert 
"AMBIGUOUS_COLUMN_NAME" not in result + def test_skip_unavailable_shards(started_cluster): node = started_cluster.instances["s0_0_0"] @@ -267,6 +322,17 @@ def test_skip_unavailable_shards(started_cluster): assert result == "10\n" + result = node.query( + f""" + SELECT count(*) from s3( + 'http://minio1:9001/root/data/clickhouse/part1.csv', + 'minio', '{minio_secret_key}', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + SETTINGS skip_unavailable_shards = 1, object_storage_cluster = 'cluster_non_existent_port' + """ + ) + + assert result == "10\n" + def test_unset_skip_unavailable_shards(started_cluster): # Although skip_unavailable_shards is not set, cluster table functions should always skip unavailable shards. @@ -282,6 +348,17 @@ def test_unset_skip_unavailable_shards(started_cluster): assert result == "10\n" + result = node.query( + f""" + SELECT count(*) from s3( + 'http://minio1:9001/root/data/clickhouse/part1.csv', + 'minio', '{minio_secret_key}', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + SETTINGS object_storage_cluster = 'cluster_non_existent_port' + """ + ) + + assert result == "10\n" + def test_distributed_insert_select_with_replicated(started_cluster): first_replica_first_shard = started_cluster.instances["s0_0_0"] @@ -462,6 +539,18 @@ def test_cluster_format_detection(started_cluster): assert result == expected_result + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/generated/*', 'minio', '{minio_secret_key}') order by c1, c2 SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/generated/*', 'minio', '{minio_secret_key}', auto, 'a String, b UInt64') order by a, b SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + def test_cluster_default_expression(started_cluster): node = started_cluster.instances["s0_0_0"] @@ -509,3 +598,641 @@ def test_cluster_default_expression(started_cluster): ) assert result == expected_result + + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/data{{1,2,3}}', 'minio', '{minio_secret_key}', 'JSONEachRow', 'id UInt32, date Date DEFAULT 18262') order by id SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/data{{1,2,3}}', 'minio', '{minio_secret_key}', 'auto', 'id UInt32, date Date DEFAULT 18262') order by id SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/data{{1,2,3}}', 'minio', '{minio_secret_key}', 'JSONEachRow', 'id UInt32, date Date DEFAULT 18262', 'auto') order by id SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + result = node.query( + f"SELECT * FROM s3('http://minio1:9001/root/data/data{{1,2,3}}', 'minio', '{minio_secret_key}', 'auto', 'id UInt32, date Date DEFAULT 18262', 'auto') order by id SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + result = node.query( + "SELECT * FROM s3(test_s3_with_default) order by id SETTINGS object_storage_cluster = 'cluster_simple'" + ) + + assert result == expected_result + + +def test_distributed_s3_table_engine(started_cluster): + node = started_cluster.instances["s0_0_0"] + + resp_def = node.query( 
+        f"""
+        SELECT * from s3Cluster(
+            'cluster_simple',
+            'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV',
+            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)
+        """
+    )
+
+    node.query("DROP TABLE IF EXISTS single_node");
+    node.query(
+        f"""
+        CREATE TABLE single_node
+        (name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64))))
+        ENGINE=S3('http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV')
+        """
+    )
+    query_id_engine_single_node = str(uuid.uuid4())
+    resp_engine_single_node = node.query(
+        """
+        SELECT * FROM single_node ORDER BY (name, value, polygon)
+        """,
+        query_id = query_id_engine_single_node
+    )
+    assert resp_def == resp_engine_single_node
+
+    node.query("DROP TABLE IF EXISTS distributed");
+    node.query(
+        f"""
+        CREATE TABLE distributed
+        (name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64))))
+        ENGINE=S3('http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV')
+        SETTINGS object_storage_cluster='cluster_simple'
+        """
+    )
+    query_id_engine_distributed = str(uuid.uuid4())
+    resp_engine_distributed = node.query(
+        """
+        SELECT * FROM distributed ORDER BY (name, value, polygon)
+        """,
+        query_id = query_id_engine_distributed
+    )
+    assert resp_def == resp_engine_distributed
+
+    node.query("SYSTEM FLUSH LOGS ON CLUSTER 'cluster_simple'")
+
+    hosts_engine_single_node = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM clusterAllReplicas('cluster_simple', system.query_log)
+            WHERE type='QueryFinish' AND initial_query_id='{query_id_engine_single_node}'
+        """
+    )
+    assert int(hosts_engine_single_node) == 1
+    hosts_engine_distributed = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM clusterAllReplicas('cluster_simple', system.query_log)
+            WHERE type='QueryFinish' AND initial_query_id='{query_id_engine_distributed}'
+        """
+    )
+    assert int(hosts_engine_distributed) == 3
+
+
+def test_cluster_hosts_limit(started_cluster):
+    node = started_cluster.instances["s0_0_0"]
+
+    query_id_def = str(uuid.uuid4())
+    resp_def = node.query(
+        f"""
+        SELECT * from s3Cluster(
+            'cluster_simple',
+            'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV',
+            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)
+        """,
+        query_id = query_id_def
+    )
+
+    # object_storage_max_nodes is greater than number of hosts in cluster
+    query_id_4_hosts = str(uuid.uuid4())
+    resp_4_hosts = node.query(
+        f"""
+        SELECT * from s3Cluster(
+            'cluster_simple',
+            'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV',
+            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)
+        SETTINGS object_storage_max_nodes=4
+        """,
+        query_id = query_id_4_hosts
+    )
+    assert resp_def == resp_4_hosts
+
+    # object_storage_max_nodes is equal number of hosts in cluster
+    query_id_3_hosts = str(uuid.uuid4())
+    resp_3_hosts = node.query(
+        f"""
+        SELECT * from s3Cluster(
+            'cluster_simple',
+            'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV',
+            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)
+        SETTINGS object_storage_max_nodes=3
+        """,
+        query_id = query_id_3_hosts
+    )
+    assert resp_def == resp_3_hosts
+
+    # object_storage_max_nodes is less than number of hosts in cluster
+    query_id_2_hosts = str(uuid.uuid4())
+    resp_2_hosts = node.query(
+        f"""
+        SELECT * from s3Cluster(
+            'cluster_simple',
+            'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV',
+            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)
+        SETTINGS object_storage_max_nodes=2
+        """,
+        query_id = query_id_2_hosts
+    )
+    assert resp_def == resp_2_hosts
+
+    node.query("SYSTEM FLUSH LOGS ON CLUSTER 'cluster_simple'")
+
+    hosts_def = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM clusterAllReplicas('cluster_simple', system.query_log)
+            WHERE type='QueryFinish' AND initial_query_id='{query_id_def}' AND query_id!='{query_id_def}'
+        """
+    )
+    assert int(hosts_def) == 3
+
+    hosts_4 = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM clusterAllReplicas('cluster_simple', system.query_log)
+            WHERE type='QueryFinish' AND initial_query_id='{query_id_4_hosts}' AND query_id!='{query_id_4_hosts}'
+        """
+    )
+    assert int(hosts_4) == 3
+
+    hosts_3 = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM clusterAllReplicas('cluster_simple', system.query_log)
+            WHERE type='QueryFinish' AND initial_query_id='{query_id_3_hosts}' AND query_id!='{query_id_3_hosts}'
+        """
+    )
+    assert int(hosts_3) == 3
+
+    hosts_2 = node.query(
+        f"""
+            SELECT uniq(hostname)
+            FROM 
clusterAllReplicas('cluster_simple', system.query_log) + WHERE type='QueryFinish' AND initial_query_id='{query_id_2_hosts}' AND query_id!='{query_id_2_hosts}' + """ + ) + assert int(hosts_2) == 2 + + +def test_remote_hedged(started_cluster): + node = started_cluster.instances["s0_0_0"] + pure_s3 = node.query( + f""" + SELECT * from s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', + 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + ORDER BY (name, value, polygon) + LIMIT 1 + """ + ) + s3_distributed = node.query( + f""" + SELECT * from remote('s0_0_1', s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')) + ORDER BY (name, value, polygon) + LIMIT 1 + SETTINGS use_hedged_requests=True + """ + ) + + assert TSV(pure_s3) == TSV(s3_distributed) + + +def test_remote_no_hedged(started_cluster): + node = started_cluster.instances["s0_0_0"] + pure_s3 = node.query( + f""" + SELECT * from s3( + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', + 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + ORDER BY (name, value, polygon) + LIMIT 1 + """ + ) + s3_distributed = node.query( + f""" + SELECT * from remote('s0_0_1', s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')) + ORDER BY (name, value, polygon) + LIMIT 1 + SETTINGS use_hedged_requests=False + """ + ) + + assert TSV(pure_s3) == TSV(s3_distributed) + + +@pytest.mark.parametrize("allow_experimental_analyzer", [0, 1]) +def test_hive_partitioning(started_cluster, allow_experimental_analyzer): + node = started_cluster.instances["s0_0_0"] + + node.query(f"SET allow_experimental_analyzer = {allow_experimental_analyzer}") + + for i in range(1, 5): + exists = node.query( + f""" + SELECT + count() + FROM s3('http://minio1:9001/root/data/hive/key={i}/*', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + GROUP BY ALL + FORMAT TSV + """ + ) + if int(exists) == 0: + node.query( + f""" + INSERT + INTO FUNCTION s3('http://minio1:9001/root/data/hive/key={i}/data.parquet', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + SELECT {i}, {i} + SETTINGS use_hive_partitioning = 0 + """ + ) + + query_id_full = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count() + FROM s3('http://minio1:9001/root/data/hive/key=**.parquet', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + WHERE key <= 2 + FORMAT TSV + SETTINGS enable_filesystem_cache = 0, use_query_cache = 0, use_cache_for_count_from_files = 0, use_hive_partitioning = 0 + """, + query_id=query_id_full, + ) + result = int(result) + assert result == 2 + + query_id_optimized = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count() + FROM s3('http://minio1:9001/root/data/hive/key=**.parquet', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + WHERE key <= 2 + FORMAT TSV + SETTINGS enable_filesystem_cache = 0, use_query_cache = 0, use_cache_for_count_from_files = 0, use_hive_partitioning = 1 + """, + query_id=query_id_optimized, + ) + result = int(result) + assert result == 2 + + query_id_cluster_full = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count() 
+ FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/hive/key=**.parquet', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + WHERE key <= 2 + FORMAT TSV + SETTINGS enable_filesystem_cache = 0, use_query_cache = 0, use_cache_for_count_from_files = 0, use_hive_partitioning = 0 + """, + query_id=query_id_cluster_full, + ) + result = int(result) + assert result == 2 + + query_id_cluster_optimized = str(uuid.uuid4()) + result = node.query( + f""" + SELECT count() + FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/hive/key=**.parquet', 'minio', '{minio_secret_key}', 'Parquet', 'key Int32, value Int32') + WHERE key <= 2 + FORMAT TSV + SETTINGS enable_filesystem_cache = 0, use_query_cache = 0, use_cache_for_count_from_files = 0, use_hive_partitioning = 1 + """, + query_id=query_id_cluster_optimized, + ) + result = int(result) + assert result == 2 + + node.query("SYSTEM FLUSH LOGS ON CLUSTER 'cluster_simple'") + + full_traffic = node.query( + f""" + SELECT sum(ProfileEvents['ReadBufferFromS3Bytes']) + FROM clusterAllReplicas(cluster_simple, system.query_log) + WHERE type='QueryFinish' AND initial_query_id='{query_id_full}' + FORMAT TSV + """ + ) + full_traffic = int(full_traffic) + assert full_traffic > 0 # 612*4 + + optimized_traffic = node.query( + f""" + SELECT sum(ProfileEvents['ReadBufferFromS3Bytes']) + FROM clusterAllReplicas(cluster_simple, system.query_log) + WHERE type='QueryFinish' AND initial_query_id='{query_id_optimized}' + FORMAT TSV + """ + ) + optimized_traffic = int(optimized_traffic) + assert optimized_traffic > 0 # 612*2 + assert full_traffic > optimized_traffic + + cluster_full_traffic = node.query( + f""" + SELECT sum(ProfileEvents['ReadBufferFromS3Bytes']) + FROM clusterAllReplicas(cluster_simple, system.query_log) + WHERE type='QueryFinish' AND initial_query_id='{query_id_cluster_full}' + FORMAT TSV + """ + ) + cluster_full_traffic = int(cluster_full_traffic) + assert cluster_full_traffic == full_traffic + + cluster_optimized_traffic = node.query( + f""" + SELECT sum(ProfileEvents['ReadBufferFromS3Bytes']) + FROM clusterAllReplicas(cluster_simple, system.query_log) + WHERE type='QueryFinish' AND initial_query_id='{query_id_cluster_optimized}' + FORMAT TSV + """ + ) + cluster_optimized_traffic = int(cluster_optimized_traffic) + assert cluster_optimized_traffic == optimized_traffic + + node.query("SET allow_experimental_analyzer = DEFAULT") + + +def test_joins(started_cluster): + node = started_cluster.instances["s0_0_0"] + + # Table join_table only exists on the node 's0_0_0'. 
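+    # The queries below set object_storage_cluster_join_mode='local'; the expectation
+    # exercised by this test is that the JOIN itself is resolved on the initiating node,
+    # where join_table exists, while the s3Cluster read remains distributed.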
+ node.query( + """ + CREATE TABLE IF NOT EXISTS join_table ( + id UInt32, + name String + ) ENGINE=MergeTree() + ORDER BY id; + """ + ) + + node.query( + f""" + INSERT INTO join_table + SELECT value, concat(name, '_jt') FROM s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))'); + """ + ) + + result1 = node.query( + f""" + SELECT t1.name, t2.name FROM + s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') AS t1 + JOIN + join_table AS t2 + ON t1.value = t2.id + ORDER BY t1.name + SETTINGS object_storage_cluster_join_mode='local'; + """ + ) + + res = list(map(str.split, result1.splitlines())) + assert len(res) == 25 + + for line in res: + if len(line) == 2: + assert line[1] == f"{line[0]}_jt" + else: + assert line == ["_jt"] # for empty name + + result2 = node.query( + f""" + SELECT t1.name, t2.name FROM + join_table AS t2 + JOIN + s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') AS t1 + ON t1.value = t2.id + ORDER BY t1.name + SETTINGS object_storage_cluster_join_mode='local'; + """ + ) + + assert result1 == result2 + + # With WHERE clause with remote column only + result3 = node.query( + f""" + SELECT t1.name, t2.name FROM + s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') AS t1 + JOIN + join_table AS t2 + ON t1.value = t2.id + WHERE (t1.value % 2) + ORDER BY t1.name + SETTINGS object_storage_cluster_join_mode='local'; + """ + ) + + res = list(map(str.split, result3.splitlines())) + assert len(res) == 8 + + # With WHERE clause with local column only + result4 = node.query( + f""" + SELECT t1.name, t2.name FROM + s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') AS t1 + JOIN + join_table AS t2 + ON t1.value = t2.id + WHERE (t2.id % 2) + ORDER BY t1.name + SETTINGS object_storage_cluster_join_mode='local'; + """ + ) + + assert result3 == result4 + + # With WHERE clause with local and remote columns + result5 = node.query( + f""" + SELECT t1.name, t2.name FROM + s3Cluster('cluster_simple', + 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') AS t1 + JOIN + join_table AS t2 + ON t1.value = t2.id + WHERE (t1.value % 2) AND ((t2.id % 3) == 2) + ORDER BY t1.name + SETTINGS object_storage_cluster_join_mode='local'; + """ + ) + + res = list(map(str.split, result5.splitlines())) + assert len(res) == 6 + + #result6 = node.query( + # f""" + # SELECT name FROM + # s3Cluster('cluster_simple', + # 'http://minio1:9001/root/data/{{clickhouse,database}}/*', 'minio', '{minio_secret_key}', 'CSV', + # 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + # WHERE value IN (SELECT id FROM join_table) + # ORDER BY name + # SETTINGS object_storage_cluster_join_mode='local'; + # """ + #) + #res = list(map(str.split, 
result6.splitlines())) + #assert len(res) == 25 + + +def test_graceful_shutdown(started_cluster): + node = started_cluster.instances["s0_0_0"] + node_to_shutdown = started_cluster.instances["s0_1_0"] + + expected = TSV("64\tBar\t8\n56\tFoo\t8\n") + + num_lock = threading.Lock() + errors = 0 + + def query_cycle(): + nonlocal errors + try: + i = 0 + while i < 10: + i += 1 + # Query time 3-4 seconds + # Processing single object 1-2 seconds + result = node.query(f""" + SELECT sum(value),name,sum(sleep(1)+1) as sleep FROM s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/graceful/*', 'minio', '{minio_secret_key}', 'CSV', + 'value UInt32, name String') + GROUP BY name + ORDER BY name + SETTINGS max_threads=2 + """) + with num_lock: + if TSV(result) != expected: + errors += 1 + if errors >= 1: + break + except QueryRuntimeException: + with num_lock: + errors += 1 + + threads = [] + + for _ in range(10): + thread = threading.Thread(target=query_cycle) + thread.start() + threads.append(thread) + time.sleep(0.2) + + time.sleep(3) + + node_to_shutdown.query("SYSTEM STOP SWARM MODE") + + # enough time to complete processing of objects, started before "SYSTEM STOP SWARM MODE" + time.sleep(3) + + node_to_shutdown.stop_clickhouse(kill=True) + + for thread in threads: + thread.join() + + node_to_shutdown.start_clickhouse() + + assert errors == 0 diff --git a/tests/integration/test_storage_delta/configs/users.d/disable_parquet_metadata_caching.xml b/tests/integration/test_storage_delta/configs/users.d/disable_parquet_metadata_caching.xml new file mode 100644 index 000000000000..bc34464e30da --- /dev/null +++ b/tests/integration/test_storage_delta/configs/users.d/disable_parquet_metadata_caching.xml @@ -0,0 +1,7 @@ + + + + 0 + + + diff --git a/tests/integration/test_storage_delta/test.py b/tests/integration/test_storage_delta/test.py index 6241ed6ecd8a..3b40e1eb4870 100644 --- a/tests/integration/test_storage_delta/test.py +++ b/tests/integration/test_storage_delta/test.py @@ -101,6 +101,7 @@ def started_cluster(): user_configs=[ "configs/users.d/users.xml", "configs/users.d/enable_writes.xml", + "configs/users.d/disable_parquet_metadata_caching.xml", ], with_minio=True, with_azurite=True, @@ -117,6 +118,7 @@ def started_cluster(): user_configs=[ "configs/users.d/users.xml", "configs/users.d/enable_writes.xml", + "configs/users.d/disable_parquet_metadata_caching.xml", ], with_minio=True, stay_alive=True, @@ -163,6 +165,7 @@ def started_cluster(): user_configs=[ "configs/users.d/users.xml", "configs/users.d/disabled_delta_kernel.xml", + "configs/users.d/disable_parquet_metadata_caching.xml", ], with_minio=True, with_azurite=True, @@ -1374,7 +1377,7 @@ def test_session_token(started_cluster): parquet_data_path = create_initial_data_file( started_cluster, instance, - "SELECT toUInt64(number), toString(number) FROM numbers(100)", + "SELECT toUInt64(number), toString(number) FROM numbers(100) SETTINGS input_format_parquet_use_metadata_cache=0", TABLE_NAME, node_name=node_name, ) @@ -1387,7 +1390,7 @@ def test_session_token(started_cluster): f""" SELECT count() FROM deltaLake( 'http://{started_cluster.minio_host}:{started_cluster.minio_port}/{started_cluster.minio_bucket}/{TABLE_NAME}/', - SETTINGS allow_experimental_delta_kernel_rs=1) + SETTINGS allow_experimental_delta_kernel_rs=1, input_format_parquet_use_metadata_cache=0) """ ) ) diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index 11ddcdfe4649..a5f08571c862 100644 --- 
a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ -1,21 +1,19 @@ -import glob -import json import logging +import json import os import subprocess import time import uuid +import time from datetime import datetime, timezone import pyspark import pytest -from azure.storage.blob import BlobServiceClient from minio.deleteobjects import DeleteObject from pyspark.sql.functions import ( monotonically_increasing_id, row_number, ) -from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2 from pyspark.sql.types import ( ArrayType, BooleanType, @@ -40,6 +38,7 @@ from helpers.iceberg_utils import ( default_upload_directory, + additional_upload_directory, default_download_directory, execute_spark_query_general, get_creation_expression, @@ -343,9 +342,76 @@ def test_types(started_cluster, format_version, storage_type): ] ) + # Test storage type as function argument + table_function_expr = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster, + table_function=True, + storage_type_as_arg=True, + ) + assert ( + instance.query(f"SELECT a, b, c, d, e FROM {table_function_expr}").strip() + == "123\tstring\t2000-01-01\t['str1','str2']\ttrue" + ) + + assert instance.query(f"DESCRIBE {table_function_expr} FORMAT TSV") == TSV( + [ + ["a", "Nullable(Int32)"], + ["b", "Nullable(String)"], + ["c", "Nullable(Date)"], + ["d", "Array(Nullable(String))"], + ["e", "Nullable(Bool)"], + ] + ) + + # Test storage type as field in named collection + table_function_expr = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster, + table_function=True, + storage_type_in_named_collection=True, + ) + assert ( + instance.query(f"SELECT a, b, c, d, e FROM {table_function_expr}").strip() + == "123\tstring\t2000-01-01\t['str1','str2']\ttrue" + ) + + assert instance.query(f"DESCRIBE {table_function_expr} FORMAT TSV") == TSV( + [ + ["a", "Nullable(Int32)"], + ["b", "Nullable(String)"], + ["c", "Nullable(Date)"], + ["d", "Array(Nullable(String))"], + ["e", "Nullable(Bool)"], + ] + ) + + +def count_secondary_subqueries(started_cluster, query_id, expected, comment): + for node_name, replica in started_cluster.instances.items(): + cluster_secondary_queries = ( + replica.query( + f""" + SELECT count(*) FROM system.query_log + WHERE + type = 'QueryFinish' + AND NOT is_initial_query + AND initial_query_id='{query_id}' + """ + ) + .strip() + ) + + logging.info( + f"[{node_name}] cluster_secondary_queries {comment}: {cluster_secondary_queries}" + ) + assert int(cluster_secondary_queries) == expected + @pytest.mark.parametrize("format_version", ["1", "2"]) -@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +@pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_cluster_table_function(started_cluster, format_version, storage_type): instance = started_cluster.instances["node1"] @@ -378,6 +444,19 @@ def add_df(mode): logging.info(f"Adding another dataframe. 
result files: {files}") + if storage_type == "local": + # For local storage we need to upload data from each node + for node_name, replica in started_cluster.instances.items(): + if node_name == "node1": + continue + additional_upload_directory( + started_cluster, + node_name, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + return files files = add_df(mode="overwrite") @@ -398,53 +477,176 @@ def add_df(mode): instance.query(f"SELECT * FROM {table_function_expr}").strip().split() ) + def make_query_from_function( + run_on_cluster=False, + alt_syntax=False, + remote=False, + storage_type_as_arg=False, + storage_type_in_named_collection=False, + ): + expr = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster, + table_function=True, + run_on_cluster=run_on_cluster, + storage_type_as_arg=storage_type_as_arg, + storage_type_in_named_collection=storage_type_in_named_collection, + ) + query_id = str(uuid.uuid4()) + settings = f"SETTINGS object_storage_cluster='cluster_simple'" if (alt_syntax and not run_on_cluster) else "" + if remote: + query = f"SELECT * FROM remote('node2', {expr}) {settings}" + else: + query = f"SELECT * FROM {expr} {settings}" + responce = instance.query(query, query_id=query_id).strip().split() + return responce, query_id + # Cluster Query with node1 as coordinator - table_function_expr_cluster = get_creation_expression( - storage_type, - TABLE_NAME, - started_cluster, - table_function=True, + select_cluster, query_id_cluster = make_query_from_function(run_on_cluster=True) + + # Cluster Query with node1 as coordinator with alternative syntax + select_cluster_alt_syntax, query_id_cluster_alt_syntax = make_query_from_function( + run_on_cluster=True, + alt_syntax=True) + + # Cluster Query with node1 as coordinator and storage type as arg + select_cluster_with_type_arg, query_id_cluster_with_type_arg = make_query_from_function( + run_on_cluster=True, + storage_type_as_arg=True, + ) + + # Cluster Query with node1 as coordinator and storage type in named collection + select_cluster_with_type_in_nc, query_id_cluster_with_type_in_nc = make_query_from_function( run_on_cluster=True, + storage_type_in_named_collection=True, ) - select_cluster = ( - instance.query(f"SELECT * FROM {table_function_expr_cluster}").strip().split() + + # Cluster Query with node1 as coordinator and storage type as arg, alternative syntax + select_cluster_with_type_arg_alt_syntax, query_id_cluster_with_type_arg_alt_syntax = make_query_from_function( + storage_type_as_arg=True, + alt_syntax=True, + ) + + # Cluster Query with node1 as coordinator and storage type in named collection, alternative syntax + select_cluster_with_type_in_nc_alt_syntax, query_id_cluster_with_type_in_nc_alt_syntax = make_query_from_function( + storage_type_in_named_collection=True, + alt_syntax=True, ) + #select_remote_cluster, _ = make_query_from_function(run_on_cluster=True, remote=True) + + def make_query_from_table(alt_syntax=False): + query_id = str(uuid.uuid4()) + settings = "SETTINGS object_storage_cluster='cluster_simple'" if alt_syntax else "" + responce = ( + instance.query( + f"SELECT * FROM {TABLE_NAME} {settings}", + query_id=query_id, + ) + .strip() + .split() + ) + return responce, query_id + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, object_storage_cluster='cluster_simple') + select_cluster_table_engine, query_id_cluster_table_engine = make_query_from_table() + + #select_remote_cluster = ( + # 
instance.query(f"SELECT * FROM remote('node2',{table_function_expr_cluster})") + # .strip() + # .split() + #) + + instance.query(f"DROP TABLE IF EXISTS `{TABLE_NAME}` SYNC") + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster) + select_pure_table_engine, query_id_pure_table_engine = make_query_from_table() + select_pure_table_engine_cluster, query_id_pure_table_engine_cluster = make_query_from_table(alt_syntax=True) + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, storage_type_as_arg=True) + select_pure_table_engine_with_type_arg, query_id_pure_table_engine_with_type_arg = make_query_from_table() + select_pure_table_engine_cluster_with_type_arg, query_id_pure_table_engine_cluster_with_type_arg = make_query_from_table(alt_syntax=True) + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, storage_type_in_named_collection=True) + select_pure_table_engine_with_type_in_nc, query_id_pure_table_engine_with_type_in_nc = make_query_from_table() + select_pure_table_engine_cluster_with_type_in_nc, query_id_pure_table_engine_cluster_with_type_in_nc = make_query_from_table(alt_syntax=True) + # Simple size check assert len(select_regular) == 600 assert len(select_cluster) == 600 + assert len(select_cluster_alt_syntax) == 600 + assert len(select_cluster_table_engine) == 600 + #assert len(select_remote_cluster) == 600 + assert len(select_cluster_with_type_arg) == 600 + assert len(select_cluster_with_type_in_nc) == 600 + assert len(select_cluster_with_type_arg_alt_syntax) == 600 + assert len(select_cluster_with_type_in_nc_alt_syntax) == 600 + assert len(select_pure_table_engine) == 600 + assert len(select_pure_table_engine_cluster) == 600 + assert len(select_pure_table_engine_with_type_arg) == 600 + assert len(select_pure_table_engine_cluster_with_type_arg) == 600 + assert len(select_pure_table_engine_with_type_in_nc) == 600 + assert len(select_pure_table_engine_cluster_with_type_in_nc) == 600 # Actual check assert select_cluster == select_regular + assert select_cluster_alt_syntax == select_regular + assert select_cluster_table_engine == select_regular + #assert select_remote_cluster == select_regular + assert select_cluster_with_type_arg == select_regular + assert select_cluster_with_type_in_nc == select_regular + assert select_cluster_with_type_arg_alt_syntax == select_regular + assert select_cluster_with_type_in_nc_alt_syntax == select_regular + assert select_pure_table_engine == select_regular + assert select_pure_table_engine_cluster == select_regular + assert select_pure_table_engine_with_type_arg == select_regular + assert select_pure_table_engine_cluster_with_type_arg == select_regular + assert select_pure_table_engine_with_type_in_nc == select_regular + assert select_pure_table_engine_cluster_with_type_in_nc == select_regular # Check query_log for replica in started_cluster.instances.values(): replica.query("SYSTEM FLUSH LOGS") - for node_name, replica in started_cluster.instances.items(): - cluster_secondary_queries = ( - replica.query( - f""" - SELECT query, type, is_initial_query, read_rows, read_bytes FROM system.query_log - WHERE - type = 'QueryStart' AND - positionCaseInsensitive(query, '{storage_type}Cluster') != 0 AND - position(query, '{TABLE_NAME}') != 0 AND - position(query, 'system.query_log') = 0 AND - NOT is_initial_query - """ - ) - .strip() - .split("\n") - ) + count_secondary_subqueries(started_cluster, query_id_cluster, 1, "table function") + count_secondary_subqueries(started_cluster, 
query_id_cluster_alt_syntax, 1, "table function alt syntax") + count_secondary_subqueries(started_cluster, query_id_cluster_table_engine, 1, "cluster table engine") + count_secondary_subqueries(started_cluster, query_id_cluster_with_type_arg, 1, "table function with storage type in args") + count_secondary_subqueries(started_cluster, query_id_cluster_with_type_in_nc, 1, "table function with storage type in named collection") + count_secondary_subqueries(started_cluster, query_id_cluster_with_type_arg_alt_syntax, 1, "table function with storage type in args alt syntax") + count_secondary_subqueries(started_cluster, query_id_cluster_with_type_in_nc_alt_syntax, 1, "table function with storage type in named collection alt syntax") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine, 0, "table engine") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine_cluster, 1, "table engine with cluster setting") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine_with_type_arg, 0, "table engine with storage type in args") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine_cluster_with_type_arg, 1, "table engine with cluster setting with storage type in args") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine_with_type_in_nc, 0, "table engine with storage type in named collection") + count_secondary_subqueries(started_cluster, query_id_pure_table_engine_cluster_with_type_in_nc, 1, "table engine with cluster setting with storage type in named collection") - logging.info( - f"[{node_name}] cluster_secondary_queries: {cluster_secondary_queries}" - ) - assert len(cluster_secondary_queries) == 1 - # write 3 times - assert int(instance.query(f"SELECT count() FROM {table_function_expr_cluster}")) == 100 * 3 + # Cluster Query with node1 as coordinator + table_function_expr_cluster = get_creation_expression( + storage_type, + TABLE_NAME, + started_cluster, + table_function=True, + run_on_cluster=True, + ) + select_remote_cluster = ( + instance.query(f"SELECT * FROM remote('node2',{table_function_expr_cluster})") + .strip() + .split() + ) + assert len(select_remote_cluster) == 600 + assert select_remote_cluster == select_regular + + select_remote_cluster = ( + instance.query(f"SELECT * FROM remote('node2',{table_function_expr_cluster})") + .strip() + .split() + ) + assert len(select_remote_cluster) == 600 + assert select_remote_cluster == select_regular @pytest.mark.parametrize("format_version", ["1", "2"]) @@ -533,6 +735,12 @@ def test_delete_files(started_cluster, format_version, storage_type): assert instance.query(f"SELECT ProfileEvents['IcebergTrivialCountOptimizationApplied'] FROM system.query_log where query_id = '{query_id}' and type = 'QueryFinish'") == "1\n" +def get_array(query_result: str): + arr = sorted([int(x) for x in query_result.strip().split("\n")]) + print(arr) + return arr + + @pytest.mark.parametrize("use_roaring_bitmaps", [0, 1]) @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_position_deletes(started_cluster, use_roaring_bitmaps, storage_type): @@ -549,11 +757,6 @@ def test_position_deletes(started_cluster, use_roaring_bitmaps, storage_type): ) spark.sql(f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10, 100)") - def get_array(query_result: str): - arr = sorted([int(x) for x in query_result.strip().split("\n")]) - print(arr) - return arr - default_upload_directory( started_cluster, storage_type, @@ -634,6 +837,49 @@ def 
get_array(query_result: str): instance.query(f"DROP TABLE {TABLE_NAME}") +@pytest.mark.parametrize("use_roaring_bitmaps", [0, 1]) +def test_position_deletes_out_of_order(started_cluster, use_roaring_bitmaps): + storage_type = "local" + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = "test_position_deletes_out_of_order_" + get_uuid_str() + instance.query(f"SET use_roaring_bitmap_iceberg_positional_deletes={use_roaring_bitmaps};") + instance.query(f"SET input_format_parquet_use_native_reader_v3=1;") + + # There are a few flaky hacks chained together here. + # We want the parquet reader to produce chunks corresponding to row groups out of order if + # `format_settings.parquet.preserve_order` wasn't enabled. For that we: + # * Use `PREWHERE NOT sleepEachRow(...)` to make the reader take longer for bigger row groups. + # * Set spark row group size limit to 1 byte. Rely on current spark implementation detail: + # it'll check this limit every 100 rows. So effectively we've set row group size to 100 rows. + # * Insert 105 rows. So the first row group will have 100 rows, the second 5 rows. + # If one of these steps breaks in future, this test will be less effective but won't fail. + + spark.sql( + f""" + CREATE TABLE {TABLE_NAME} (id bigint, data string) USING iceberg TBLPROPERTIES ('format-version' = '2', 'write.update.mode'='merge-on-read', 'write.delete.mode'='merge-on-read', 'write.merge.mode'='merge-on-read', 'write.parquet.row-group-size-bytes'='1') + """ + ) + spark.sql(f"INSERT INTO {TABLE_NAME} select /*+ COALESCE(1) */ id, char(id + ascii('a')) from range(0, 105)") + # (Fun fact: if you replace these two queries with one query with `WHERE id < 10 OR id = 103`, + # spark either quetly fails to delete row 103 or outright crashes with segfault in jre.) + spark.sql(f"DELETE FROM {TABLE_NAME} WHERE id < 10") + spark.sql(f"DELETE FROM {TABLE_NAME} WHERE id = 103") + + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{TABLE_NAME}/", + f"/iceberg_data/default/{TABLE_NAME}/", + ) + + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, additional_settings=["input_format_parquet_use_native_reader_v3=1", f"use_roaring_bitmap_iceberg_positional_deletes={use_roaring_bitmaps}"]) + + # TODO: Replace WHERE with PREWHERE when we add prewhere support for datalakes. 
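+    # Rows 0-9 and row 103 were deleted above, so the surviving ids are 10..102 plus 104.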
+ assert get_array(instance.query(f"SELECT id FROM {TABLE_NAME} WHERE NOT sleepEachRow(1/100) order by id")) == list(range(10, 103)) + [104] + + instance.query(f"DROP TABLE {TABLE_NAME}") + @pytest.mark.parametrize("format_version", ["1", "2"]) @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_schema_inference(started_cluster, format_version, storage_type): @@ -854,7 +1100,7 @@ def test_metadata_file_selection_from_version_hint(started_cluster, format_versi spark.sql( f"INSERT INTO {TABLE_NAME} select id, char(id + ascii('a')) from range(10)" ) - + # test the case where version_hint.text file contains just the version number with open(f"/iceberg_data/default/{TABLE_NAME}/metadata/version-hint.text", "w") as f: f.write('5') @@ -1026,7 +1272,7 @@ def test_filesystem_cache(started_cluster, storage_type): @pytest.mark.parametrize( "storage_type, run_on_cluster", - [("s3", False), ("s3", True), ("azure", False), ("local", False)], + [("s3", False), ("s3", True), ("azure", False), ("azure", True), ("local", False), ("local", True)], ) def test_partition_pruning(started_cluster, storage_type, run_on_cluster): instance = started_cluster.instances["node1"] @@ -1040,6 +1286,7 @@ def execute_spark_query(query: str): storage_type, TABLE_NAME, query, + additional_nodes=["node2", "node3"] if storage_type=="local" else [], ) execute_spark_query( @@ -1816,6 +2063,7 @@ def check_validity_and_get_prunned_files(select_expression): == 1 ) + @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) def test_explicit_metadata_file(started_cluster, storage_type): instance = started_cluster.instances["node1"] @@ -1860,8 +2108,10 @@ def test_explicit_metadata_file(started_cluster, storage_type): with pytest.raises(Exception): create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, explicit_metadata_path="../metadata/v11.metadata.json") + @pytest.mark.parametrize("storage_type", ["s3", "azure", "local"]) -def test_minmax_pruning_with_null(started_cluster, storage_type): +@pytest.mark.parametrize("run_on_cluster", [False, True]) +def test_minmax_pruning_with_null(started_cluster, storage_type, run_on_cluster): instance = started_cluster.instances["node1"] spark = started_cluster.spark_session TABLE_NAME = "test_minmax_pruning_with_null" + storage_type + "_" + get_uuid_str() @@ -1873,6 +2123,7 @@ def execute_spark_query(query: str): storage_type, TABLE_NAME, query, + additional_nodes=["node2", "node3"] if storage_type=="local" else [], ) execute_spark_query( @@ -1931,7 +2182,7 @@ def execute_spark_query(query: str): ) creation_expression = get_creation_expression( - storage_type, TABLE_NAME, started_cluster, table_function=True + storage_type, TABLE_NAME, started_cluster, table_function=True, run_on_cluster=run_on_cluster ) def check_validity_and_get_prunned_files(select_expression): @@ -2452,7 +2703,7 @@ def test_writes_create_table(started_cluster, format_version, storage_type): with pytest.raises(Exception): create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, "(x String)", format_version) - create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, "(x String)", format_version, "", True) + create_iceberg_table(storage_type, instance, TABLE_NAME, started_cluster, "(x String)", format_version, "", True) assert '`x` String' in instance.query(f"SHOW CREATE TABLE {TABLE_NAME}") @@ -2517,7 +2768,7 @@ def test_relevant_iceberg_schema_chosen(started_cluster, storage_type): instance = started_cluster.instances["node1"] spark = 
started_cluster.spark_session TABLE_NAME = "test_relevant_iceberg_schema_chosen_" + storage_type + "_" + get_uuid_str() - + spark.sql( f""" CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( @@ -2992,7 +3243,7 @@ def __init__(self, not_pruned, partition_pruned, min_max_index_pruned): def __repr__(self): return "PrunedInfo(not_pruned={}, partition_pruned={}, min_max_index_pruned={})".format(self.not_pruned, self.partition_pruned, self.min_max_index_pruned) - + def __eq__(self, other): return (self.not_pruned == other.not_pruned and self.partition_pruned == other.partition_pruned and @@ -3088,7 +3339,7 @@ def verify_result_dictionary(diction : dict, allowed_content_types : set): number_of_missing_row_values += 1 if partitioned_rows != not_deleted_files: raise ValueError("Partitioned rows are not consistent with not deleted files for file path: {}, partitioned rows: {}, not deleted files: {}".format(file_path, partitioned_rows, not_deleted_files)) - + # We have exactly one metadata file if number_of_missing_row_values != 1: raise ValueError("Not a one row value (corresponding to metadata file) is missing for file path: {}".format(file_path)) @@ -3201,6 +3452,404 @@ def execute_spark_query(query: str): raise +@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +@pytest.mark.parametrize("run_on_cluster", [False, True]) +def test_read_constant_columns_optimization(started_cluster, storage_type, run_on_cluster): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = "test_read_constant_columns_optimization_" + storage_type + "_" + get_uuid_str() + + def execute_spark_query(query: str): + return execute_spark_query_general( + spark, + started_cluster, + storage_type, + TABLE_NAME, + query, + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + tag INT, + date DATE, + date2 DATE, + name VARCHAR(50), + number BIGINT + ) + USING iceberg + PARTITIONED BY (identity(tag), years(date)) + OPTIONS('format-version'='2') + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasya', 5), + (1, DATE '2024-01-20', DATE '2024-01-20', 'vasilisa', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (1, DATE '2025-01-20', DATE '2025-01-20', 'vasya', 5), + (2, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 5), + (2, DATE '2025-01-21', DATE '2025-01-20', 'vasilisa', 5) + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} ALTER COLUMN number FIRST; + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 3, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa'), + (5, 3, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa') + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} RENAME COLUMN name TO name_old; + """ + ) + + execute_spark_query( + f""" + ALTER TABLE {TABLE_NAME} + ADD COLUMNS ( + name string + ); + """ + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (5, 4, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 4, DATE '2025-01-20', DATE '2025-01-20', 'vasilisa', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 5, DATE '2025-01-20', DATE '2024-01-20', 'vasilisa', 'icebreaker'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg'), + (5, 6, DATE '2025-01-20', DATE '2024-01-20', 'vasya', 'iceberg') + """ + ) + + # Totally must be 7 files + # Partitioned column 'tag' is constant in each file + # Column 'date' is constant in 6 files, has 
different values in (2-2025) + # Column 'date2' is constant in 4 files (1-2024, 2-2025, 5-2025, 6-2025) + # Column 'name_old' is constant in 3 files (1-2025, 2-2025 as 'name', 6-2025 as 'name_old') + # Column 'number' is globally constant + # New column 'name2' is present only in 3 files (4-2025, 5-2025, 6-2025), constant in two (4-2025, 6-2025) + # Files 1-2025 and 6-2025 have only constant columns + + creation_expression = get_creation_expression( + storage_type, TABLE_NAME, started_cluster, table_function=True, run_on_cluster=run_on_cluster + ) + + # Warm up metadata cache + for replica in started_cluster.instances.values(): + replica.query(f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0") + + all_data_expected_query_id = str(uuid.uuid4()) + all_data_expected = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=all_data_expected_query_id, + ) + const_only_expected_query_id = str(uuid.uuid4()) + const_only_expected = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_only_expected_query_id, + ) + const_partial_expected_query_id = str(uuid.uuid4()) + const_partial_expected = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial_expected_query_id, + ) + const_partial2_expected_query_id = str(uuid.uuid4()) + const_partial2_expected = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=const_partial2_expected_query_id, + ) + count_expected_query_id = str(uuid.uuid4()) + count_expected = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=0", + query_id=count_expected_query_id, + ) + + all_data_query_id = str(uuid.uuid4()) + all_data_optimized = instance.query( + f"SELECT * FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=all_data_query_id, + ) + const_only_query_id = str(uuid.uuid4()) + const_only_optimized = instance.query( + f"SELECT tag, number FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_only_query_id, + ) + const_partial_query_id = str(uuid.uuid4()) + const_partial_optimized = instance.query( + f"SELECT tag, date2, number, name_old FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial_query_id, + ) + const_partial2_query_id = str(uuid.uuid4()) + const_partial2_optimized = instance.query( + f"SELECT tag, date2, number, name FROM {creation_expression} ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=const_partial2_query_id, + ) + count_query_id = str(uuid.uuid4()) + count_optimized = instance.query( + f"SELECT count(),tag FROM {creation_expression} GROUP BY ALL ORDER BY ALL SETTINGS allow_experimental_iceberg_read_optimization=1", + query_id=count_query_id, + ) + + assert all_data_expected == all_data_optimized + assert const_only_expected == const_only_optimized + assert const_partial_expected == const_partial_optimized + assert const_partial2_expected == const_partial2_optimized + assert 
count_expected == count_optimized + + for replica in started_cluster.instances.values(): + replica.query("SYSTEM FLUSH LOGS") + + def check_events(query_id, event, expected): + res = instance.query( + f""" + SELECT + sum(tupleElement(arrayJoin(ProfileEvents),2)) as value + FROM + clusterAllReplicas('cluster_simple', system.query_log) + WHERE + type='QueryFinish' + AND tupleElement(arrayJoin(ProfileEvents),1)='{event}' + AND initial_query_id='{query_id}' + GROUP BY ALL + FORMAT CSV + """) + assert int(res) == expected + + event = "S3GetObject" if storage_type == "s3" else "AzureGetObject" + + # Without optimization clickhouse reads all 7 files + check_events(all_data_expected_query_id, event, 7) + check_events(const_only_expected_query_id, event, 7) + check_events(const_partial_expected_query_id, event, 7) + check_events(const_partial2_expected_query_id, event, 7) + check_events(count_expected_query_id, event, 7) + + # If file has only constant columns it is not read + check_events(all_data_query_id, event, 5) # 1-2025, 6-2025 must not be read + check_events(const_only_query_id, event, 0) # All must not be read + check_events(const_partial_query_id, event, 4) # 1-2025, 6-2025 and 2-2025 must not be read + check_events(const_partial2_query_id, event, 3) # 6-2025 must not be read, 1-2024, 1-2025, 2-2025 don't have new column 'name' + check_events(count_query_id, event, 0) # All must not be read + + def compare_selects(query): + result_expected = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=0") + result_optimized = instance.query(f"{query} SETTINGS allow_experimental_iceberg_read_optimization=1") + assert result_expected == result_optimized + + compare_selects(f"SELECT _path,* FROM {creation_expression} ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE name_old='vasily' ORDER BY ALL") + compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE ((tag + length(name_old)) % 2 = 1) ORDER BY ALL") + + +@pytest.mark.parametrize("storage_type", ["s3", "azure"]) +def test_cluster_joins(started_cluster, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + TABLE_NAME = "test_cluster_joins_" + storage_type + "_" + get_uuid_str() + TABLE_NAME_2 = "test_cluster_joins_2_" + storage_type + "_" + get_uuid_str() + TABLE_NAME_LOCAL = "test_cluster_joins_local_" + storage_type + "_" + get_uuid_str() + + def execute_spark_query(query: str, table_name): + return execute_spark_query_general( + spark, + started_cluster, + storage_type, + table_name, + query, + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME} ( + tag INT, + name VARCHAR(50) + ) + USING iceberg + OPTIONS('format-version'='2') + """, TABLE_NAME + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME} VALUES + (1, 'john'), + (2, 'jack') + """, TABLE_NAME + ) + + execute_spark_query( + f""" + CREATE TABLE {TABLE_NAME_2} ( + id INT, + second_name VARCHAR(50) + ) + USING iceberg + OPTIONS('format-version'='2') + """, TABLE_NAME_2 + ) + + execute_spark_query( + f""" + INSERT INTO {TABLE_NAME_2} VALUES + (1, 'dow'), + (2, 'sparrow') + """, TABLE_NAME_2 + ) + + creation_expression = get_creation_expression( + storage_type, TABLE_NAME, started_cluster, table_function=True, run_on_cluster=True + ) + + creation_expression_2 = get_creation_expression( + storage_type, TABLE_NAME_2, started_cluster, table_function=True, run_on_cluster=True + ) + + instance.query(f"CREATE TABLE `{TABLE_NAME_LOCAL}` (id Int64, second_name 
String) ENGINE = Memory()") + instance.query(f"INSERT INTO `{TABLE_NAME_LOCAL}` VALUES (1, 'silver'), (2, 'black')") + + res = instance.query( + f""" + SELECT t1.name,t2.second_name + FROM {creation_expression} AS t1 + JOIN {creation_expression_2} AS t2 + ON t1.tag=t2.id + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "jack\tsparrow\njohn\tdow\n" + + #res = instance.query( + # f""" + # SELECT name + # FROM {creation_expression} + # WHERE tag in ( + # SELECT id + # FROM {creation_expression_2} + # ) + # ORDER BY ALL + # SETTINGS + # object_storage_cluster='cluster_simple', + # object_storage_cluster_join_mode='local' + # """ + #) + + #assert res == "jack\njohn\n" + + res = instance.query( + f""" + SELECT t1.name,t2.second_name + FROM {creation_expression} AS t1 + JOIN `{TABLE_NAME_LOCAL}` AS t2 + ON t1.tag=t2.id + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "jack\tblack\njohn\tsilver\n" + + #res = instance.query( + # f""" + # SELECT name + # FROM {creation_expression} + # WHERE tag in ( + # SELECT id + # FROM `{TABLE_NAME_LOCAL}` + # ) + # ORDER BY ALL + # SETTINGS + # object_storage_cluster='cluster_simple', + # object_storage_cluster_join_mode='local' + # """ + #) + + #assert res == "jack\njohn\n" + + res = instance.query( + f""" + SELECT t1.name,t2.second_name + FROM {creation_expression} AS t1 + CROSS JOIN `{TABLE_NAME_LOCAL}` AS t2 + WHERE t1.tag < 10 AND t2.id < 20 + ORDER BY ALL + SETTINGS + object_storage_cluster='cluster_simple', + object_storage_cluster_join_mode='local' + """ + ) + + assert res == "jack\tblack\njack\tsilver\njohn\tblack\njohn\tsilver\n" + + +@pytest.mark.parametrize("storage_type", ["s3"]) +def test_system_tables_partition_sorting_keys(started_cluster, storage_type): + instance = started_cluster.instances["node1"] + spark = started_cluster.spark_session + + table_name = f"test_sys_tables_keys_{storage_type}_{uuid.uuid4().hex[:8]}" + fq_table = f"spark_catalog.default.{table_name}" + + spark.sql(f"DROP TABLE IF EXISTS {fq_table}") + spark.sql(f""" + CREATE TABLE {fq_table} ( + id INT, + ts TIMESTAMP, + payload STRING + ) + USING iceberg + PARTITIONED BY (bucket(16, id), day(ts)) + TBLPROPERTIES ('format-version' = '2') + """) + spark.sql(f"ALTER TABLE {fq_table} WRITE ORDERED BY (id DESC NULLS LAST, hour(ts))") + spark.sql(f""" + INSERT INTO {fq_table} VALUES + (1, timestamp'2024-01-01 10:00:00', 'a'), + (2, timestamp'2024-01-02 11:00:00', 'b'), + (NULL, timestamp'2024-01-03 12:00:00', 'c') + """) + + time.sleep(2) + default_upload_directory( + started_cluster, + storage_type, + f"/iceberg_data/default/{table_name}/", + f"/iceberg_data/default/{table_name}/", + ) + + create_iceberg_table(storage_type, instance, table_name, started_cluster) + + res = instance.query(f""" + SELECT partition_key, sorting_key + FROM system.tables + WHERE name = '{table_name}' FORMAT csv + """).strip().lower() + + assert res == '"bucket(16, id), day(ts)","id desc, hour(ts) asc"' + + @pytest.mark.parametrize( "storage_type", ["s3", "azure", "local"], @@ -3243,7 +3892,6 @@ def execute_spark_query(query: str): """ ) - creation_expression = get_creation_expression( storage_type, TABLE_NAME, started_cluster, table_function=True ) @@ -3251,7 +3899,6 @@ def execute_spark_query(query: str): instance.query(f"CREATE TABLE {IN_MEMORY_TABLE} (id 
INT) ENGINE = Memory") instance.query(f"INSERT INTO {IN_MEMORY_TABLE} VALUES (2), (4)") - def check_validity_and_get_prunned_files(select_expression): settings1 = { "use_iceberg_partition_pruning": 0 diff --git a/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml new file mode 100644 index 000000000000..a8239a28293c --- /dev/null +++ b/tests/integration/test_storage_s3/configs/lock_object_storage_task_distribution_ms.xml @@ -0,0 +1,7 @@ + + + + 0 + + + diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 53c68fa2e5fb..65ec8d1af5d9 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -74,6 +74,7 @@ def started_cluster(): "configs/access.xml", "configs/users.xml", "configs/s3_retry.xml", + "configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( @@ -122,6 +123,7 @@ def started_cluster(): "configs/users.xml", "configs/s3_retry.xml", "configs/process_archives_as_whole_with_cluster.xml", + "configs/lock_object_storage_task_distribution_ms.xml", ], ) cluster.add_instance( diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 716d0cd00634..a4b9d2dafd9d 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -43,6 +43,8 @@ ALTER TTL ['ALTER MODIFY TTL','MODIFY TTL'] TABLE ALTER TABLE ALTER MATERIALIZE TTL ['MATERIALIZE TTL'] TABLE ALTER TABLE ALTER SETTINGS ['ALTER SETTING','ALTER MODIFY SETTING','MODIFY SETTING','RESET SETTING'] TABLE ALTER TABLE ALTER MOVE PARTITION ['ALTER MOVE PART','MOVE PARTITION','MOVE PART'] TABLE ALTER TABLE +ALTER EXPORT PART ['ALTER EXPORT PART','EXPORT PART'] TABLE ALTER TABLE +ALTER EXPORT PARTITION ['ALTER EXPORT PARTITION','EXPORT PARTITION'] TABLE ALTER TABLE ALTER FETCH PARTITION ['ALTER FETCH PART','FETCH PARTITION'] TABLE ALTER TABLE ALTER FREEZE PARTITION ['FREEZE PARTITION','UNFREEZE'] TABLE ALTER TABLE ALTER UNLOCK SNAPSHOT ['UNLOCK SNAPSHOT'] TABLE ALTER TABLE @@ -133,6 +135,8 @@ SYSTEM DROP PAGE CACHE ['SYSTEM DROP PAGE CACHE','DROP PAGE CACHE'] GLOBAL SYSTE SYSTEM DROP SCHEMA CACHE ['SYSTEM DROP SCHEMA CACHE','DROP SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FORMAT SCHEMA CACHE ['SYSTEM DROP FORMAT SCHEMA CACHE','DROP FORMAT SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP S3 CLIENT CACHE ['SYSTEM DROP S3 CLIENT','DROP S3 CLIENT CACHE'] GLOBAL SYSTEM DROP CACHE +SYSTEM DROP OBJECT STORAGE LIST OBJECTS CACHE ['SYSTEM DROP OBJECT STORAGE LIST OBJECTS CACHE'] GLOBAL SYSTEM DROP CACHE +SYSTEM DROP PARQUET METADATA CACHE ['SYSTEM DROP PARQUET METADATA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP CACHE ['DROP CACHE'] \N SYSTEM SYSTEM RELOAD CONFIG ['RELOAD CONFIG'] GLOBAL SYSTEM RELOAD SYSTEM RELOAD USERS ['RELOAD USERS'] GLOBAL SYSTEM RELOAD @@ -147,6 +151,7 @@ SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP MERGES','START M SYSTEM TTL MERGES ['SYSTEM STOP TTL MERGES','SYSTEM START TTL MERGES','STOP TTL MERGES','START TTL MERGES'] TABLE SYSTEM SYSTEM FETCHES ['SYSTEM STOP FETCHES','SYSTEM START FETCHES','STOP FETCHES','START FETCHES'] TABLE SYSTEM SYSTEM MOVES ['SYSTEM STOP MOVES','SYSTEM START MOVES','STOP MOVES','START MOVES'] TABLE SYSTEM +SYSTEM SWARM ['SYSTEM STOP SWARM MODE','SYSTEM START SWARM MODE','STOP SWARM 
MODE','START SWARM MODE'] GLOBAL SYSTEM SYSTEM PULLING REPLICATION LOG ['SYSTEM STOP PULLING REPLICATION LOG','SYSTEM START PULLING REPLICATION LOG'] TABLE SYSTEM SYSTEM CLEANUP ['SYSTEM STOP CLEANUP','SYSTEM START CLEANUP'] TABLE SYSTEM SYSTEM VIEWS ['SYSTEM REFRESH VIEW','SYSTEM START VIEWS','SYSTEM STOP VIEWS','SYSTEM START VIEW','SYSTEM STOP VIEW','SYSTEM CANCEL VIEW','REFRESH VIEW','START VIEWS','STOP VIEWS','START VIEW','STOP VIEW','CANCEL VIEW'] VIEW SYSTEM diff --git a/tests/queries/0_stateless/02168_avro_bug.sql b/tests/queries/0_stateless/02168_avro_bug.sql index ac98119845f5..338b5ef8b0b4 100644 --- a/tests/queries/0_stateless/02168_avro_bug.sql +++ b/tests/queries/0_stateless/02168_avro_bug.sql @@ -1,5 +1,5 @@ -- Tags: no-fasttest, no-parallel -insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); -insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } -insert into table function file('data.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into table function file('02168_avro_bug.avro', 'Parquet', 'x UInt64') select * from numbers(10) settings engine_file_truncate_on_insert=1; +insert into table function file('02168_avro_bug.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } +insert into table function file('02168_avro_bug.avro', 'Parquet', 'x UInt64') select * from numbers(10); -- { serverError CANNOT_APPEND_TO_FILE } select 'OK'; diff --git a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference index eea09bd06035..1b4fcf850453 100644 --- a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference +++ b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted.reference @@ -18,6 +18,8 @@ columns columns creator_info creator_info +exports +exports failed_parts failed_parts flags diff --git a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference index 0d6c21be132f..9a250fa65580 100644 --- a/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference +++ b/tests/queries/0_stateless/02221_system_zookeeper_unrestricted_like.reference @@ -8,6 +8,7 @@ blocks columns columns creator_info +exports failed_parts flags host @@ -49,6 +50,7 @@ blocks columns columns creator_info +exports failed_parts flags host diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference index 877bb5f390f8..b7211a9f2526 100644 --- a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference +++ b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference @@ -1,2 +1,5 @@ 424242424242424242424242424242424242424242424242424242 22707864971053448441042714569797161695738549521977760418632926980540162388532 +42424242424242424242424242424242 +22707864971053448441042714569797161695738549521977760418632926980540162388532 +42424242424242424242424242424242 diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh index 0f590027f194..2c5a79e36430 100755 --- a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh +++ 
b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh @@ -8,5 +8,13 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # This is parsed as text. $CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" -# But this is parsed as binary because text length happens to be 32 bytes. Not ideal. -$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" +# FIXED_LEN_BYTE_ARRAY(32) is parsed as binary. +$CLICKHOUSE_LOCAL -q "select toFixedString(42424242424242424242424242424242::UInt256::String, 32) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" + +# FIXED_LEN_BYTE_ARRAY(not 32) is parsed as text by the new reader, throws exception in the old reader. +$CLICKHOUSE_LOCAL -q "select toFixedString(42424242424242424242424242424242::UInt256::String, 50) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' --input_format_parquet_use_native_reader_v3=1 -q "select * from table" + +# BYTE_ARRAY of length 32 is interpreted as binary by the old parquet reader, as text by the new one. +$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' --input_format_parquet_use_native_reader_v3=0 -q "select * from table" +$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' --input_format_parquet_use_native_reader_v3=1 -q "select * from table" + diff --git a/tests/queries/0_stateless/02845_parquet_odd_decimals.reference b/tests/queries/0_stateless/02845_parquet_odd_decimals.reference index 29d6383b52c1..d932dbb06fef 100644 --- a/tests/queries/0_stateless/02845_parquet_odd_decimals.reference +++ b/tests/queries/0_stateless/02845_parquet_odd_decimals.reference @@ -1 +1,5 @@ 100 +col-1de12c05-5dd5-4fa7-9f93-33c43c9a4028 Decimal(20, 0) +col-5e1b97f1-dade-4c7d-b71b-e31d789e01a4 String +col-1de12c05-5dd5-4fa7-9f93-33c43c9a4028 Decimal(20, 0) +col-5e1b97f1-dade-4c7d-b71b-e31d789e01a4 String diff --git a/tests/queries/0_stateless/02845_parquet_odd_decimals.sh b/tests/queries/0_stateless/02845_parquet_odd_decimals.sh index f1e2ec849c45..48e451bfed85 100755 --- a/tests/queries/0_stateless/02845_parquet_odd_decimals.sh +++ b/tests/queries/0_stateless/02845_parquet_odd_decimals.sh @@ -11,3 +11,6 @@ ${CLICKHOUSE_CLIENT} --query="drop table if exists 02845_parquet_odd_decimals" ${CLICKHOUSE_CLIENT} --query="create table 02845_parquet_odd_decimals (\`col-1de12c05-5dd5-4fa7-9f93-33c43c9a4028\` Decimal(20, 0), \`col-5e1b97f1-dade-4c7d-b71b-e31d789e01a4\` String) engine Memory" ${CLICKHOUSE_CLIENT} --query="insert into 02845_parquet_odd_decimals from infile '$CUR_DIR/data_parquet/nine_byte_decimals_from_spark.parquet'" ${CLICKHOUSE_CLIENT} --query="select count() from 02845_parquet_odd_decimals" + +${CLICKHOUSE_LOCAL} --query="desc file('$CUR_DIR/data_parquet/nine_byte_decimals_from_spark.parquet') settings schema_inference_make_columns_nullable=0, input_format_parquet_use_native_reader_v3=0" +${CLICKHOUSE_LOCAL} --query="desc file('$CUR_DIR/data_parquet/nine_byte_decimals_from_spark.parquet') settings 
schema_inference_make_columns_nullable=0, input_format_parquet_use_native_reader_v3=1" diff --git a/tests/queries/0_stateless/02995_new_settings_history.sh b/tests/queries/0_stateless/02995_new_settings_history.sh index 9f04c64d47f8..2bd57cec35b3 100755 --- a/tests/queries/0_stateless/02995_new_settings_history.sh +++ b/tests/queries/0_stateless/02995_new_settings_history.sh @@ -23,8 +23,8 @@ fi # Note that this is a broad check. A per version check is done in the upgrade test # Baselines generated with v25.7.1 (pre-release) -# clickhouse local --query "select name, default from system.settings order by name format TSV" > 02995_settings_25_7_1.tsv -# clickhouse local --query "select name, value from system.merge_tree_settings order by name format TSV" > 02995_merge_tree_settings_settings_25_7_1.tsv +# clickhouse local --query "select name, default from system.settings WHERE NOT has((select flatten(groupArray(changes.name)) from system.settings_changes where version >= '25.8'), name) order by name format TSV" > 02995_settings_25_7_1.tsv +# clickhouse local --query "select name, value from system.merge_tree_settings WHERE NOT has((select flatten(groupArray(changes.name)) from system.settings_changes where version >= '25.8'), name) order by name format TSV" > 02995_merge_tree_settings_settings_25_7_1.tsv $CLICKHOUSE_LOCAL --query " WITH old_settings AS ( diff --git a/tests/queries/0_stateless/02995_settings_25_7_1.tsv b/tests/queries/0_stateless/02995_settings_25_7_1.tsv index 490c9e15bc53..23f8a414e3d2 100644 --- a/tests/queries/0_stateless/02995_settings_25_7_1.tsv +++ b/tests/queries/0_stateless/02995_settings_25_7_1.tsv @@ -26,15 +26,16 @@ allow_experimental_bigint_types 1 allow_experimental_codecs 0 allow_experimental_correlated_subqueries 0 allow_experimental_database_atomic 1 -allow_experimental_database_glue_catalog 0 +allow_experimental_database_glue_catalog 1 allow_experimental_database_hms_catalog 0 -allow_experimental_database_iceberg 0 +allow_experimental_database_iceberg 1 allow_experimental_database_materialized_mysql 0 allow_experimental_database_materialized_postgresql 0 allow_experimental_database_replicated 1 -allow_experimental_database_unity_catalog 0 +allow_experimental_database_unity_catalog 1 allow_experimental_delta_kernel_rs 1 allow_experimental_dynamic_type 1 +allow_experimental_export_merge_tree_part 0 allow_experimental_full_text_index 0 allow_experimental_funnel_functions 0 allow_experimental_geo_types 1 @@ -360,6 +361,7 @@ engine_url_skip_empty_files 0 errors_output_format CSV exact_rows_before_limit 0 except_default_mode ALL +export_merge_tree_part_overwrite_file_if_exists 0 external_storage_connect_timeout_sec 10 external_storage_max_read_bytes 0 external_storage_max_read_rows 0 @@ -590,6 +592,7 @@ input_format_parquet_max_block_size 65409 input_format_parquet_prefer_block_bytes 16744704 input_format_parquet_preserve_order 0 input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference 0 +input_format_parquet_use_metadata_cache 1 input_format_parquet_use_native_reader 0 input_format_protobuf_flatten_google_wrappers 0 input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference 0 @@ -862,6 +865,8 @@ network_zstd_compression_level 1 normalize_function_names 1 number_of_mutations_to_delay 0 number_of_mutations_to_throw 0 +object_storage_cluster +object_storage_max_nodes 0 odbc_bridge_connection_pool_size 16 odbc_bridge_use_connection_pooling 1 odbc_max_field_size 0 diff --git 
a/tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh b/tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh index e206e6a6f84e..204f849db91c 100755 --- a/tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh +++ b/tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh @@ -18,100 +18,102 @@ DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet" cp ${DATA_FILE} ${DATA_FILE_USER_PATH} +CLICKHOUSE_CLIENT="${CLICKHOUSE_CLIENT} --input_format_parquet_filter_push_down=false --input_format_parquet_page_filter_push_down=false --optimize_move_to_prewhere=false --input_format_parquet_enable_row_group_prefetch=false" + ${CLICKHOUSE_CLIENT} --query="select count(*) from file('${DATA_FILE_USER_PATH}', Parquet) SETTINGS use_cache_for_count_from_files=false;" echo "bloom filter is off, all row groups should be read" echo "expect rows_read = select count()" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false" | jq 'del(.meta,.statistics.elapsed)' echo "bloom filter is on, some row groups should be skipped" echo "expect rows_read much less than select count()" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true" | jq 'del(.meta,.statistics.elapsed)' echo "bloom filter is on, but where predicate contains data from 2 row groups out of 3." 
echo "Rows read should be less than select count, but greater than previous selects" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "bloom filter is on, but where predicate contains data from all row groups" echo "expect rows_read = select count()" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' or uint64_logical=18441251162536403933 order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' or uint64_logical=18441251162536403933 order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "IN check" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string in ('PFJH', 'ZHZK') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string in ('PFJH', 'ZHZK') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "tuple in case, bf is off." -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)' echo "tuple in case, bf is on." 
-${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "complex tuple in case, bf is off" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)' echo "complex tuple in case, bf is on" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "complex tuple in case, bf is on. Non existent" -${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('NON2', 'NON2'), ('NON3', 'NON3')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('NON2', 'NON2'), ('NON3', 'NON3')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter for json column. 
BF is off" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}'::JSON order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}'::JSON order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter for json column. BF is on" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}'::JSON order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}'::JSON order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter for ipv4 column. 
BF is off" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter for ipv4 column. BF is on" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter for ipv4 column. BF is on. 
Specified in the schema" -${CLICKHOUSE_CLIENT} --query="select ipv4 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv4 IPv4') where ipv4 = toIPv4('0.0.1.143') order by ipv4 asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select ipv4 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv4 IPv4') where ipv4 = toIPv4('0.0.1.143') order by ipv4 asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg" -${CLICKHOUSE_CLIENT} --query="select uint64_logical from file ('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical IPv4') where uint64_logical = toIPv4(5552715629697883300) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select uint64_logical from file ('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical IPv4') where uint64_logical = toIPv4(5552715629697883300) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "BF off for parquet uint64 logical type. Should read everything" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "BF on for parquet uint64 logical type. Uint64 is stored as a signed int 64, but with logical annotation. 
Make sure a value greater than int64 can be queried" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Uint16 is stored as physical type int32 with bidwidth = 16 and sign = false. Make sure a value greater than int16 can be queried. BF is on." -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "BF off for parquet int8 logical type. 
Should read everything" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "BF on for parquet int8 logical type. Should skip row groups" -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=false;" | jq 'del(.meta,.statistics.elapsed)' -${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_enable_json_parsing=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Invalid column conversion with in operation. String type can not be hashed against parquet int64 physical type. 
Should read everything" -${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical String') where uint64_logical in ('5') order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical String') where uint64_logical in ('5') order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Transformations on key column shall not be allowed (=). Should read everything" -${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) = -7711695863945021976 order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) = -7711695863945021976 order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' echo "Transformations on key column shall not be allowed (IN). Should read everything" -${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) in (-7711695863945021976) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false, input_format_parquet_enable_row_group_prefetch=false;" | jq 'del(.meta,.statistics.elapsed)' +${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) in (-7711695863945021976) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' rm -rf ${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* diff --git a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference index ef9b07ba955f..8db1c1f7db07 100644 --- a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference +++ b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.reference @@ -1,4 +1,7 @@ -65409 -16 +25000 +25000 +64 128 -2363 +2048 +1024 +2048 diff --git a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql index e6b13510301e..402deba4f8ff 100644 --- a/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql +++ b/tests/queries/0_stateless/03164_adapting_parquet_reader_output_size.sql @@ -1,25 +1,25 @@ --- Tags: no-fasttest, no-random-settings +-- Tags: no-fasttest, no-random-settings, no-parallel set max_insert_threads=1; +set schema_inference_make_columns_nullable=0; +set engine_file_truncate_on_insert=1; -DROP TABLE IF EXISTS test_parquet; -CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet); -INSERT INTO test_parquet SELECT 
rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); -SELECT max(blockSize()) FROM test_parquet; +-- Average string lengths, approximately: 2, 200, 200, 200 +INSERT INTO FUNCTION file('03164_adapting_parquet_reader_output_size.parquet', Parquet, 'short String, long1 String, long2 String, long_low_cardinality String') SELECT number%100, range(cityHash64(number), cityHash64(number)+10), repeat(cityHash64(number)::String, 6+number%10), repeat((number%10)::String, 200+number%10) FROM numbers(25000); -DROP TABLE IF EXISTS test_parquet; -CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_max_block_size=16; -INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); -SELECT max(blockSize()) FROM test_parquet; +-- Default limits are high, everything goes in one block. +SELECT max(blockSize())+sum(ignore(short, long2)) FROM file('03164_adapting_parquet_reader_output_size.parquet'); +-- Small column doesn't take a lot of bytes, everything goes in one block. +SELECT max(blockSize())+sum(ignore(short)) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_prefer_block_bytes=100000; +-- Specific number of rows requested. +SELECT max(blockSize())+sum(ignore(short, long2)) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_max_block_size=64; +-- Tiny byte limit, reader bumps block size to 128 rows instead of 1 row. +SELECT max(blockSize())+sum(ignore(short, long2)) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_prefer_block_bytes=30; -DROP TABLE IF EXISTS test_parquet; -CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_prefer_block_bytes=30; -INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); -SELECT max(blockSize()) FROM test_parquet; +-- Intermediate byte limit. The two parquet reader implementations estimate row byte sizes slightly +-- differently and don't match exactly, so we round the result. +SELECT roundToExp2(max(blockSize())+sum(ignore(short, long2))) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_prefer_block_bytes=700000; +SELECT roundToExp2(max(blockSize())+sum(ignore(short, long1, long2))) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_prefer_block_bytes=700000; -DROP TABLE IF EXISTS test_parquet; -CREATE TABLE test_parquet (col1 String, col2 String, col3 String, col4 String, col5 String, col6 String, col7 String) ENGINE=File(Parquet) settings input_format_parquet_prefer_block_bytes=30720; -INSERT INTO test_parquet SELECT rand(),rand(),rand(),rand(),rand(),rand(),rand() FROM numbers(100000); -SELECT max(blockSize()) FROM test_parquet; - -DROP TABLE IF EXISTS test_parquet; +-- Only the new parquet reader uses correct length estimate for dictionary-encoded strings.
+SELECT roundToExp2(max(blockSize())+sum(ignore(short, long_low_cardinality))) FROM file('03164_adapting_parquet_reader_output_size.parquet') settings input_format_parquet_prefer_block_bytes=700000, input_format_parquet_use_native_reader_v3=1; diff --git a/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.reference b/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.reference new file mode 100644 index 000000000000..c87ad9008b60 --- /dev/null +++ b/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.reference @@ -0,0 +1,8 @@ +10 +10 +10 +10 +10 +10 +0 +10 diff --git a/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.sql b/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.sql new file mode 100644 index 000000000000..c82b3f4dc0e3 --- /dev/null +++ b/tests/queries/0_stateless/03299_parquet_object_storage_metadata_cache.sql @@ -0,0 +1,63 @@ +-- Tags: no-parallel, no-fasttest, no-parallel-replicas + +SET input_format_parquet_use_native_reader_v3=0; + +DROP TABLE IF EXISTS t_parquet_03262; + +CREATE TABLE t_parquet_03262 (a UInt64) +ENGINE = S3(s3_conn, filename = 'test_03262_{_partition_id}', format = Parquet) +PARTITION BY a; + +INSERT INTO t_parquet_03262 SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) +SETTINGS input_format_parquet_use_metadata_cache=1, use_query_condition_cache=0,optimize_count_from_files=0; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) +SETTINGS input_format_parquet_use_metadata_cache=1, use_query_condition_cache=0, optimize_count_from_files=0, log_comment='test_03262_parquet_metadata_cache'; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = ParquetMetadata) +SETTINGS input_format_parquet_use_metadata_cache=1, use_query_condition_cache=0, optimize_count_from_files=0, log_comment='test_03262_parquet_metadata_format_metadata_cache'; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['ParquetMetaDataCacheHits'] +FROM system.query_log +where log_comment = 'test_03262_parquet_metadata_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1 SETTINGS use_query_condition_cache=0; + +SELECT ProfileEvents['ParquetMetaDataCacheHits'] +FROM system.query_log +where log_comment = 'test_03262_parquet_metadata_format_metadata_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1 SETTINGS use_query_condition_cache=0; + +SYSTEM DROP PARQUET METADATA CACHE; + +SELECT COUNT(*) +FROM s3(s3_conn, filename = 'test_03262_*', format = Parquet) +SETTINGS input_format_parquet_use_metadata_cache=1, use_query_condition_cache=0, optimize_count_from_files=0, log_comment='test_03262_parquet_metadata_cache_cache_empty'; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['ParquetMetaDataCacheHits'] +FROM system.query_log +where log_comment = 'test_03262_parquet_metadata_cache_cache_empty' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1 SETTINGS use_query_condition_cache=0; + +SELECT ProfileEvents['ParquetMetaDataCacheMisses'] +FROM system.query_log +where log_comment = 'test_03262_parquet_metadata_cache_cache_empty' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1 SETTINGS use_query_condition_cache=0; + +DROP TABLE t_parquet_03262; diff --git a/tests/queries/0_stateless/03377_object_storage_list_objects_cache.reference b/tests/queries/0_stateless/03377_object_storage_list_objects_cache.reference new file mode 
100644 index 000000000000..657d2e082516 --- /dev/null +++ b/tests/queries/0_stateless/03377_object_storage_list_objects_cache.reference @@ -0,0 +1,103 @@ +-- { echoOn } + +-- The cached key should be `dir_`, and that includes all three files: 1, 2 and 3. Cache should return all three, but ClickHouse should filter out the third. +SELECT _path, id FROM s3(s3_conn, filename='dir_a/dir_b/t_03377_sample_{1..2}.parquet') order by id SETTINGS use_object_storage_list_objects_cache=1; +test/dir_a/dir_b/t_03377_sample_1.parquet 1 +test/dir_a/dir_b/t_03377_sample_2.parquet 2 +-- Make sure the filtering did not interfere with the cached values +SELECT _path, id FROM s3(s3_conn, filename='dir_a/dir_b/t_03377_sample_*.parquet') order by id SETTINGS use_object_storage_list_objects_cache=1; +test/dir_a/dir_b/t_03377_sample_1.parquet 1 +test/dir_a/dir_b/t_03377_sample_2.parquet 2 +test/dir_a/dir_b/t_03377_sample_3.parquet 3 +SYSTEM FLUSH LOGS; +SELECT ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'cold_list_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCachePrefixMatchHits'] > 0 as prefix_match_hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +0 +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as exact_match_hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +0 +SELECT ProfileEvents['ObjectStorageListObjectsCachePrefixMatchHits'] > 0 as prefix_match_hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'even_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +0 +SELECT ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'even_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'still_exact_match_after_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as exact_match_hit +FROM system.query_log +where log_comment = 'still_exact_match_after_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'after_drop' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +0 +SELECT 
ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'after_drop' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; +1 diff --git a/tests/queries/0_stateless/03377_object_storage_list_objects_cache.sql b/tests/queries/0_stateless/03377_object_storage_list_objects_cache.sql new file mode 100644 index 000000000000..9cd8fe2794b7 --- /dev/null +++ b/tests/queries/0_stateless/03377_object_storage_list_objects_cache.sql @@ -0,0 +1,115 @@ +-- Tags: no-parallel, no-fasttest + +SYSTEM DROP OBJECT STORAGE LIST OBJECTS CACHE; + +INSERT INTO TABLE FUNCTION s3(s3_conn, filename='dir_a/dir_b/t_03377_sample_{_partition_id}.parquet', format='Parquet', structure='id UInt64') PARTITION BY id SETTINGS s3_truncate_on_insert=1 VALUES (1), (2), (3); + +SELECT * FROM s3(s3_conn, filename='dir_**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='cold_list_cache'; +SELECT * FROM s3(s3_conn, filename='dir_**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='warm_list_exact_cache'; +SELECT * FROM s3(s3_conn, filename='dir_a/dir_b**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='warm_list_prefix_match_cache'; +SELECT * FROM s3(s3_conn, filename='dirr_**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='warm_list_cache_miss'; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +SELECT * FROM s3(s3_conn, filename='d**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='even_shorter_prefix'; +SELECT * FROM s3(s3_conn, filename='dir_**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='still_exact_match_after_shorter_prefix'; +SYSTEM DROP OBJECT STORAGE LIST OBJECTS CACHE; +SELECT * FROM s3(s3_conn, filename='dir_**.parquet') Format Null SETTINGS use_object_storage_list_objects_cache=1, log_comment='after_drop'; + +-- { echoOn } + +-- The cached key should be `dir_`, and that includes all three files: 1, 2 and 3. Cache should return all three, but ClickHouse should filter out the third. 
+SELECT _path, id FROM s3(s3_conn, filename='dir_a/dir_b/t_03377_sample_{1..2}.parquet') order by id SETTINGS use_object_storage_list_objects_cache=1; + +-- Make sure the filtering did not interfere with the cached values +SELECT _path, id FROM s3(s3_conn, filename='dir_a/dir_b/t_03377_sample_*.parquet') order by id SETTINGS use_object_storage_list_objects_cache=1; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'cold_list_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCachePrefixMatchHits'] > 0 as prefix_match_hit +FROM system.query_log +where log_comment = 'warm_list_exact_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as exact_match_hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCachePrefixMatchHits'] > 0 as prefix_match_hit +FROM system.query_log +where log_comment = 'warm_list_prefix_match_cache' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'even_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'even_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'still_exact_match_after_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheExactMatchHits'] > 0 as exact_match_hit +FROM system.query_log +where log_comment = 'still_exact_match_after_shorter_prefix' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheHits'] > 0 as hit +FROM system.query_log +where log_comment = 'after_drop' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; + +SELECT ProfileEvents['ObjectStorageListObjectsCacheMisses'] > 0 as miss +FROM system.query_log +where log_comment = 'after_drop' +AND type = 'QueryFinish' +ORDER BY event_time desc +LIMIT 1; diff --git a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql index 718eb63ad923..d1d165e25d57 100644 --- a/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql +++ 
b/tests/queries/0_stateless/03413_experimental_settings_cannot_be_enabled_by_default.sql @@ -4,5 +4,10 @@ -- However, some settings in the experimental tier are meant to control another experimental feature, and then they can be enabled as long as the feature itself is disabled. -- These are in the exceptions list inside NOT IN. -SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('throw_on_unsupported_query_inside_transaction'); +SELECT name, value FROM system.settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ( + 'throw_on_unsupported_query_inside_transaction', +-- turned ON for Altinity Antalya builds specifically + 'allow_experimental_iceberg_read_optimization', + 'allow_experimental_export_merge_tree_part', +); SELECT name, value FROM system.merge_tree_settings WHERE tier = 'Experimental' AND type = 'Bool' AND value != '0' AND name NOT IN ('remove_rolled_back_parts_immediately'); diff --git a/tests/queries/0_stateless/03550_analyzer_remote_view_columns.sql b/tests/queries/0_stateless/03550_analyzer_remote_view_columns.sql index 044e34d68b5d..7eb3f2e9b67d 100644 --- a/tests/queries/0_stateless/03550_analyzer_remote_view_columns.sql +++ b/tests/queries/0_stateless/03550_analyzer_remote_view_columns.sql @@ -39,4 +39,4 @@ WHERE AND log_comment = 'THIS IS A COMMENT TO MARK THE INITIAL QUERY' LIMIT 1) AND type = 'QueryFinish' - AND NOT is_initial_query; + AND query_id != initial_query_id; diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.reference b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.reference new file mode 100644 index 000000000000..00fc51f68254 --- /dev/null +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.reference @@ -0,0 +1,33 @@ +---- Export 2020_1_1_0 and 2021_2_2_0 +---- Both data parts should appear +1 2020 +2 2020 +3 2020 +4 2021 +---- Export the same part again, it should be idempotent +1 2020 +2 2020 +3 2020 +4 2021 +---- Data in roundtrip MergeTree table (should match s3_table) +1 2020 +2 2020 +3 2020 +4 2021 +---- Export 2020_1_1_0 and 2021_2_2_0 to wildcard table +---- Both data parts should appear +1 2020 +2 2020 +3 2020 +4 2021 +---- Export the same part again, it should be idempotent +1 2020 +2 2020 +3 2020 +4 2021 +---- Export 2020_1_1_0 and 2021_2_2_0 to wildcard table with partition expression with function +---- Both data parts should appear +1 2020 +2 2020 +3 2020 +4 2021 diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh new file mode 100755 index 000000000000..ae7f05dac90a --- /dev/null +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Tags: no-fasttest +# Tag no-fasttest: requires s3 storage + + +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +mt_table="mt_table_${RANDOM}" +mt_table_partition_expression_with_function="mt_table_partition_expression_with_function_${RANDOM}" +s3_table="s3_table_${RANDOM}" +s3_table_wildcard="s3_table_wildcard_${RANDOM}" +s3_table_wildcard_partition_expression_with_function="s3_table_wildcard_partition_expression_with_function_${RANDOM}" +mt_table_roundtrip="mt_table_roundtrip_${RANDOM}" + +query() { + $CLICKHOUSE_CLIENT --query "$1" +} + +query "DROP TABLE IF EXISTS $mt_table, $s3_table, $mt_table_roundtrip, $s3_table_wildcard, $s3_table_wildcard_partition_expression_with_function, $mt_table_partition_expression_with_function" + +query "CREATE TABLE $mt_table (id UInt64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple()" +query "CREATE TABLE $s3_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table', format=Parquet, partition_strategy='hive') PARTITION BY year" + +query "INSERT INTO $mt_table VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021)" +echo "---- Export 2020_1_1_0 and 2021_2_2_0" +query "ALTER TABLE $mt_table EXPORT PART '2020_1_1_0' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" +query "ALTER TABLE $mt_table EXPORT PART '2021_2_2_0' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +echo "---- Both data parts should appear" +query "SELECT * FROM $s3_table ORDER BY id" + +echo "---- Export the same part again, it should be idempotent" +query "ALTER TABLE $mt_table EXPORT PART '2020_1_1_0' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +query "SELECT * FROM $s3_table ORDER BY id" + +query "CREATE TABLE $mt_table_roundtrip ENGINE = MergeTree() PARTITION BY year ORDER BY tuple() AS SELECT * FROM $s3_table" + +echo "---- Data in roundtrip MergeTree table (should match s3_table)" +query "SELECT * FROM $s3_table ORDER BY id" + +query "CREATE TABLE $s3_table_wildcard (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table_wildcard/{_partition_id}/{_file}.parquet', format=Parquet, partition_strategy='wildcard') PARTITION BY year" + +echo "---- Export 2020_1_1_0 and 2021_2_2_0 to wildcard table" +query "ALTER TABLE $mt_table EXPORT PART '2020_1_1_0' TO TABLE $s3_table_wildcard SETTINGS allow_experimental_export_merge_tree_part = 1" +query "ALTER TABLE $mt_table EXPORT PART '2021_2_2_0' TO TABLE $s3_table_wildcard SETTINGS allow_experimental_export_merge_tree_part = 1" + +sleep 3 + +echo "---- Both data parts should appear" +query "SELECT * FROM s3(s3_conn, filename='$s3_table_wildcard/**.parquet') ORDER BY id" + +echo "---- Export the same part again, it should be idempotent" +query "ALTER TABLE $mt_table EXPORT PART '2020_1_1_0' TO TABLE $s3_table_wildcard SETTINGS allow_experimental_export_merge_tree_part = 1" + +query "SELECT * FROM s3(s3_conn, filename='$s3_table_wildcard/**.parquet') ORDER BY id" + +query "CREATE TABLE $mt_table_partition_expression_with_function (id UInt64, year UInt16) ENGINE = MergeTree() PARTITION BY toString(year) ORDER BY tuple()" +query "CREATE TABLE $s3_table_wildcard_partition_expression_with_function (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table_wildcard_partition_expression_with_function/{_partition_id}/{_file}.parquet', format=Parquet, partition_strategy='wildcard') PARTITION BY toString(year)" + +query "INSERT INTO $mt_table_partition_expression_with_function VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021)" + +echo "---- Export 2020_1_1_0 and 2021_2_2_0 to wildcard table with 
partition expression with function" +query "ALTER TABLE $mt_table_partition_expression_with_function EXPORT PART 'cb217c742dc7d143b61583011996a160_1_1_0' TO TABLE $s3_table_wildcard_partition_expression_with_function SETTINGS allow_experimental_export_merge_tree_part = 1" +query "ALTER TABLE $mt_table_partition_expression_with_function EXPORT PART '3be6d49ecf9749a383964bc6fab22d10_2_2_0' TO TABLE $s3_table_wildcard_partition_expression_with_function SETTINGS allow_experimental_export_merge_tree_part = 1" + +sleep 1 + +echo "---- Both data parts should appear" +query "SELECT * FROM s3(s3_conn, filename='$s3_table_wildcard_partition_expression_with_function/**.parquet') ORDER BY id" + +query "DROP TABLE IF EXISTS $mt_table, $s3_table, $mt_table_roundtrip, $s3_table_wildcard, $s3_table_wildcard_partition_expression_with_function, $mt_table_partition_expression_with_function" diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.reference b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.reference new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql new file mode 100644 index 000000000000..a61c066e8789 --- /dev/null +++ b/tests/queries/0_stateless/03572_export_merge_tree_part_to_object_storage_simple.sql @@ -0,0 +1,22 @@ +-- Tags: no-parallel, no-fasttest + +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table; + +CREATE TABLE 03572_mt_table (id UInt64, year UInt16) ENGINE = MergeTree() PARTITION BY year ORDER BY tuple(); + +INSERT INTO 03572_mt_table VALUES (1, 2020); + +-- Create a table with a different partition key and export a partition to it. It should throw +CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x; + +ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_invalid_schema_table +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS} + +DROP TABLE 03572_invalid_schema_table; + +-- The only partition strategy that supports exports is hive. 
Wildcard should throw +CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table/{_partition_id}', format='Parquet', partition_strategy='wildcard') PARTITION BY (id, year); + +ALTER TABLE 03572_mt_table EXPORT PART '2020_1_1_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED} + +DROP TABLE IF EXISTS 03572_mt_table, 03572_invalid_schema_table; diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.reference b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.reference new file mode 100644 index 000000000000..991a5901ae6e --- /dev/null +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.reference @@ -0,0 +1,11 @@ +---- Get actual part names and export them +---- Both data parts should appear +1 2020 +2 2020 +3 2020 +4 2021 +---- Export the same part again, it should be idempotent +1 2020 +2 2020 +3 2020 +4 2021 diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.sh b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.sh new file mode 100755 index 000000000000..142664954bff --- /dev/null +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Tags: replica, no-parallel, no-replicated-database, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +rmt_table="rmt_table_${RANDOM}" +s3_table="s3_table_${RANDOM}" + +query() { + $CLICKHOUSE_CLIENT --query "$1" +} + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table" + +query "CREATE TABLE $rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/$rmt_table', 'replica1') PARTITION BY year ORDER BY tuple()" +query "CREATE TABLE $s3_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table', format=Parquet, partition_strategy='hive') PARTITION BY year" + +query "INSERT INTO $rmt_table VALUES (1, 2020), (2, 2020), (3, 2020), (4, 2021)" + +echo "---- Get actual part names and export them" +part_2020=$(query "SELECT name FROM system.parts WHERE database = currentDatabase() AND table = '$rmt_table' AND partition = '2020' ORDER BY name LIMIT 1" | tr -d '\n') +part_2021=$(query "SELECT name FROM system.parts WHERE database = currentDatabase() AND table = '$rmt_table' AND partition = '2021' ORDER BY name LIMIT 1" | tr -d '\n') + +query "ALTER TABLE $rmt_table EXPORT PART '$part_2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" +query "ALTER TABLE $rmt_table EXPORT PART '$part_2021' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +echo "---- Both data parts should appear" +query "SELECT * FROM $s3_table ORDER BY id" + +echo "---- Export the same part again, it should be idempotent" +query "ALTER TABLE $rmt_table EXPORT PART '$part_2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +query "SELECT * FROM $s3_table ORDER BY id" + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table" diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.reference b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.reference new file mode 100644 index 
000000000000..e69de29bb2d1 diff --git a/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql new file mode 100644 index 000000000000..f8f23532f0a7 --- /dev/null +++ b/tests/queries/0_stateless/03572_export_replicated_merge_tree_part_to_object_storage_simple.sql @@ -0,0 +1,22 @@ +-- Tags: no-parallel, no-fasttest + +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table; + +CREATE TABLE 03572_rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/test_03572_rmt/03572_rmt_table', 'replica1') PARTITION BY year ORDER BY tuple(); + +INSERT INTO 03572_rmt_table VALUES (1, 2020); + +-- Create a table with a different partition key and export a partition to it. It should throw +CREATE TABLE 03572_invalid_schema_table (id UInt64, x UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table', format='Parquet', partition_strategy='hive') PARTITION BY x; + +ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table +SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError INCOMPATIBLE_COLUMNS} + +DROP TABLE 03572_invalid_schema_table; + +-- The only partition strategy that supports exports is hive. Wildcard should throw +CREATE TABLE 03572_invalid_schema_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='03572_invalid_schema_table/{_partition_id}', format='Parquet', partition_strategy='wildcard') PARTITION BY (id, year); + +ALTER TABLE 03572_rmt_table EXPORT PART '2020_0_0_0' TO TABLE 03572_invalid_schema_table SETTINGS allow_experimental_export_merge_tree_part = 1; -- {serverError NOT_IMPLEMENTED} + +DROP TABLE IF EXISTS 03572_rmt_table, 03572_invalid_schema_table; diff --git a/tests/queries/0_stateless/03596_parquet_prewhere_page_skip_bug.sql b/tests/queries/0_stateless/03596_parquet_prewhere_page_skip_bug.sql index 92fa42185c67..79cac73b12b7 100644 --- a/tests/queries/0_stateless/03596_parquet_prewhere_page_skip_bug.sql +++ b/tests/queries/0_stateless/03596_parquet_prewhere_page_skip_bug.sql @@ -4,6 +4,6 @@ set output_format_parquet_use_custom_encoder = 1; set input_format_parquet_use_native_reader_v3 = 1; set engine_file_truncate_on_insert = 1; -insert into function file('03596_parquet_prewhere_page_skip_bug.parquet') select number as n, number*10 as n10 from numbers(200) settings output_format_parquet_data_page_size=100, output_format_parquet_batch_size=10, output_format_parquet_row_group_size=100, engine_file_truncate_on_insert=1, output_format_parquet_write_page_index=0; +insert into function file('03596_parquet_prewhere_page_skip_bug.parquet') select number as n, number*10 as n10 from numbers(200) settings output_format_parquet_data_page_size=100, output_format_parquet_batch_size=10, output_format_parquet_row_group_size=100, output_format_parquet_write_page_index=0; select n10 from file('03596_parquet_prewhere_page_skip_bug.parquet') prewhere n in (131, 174, 175, 176) order by all settings input_format_parquet_page_filter_push_down=0, input_format_parquet_filter_push_down=0, input_format_parquet_bloom_filter_push_down=0, input_format_parquet_max_block_size=10; diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.reference b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference new file mode 100644 index 000000000000..d48023362b99 --- /dev/null +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.reference 
@@ -0,0 +1,31 @@ +Select from source table +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 +Select from destination table +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +Export partition 2022 +Select from destination table again +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 +---- Data in roundtrip ReplicatedMergeTree table (should match s3_table) +1 2020 +2 2020 +3 2020 +4 2021 +5 2021 +6 2022 +7 2022 diff --git a/tests/queries/0_stateless/03604_export_merge_tree_partition.sh b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh new file mode 100755 index 000000000000..87503112aadb --- /dev/null +++ b/tests/queries/0_stateless/03604_export_merge_tree_partition.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, replica, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +rmt_table="rmt_table_${RANDOM}" +s3_table="s3_table_${RANDOM}" +rmt_table_roundtrip="rmt_table_roundtrip_${RANDOM}" + +query() { + $CLICKHOUSE_CLIENT --query "$1" +} + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table, $rmt_table_roundtrip" + +query "CREATE TABLE $rmt_table (id UInt64, year UInt16) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/$rmt_table', 'replica1') PARTITION BY year ORDER BY tuple()" +query "CREATE TABLE $s3_table (id UInt64, year UInt16) ENGINE = S3(s3_conn, filename='$s3_table', format=Parquet, partition_strategy='hive') PARTITION BY year" + +query "INSERT INTO $rmt_table VALUES (1, 2020), (2, 2020), (4, 2021)" + +query "INSERT INTO $rmt_table VALUES (3, 2020), (5, 2021)" + +query "INSERT INTO $rmt_table VALUES (6, 2022), (7, 2022)" + +# sync replicas +query "SYSTEM SYNC REPLICA $rmt_table" + +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2020' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2021' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +# todo poll some kind of status +sleep 15 + +echo "Select from source table" +query "SELECT * FROM $rmt_table ORDER BY id" + +echo "Select from destination table" +query "SELECT * FROM $s3_table ORDER BY id" + +echo "Export partition 2022" +query "ALTER TABLE $rmt_table EXPORT PARTITION ID '2022' TO TABLE $s3_table SETTINGS allow_experimental_export_merge_tree_part = 1" + +# todo poll some kind of status +sleep 5 + +echo "Select from destination table again" +query "SELECT * FROM $s3_table ORDER BY id" + +query "CREATE TABLE $rmt_table_roundtrip ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/$rmt_table_roundtrip', 'replica1') PARTITION BY year ORDER BY tuple() AS SELECT * FROM $s3_table" + +echo "---- Data in roundtrip ReplicatedMergeTree table (should match s3_table)" +query "SELECT * FROM $rmt_table_roundtrip ORDER BY id" + +query "DROP TABLE IF EXISTS $rmt_table, $s3_table, $rmt_table_roundtrip" \ No newline at end of file diff --git a/tests/queries/0_stateless/03624_parquet_row_number.reference b/tests/queries/0_stateless/03624_parquet_row_number.reference new file mode 100644 index 000000000000..41750ad34aa3 --- /dev/null +++ b/tests/queries/0_stateless/03624_parquet_row_number.reference @@ -0,0 +1,9 @@ +7 70 +8 80 +10 100 +11 110 +13 130 +14 140 +16 160 +17 170 +19 190 diff --git a/tests/queries/0_stateless/03624_parquet_row_number.sql b/tests/queries/0_stateless/03624_parquet_row_number.sql new file mode 100644 index 000000000000..b2dcab18716c --- /dev/null +++ 
b/tests/queries/0_stateless/03624_parquet_row_number.sql @@ -0,0 +1,7 @@ +-- Tags: no-fasttest, no-parallel + +set engine_file_truncate_on_insert = 1; + +insert into function file('03624_parquet_row_number.parquet') select number*10 as x from numbers(20) settings max_threads=1, output_format_parquet_row_group_size=5; + +select _row_number, x from file('03624_parquet_row_number.parquet') where x % 3 != 0 and x > 60 order by _row_number; diff --git a/tests/queries/0_stateless/03643_hybrid.reference b/tests/queries/0_stateless/03643_hybrid.reference new file mode 100644 index 000000000000..1954c097b478 --- /dev/null +++ b/tests/queries/0_stateless/03643_hybrid.reference @@ -0,0 +1,225 @@ +Hybrid creation requires allow_experimental_hybrid_table +Check Hybrid engine is registered +Hybrid +Ensure no leftovers before validation checks +Expect error when Hybrid has no arguments +Expect error when Hybrid has a single literal argument +Expect error when Hybrid arguments are literals only +Expect error when first argument is a table function of the wrong subtype (can not construct Distributed from file) +Expect error when first argument is not a table function (scalar expression) +Expect error when first argument is a table function of the wrong subtype (can not construct Distributed from url) +Expect error when predicate references a missing column +Missing column + schema inference +Create Hybrid table with remote() and constant predicate (explicit column list) +CREATE TABLE default.test_tiered_distributed\n(\n `dummy` UInt8\n)\nENGINE = Hybrid(remote(\'localhost:9000\'), 1) +dummy UInt8 +0 +Row 1: +────── +database: default +name: test_tiered_distributed +engine: Hybrid +create_table_query: CREATE TABLE default.test_tiered_distributed (`dummy` UInt8) ENGINE = Hybrid(remote('localhost:9000'), 1) +engine_full: Hybrid(remote('localhost:9000'), 1) +Create Hybrid table with remote table function and predicate (inference) +CREATE TABLE default.test_tiered_distributed_numbers_range\n(\n `number` UInt64\n)\nENGINE = Hybrid(remote(\'localhost:9000\', \'system.numbers\'), number < 5) +0 +1 +2 +3 +4 +Create Hybrid table with two remote segments as table +CREATE TABLE default.test_tiered_distributed_numbers_dual\n(\n `number` UInt64\n)\nENGINE = Hybrid(remote(\'localhost:9000\', \'system.numbers\'), number < 5, remote(\'localhost:9000\', system.numbers), (number >= 10) AND (number <= 15))\nCOMMENT \'Generates all natural numbers, starting from 0 (to 2^64 - 1, and then again) in sorted order.\' +0 +1 +2 +3 +4 +10 +11 +12 +13 +14 +15 +0 +1 +2 +3 +4 +10 +11 +12 +13 +14 +15 +Create Hybrid table combining remote function and local table +0 +1 +2 +3 +4 +10 +11 +12 +13 +14 +15 +Verify Hybrid skips segment with always false predicate on the first segment +10 +11 +12 +13 +14 +15 +Verify Hybrid skips segment with always false predicate on the second segment +0 +1 +2 +Hybrid raises when a segment is missing a column used by the base schema +Prepare local MergeTree table for multi-segment tests +Populate local table with sample data +Create Hybrid table with three segment pairs +Count rows across all segments +6 +Count rows from segments with id > 4 +1 +Count rows where value > 200 +3 +Count rows named Alice +1 +Select rows ordered by value descending (id > 2) +4 David 300.2 +5 Eve 250.1 +3 Charlie 150.7 +Limit results ordered by id +0 Invalid 2022-01-01 10:00:00 0.5 +1 Alice 2022-01-01 10:00:00 100.5 +2 Bob 2022-01-02 11:00:00 200.3 +Explain plan for filter on value +Union (Hybrid) + ReadFromRemote (Read from remote 
replica) + ReadFromRemote (Read from remote replica) + ReadFromRemote (Read from remote replica) +Union (Hybrid) + ReadFromRemote (Read from remote replica) + ReadFromRemote (Read from remote replica) + ReadFromRemote (Read from remote replica) +Union (Hybrid) + ReadFromRemote (Read from remote replica) + Expression ((Projection + Before ORDER BY)) + Expression (WHERE) + ReadFromMergeTree (default.test_tiered_local_data) + Expression ((Projection + Before ORDER BY)) + Expression (WHERE) + ReadFromMergeTree (default.test_tiered_local_data) +Union (Hybrid) + ReadFromRemote (Read from remote replica) + Expression ((Project names + Projection)) + Expression ((WHERE + Change column names to column identifiers)) + ReadFromMergeTree (default.test_tiered_local_data) + Expression ((Project names + Projection)) + Expression ((WHERE + Change column names to column identifiers)) + ReadFromMergeTree (default.test_tiered_local_data) +Aggregate values across name when filtering by event_time +David 1 300.2 +Eve 1 250.1 +Bob 1 200.3 +Charlie 1 150.7 +Verify additional_table_filters works consistently (legacy analyser) +2 Bob 200.3 +Verify additional_table_filters works consistently (new analyser) +2 Bob 200.3 +Clean up Hybrid table with three segment pairs +Clean up local helper table +Drop predicate filtering fixtures if they exist +Create local tables representing before/after watermark partitions +Create second local table with different value type +Insert rows before watermark into both tables +Insert rows after watermark into both tables +Create Hybrid table with analyzer disabled during reads +Insert row via Hybrid table (should go to first segment) +Verify that inserted row landed in first table +17 John 2025-09-25 400 +Verify that second table did not receive the inserted row +0 +Read predicate-filtered data with analyzer disabled and no localhost preference +14 David 2025-09-05 400 +15 Eve 2025-09-10 500 +16 Frank 2025-09-15 600 +17 John 2025-09-25 400 +21 Alice 2025-08-15 100 +22 Bob 2025-08-20 200 +23 Charlie 2025-08-25 300 +Read predicate-filtered data with analyzer enabled and no localhost preference +14 David 2025-09-05 400 +15 Eve 2025-09-10 500 +16 Frank 2025-09-15 600 +17 John 2025-09-25 400 +21 Alice 2025-08-15 100 +22 Bob 2025-08-20 200 +23 Charlie 2025-08-25 300 +Read predicate-filtered data with analyzer disabled and prefer localhost replica +14 David 2025-09-05 400 +15 Eve 2025-09-10 500 +16 Frank 2025-09-15 600 +17 John 2025-09-25 400 +21 Alice 2025-08-15 100 +22 Bob 2025-08-20 200 +23 Charlie 2025-08-25 300 +Read predicate-filtered data with analyzer enabled and prefer localhost replica +14 David 2025-09-05 400 +15 Eve 2025-09-10 500 +16 Frank 2025-09-15 600 +17 John 2025-09-25 400 +21 Alice 2025-08-15 100 +22 Bob 2025-08-20 200 +23 Charlie 2025-08-25 300 +Check if the subqueries were recorded in query_log (hybrid_table_auto_cast_columns = 0) +Row 1: +────── +type: QueryFinish +is_initial_query2: 1 +tbl: ['_table_function.remote','db.test_tiered_watermark'] +qry: SELECT * FROM test_tiered_watermark ORDER BY id DESC SETTINGS enable_analyzer = 1, hybrid_table_auto_cast_columns = 0, prefer_localhost_replica = 0, log_queries=1, serialize_query_plan=0, log_comment = 'test_tiered_watermark1', max_threads=1 FORMAT Null; +log_comment: test_tiered_watermark1 + +Row 2: +────── +type: QueryFinish +is_initial_query2: 0 +tbl: ['db.test_tiered_watermark_after'] +qry: SELECT `__table1`.`id` AS `id`, `__table1`.`name` AS `name`, `__table1`.`date` AS `date`, `__table1`.`value` AS `value` FROM 
`db`.`test_tiered_watermark_after` AS `__table1` WHERE `__table1`.`date` >= '2025-09-01' ORDER BY `__table1`.`id` DESC +log_comment: test_tiered_watermark1 + +Row 3: +────── +type: QueryFinish +is_initial_query2: 0 +tbl: ['db.test_tiered_watermark_before'] +qry: SELECT `__table1`.`id` AS `id`, `__table1`.`name` AS `name`, `__table1`.`date` AS `date`, `__table1`.`value` AS `value` FROM `db`.`test_tiered_watermark_before` AS `__table1` WHERE `__table1`.`date` < '2025-09-01' ORDER BY `__table1`.`id` DESC +log_comment: test_tiered_watermark1 +Check if the subqueries were recorded in query_log (hybrid_table_auto_cast_columns = 1) +Row 1: +────── +type: QueryFinish +is_initial_query2: 1 +tbl: ['_table_function.remote','db.test_tiered_watermark'] +qry: SELECT * FROM test_tiered_watermark ORDER BY id DESC SETTINGS enable_analyzer = 1, hybrid_table_auto_cast_columns = 1, prefer_localhost_replica = 0, log_queries=1, serialize_query_plan=0, log_comment = 'test_tiered_watermark2', max_threads=1 FORMAT Null; +log_comment: test_tiered_watermark2 + +Row 2: +────── +type: QueryFinish +is_initial_query2: 0 +tbl: ['db.test_tiered_watermark_after'] +qry: SELECT _CAST(`__table1`.`id`, 'UInt32') AS `id`, _CAST(`__table1`.`name`, 'String') AS `name`, `__table1`.`date` AS `date`, _CAST(`__table1`.`value`, 'UInt32') AS `value` FROM `db`.`test_tiered_watermark_after` AS `__table1` WHERE `__table1`.`date` >= '2025-09-01' ORDER BY `id` DESC +log_comment: test_tiered_watermark2 + +Row 3: +────── +type: QueryFinish +is_initial_query2: 0 +tbl: ['db.test_tiered_watermark_before'] +qry: SELECT _CAST(`__table1`.`id`, 'UInt32') AS `id`, _CAST(`__table1`.`name`, 'String') AS `name`, `__table1`.`date` AS `date`, _CAST(`__table1`.`value`, 'UInt32') AS `value` FROM `db`.`test_tiered_watermark_before` AS `__table1` WHERE `__table1`.`date` < '2025-09-01' ORDER BY _CAST(`__table1`.`id`, 'UInt32') DESC +log_comment: test_tiered_watermark2 +Clean up predicate filtering tables diff --git a/tests/queries/0_stateless/03643_hybrid.sql b/tests/queries/0_stateless/03643_hybrid.sql new file mode 100644 index 000000000000..851045aafcfa --- /dev/null +++ b/tests/queries/0_stateless/03643_hybrid.sql @@ -0,0 +1,407 @@ +SELECT 'Hybrid creation requires allow_experimental_hybrid_table'; +SET allow_experimental_hybrid_table = 0; +CREATE TABLE test_hybrid_requires_setting (`dummy` UInt8) ENGINE = Hybrid(remote('localhost:9000'), 1); -- { serverError SUPPORT_IS_DISABLED } +DROP TABLE IF EXISTS test_hybrid_requires_setting SYNC; + +SET allow_experimental_hybrid_table = 1; + +SELECT 'Check Hybrid engine is registered'; +SELECT name FROM system.table_engines WHERE name = 'Hybrid'; + +SELECT 'Ensure no leftovers before validation checks'; +DROP TABLE IF EXISTS test_tiered_distributed SYNC; +DROP TABLE IF EXISTS test_tiered_distributed_bad_args SYNC; +DROP TABLE IF EXISTS test_tiered_distributed_invalid_first_arg SYNC; + +SELECT 'Expect error when Hybrid has no arguments'; +CREATE TABLE test_tiered_distributed_bad_args (`id` UInt32,`name` String) ENGINE = Hybrid(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +SELECT 'Expect error when Hybrid has a single literal argument'; +CREATE TABLE test_tiered_distributed_bad_args (`id` UInt32,`name` String) ENGINE = Hybrid(1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +SELECT 'Expect error when Hybrid arguments are literals only'; +CREATE TABLE test_tiered_distributed_bad_args (`id` UInt32,`name` String) ENGINE = Hybrid(1, 1); -- { serverError BAD_ARGUMENTS } + +SELECT 'Expect error when 
first argument is a table function of the wrong subtype (can not construct Distributed from file)'; +CREATE TABLE test_tiered_distributed_invalid_first_arg (`id` UInt32, `name` String) ENGINE = Hybrid(file('foo.x'), 1); -- { serverError BAD_ARGUMENTS } + +SELECT 'Expect error when first argument is not a table function (scalar expression)'; +CREATE TABLE test_tiered_distributed_invalid_first_arg (`id` UInt32, `name` String) ENGINE = Hybrid(sin(3), 1); -- { serverError BAD_ARGUMENTS } + +SELECT 'Expect error when first argument is a table function of the wrong subtype (can not construct Distributed from url)'; +CREATE TABLE test_tiered_distributed_invalid_first_arg (`id` UInt32, `name` String) ENGINE = Hybrid(url('http://google.com', 'RawBLOB'), 1); -- { serverError BAD_ARGUMENTS } + +SELECT 'Expect error when predicate references a missing column'; +CREATE TABLE test_tiered_distributed_bad_args(`number` UInt64) ENGINE = Hybrid(remote('localhost:9000', system.numbers), number2 < 5); -- { serverError BAD_ARGUMENTS } + +SELECT 'Missing column + schema inference'; +CREATE TABLE test_tiered_distributed_bad_args ENGINE = Hybrid(remote('localhost:9000', system.numbers), number2 < 5); -- { serverError BAD_ARGUMENTS } + +DROP TABLE IF EXISTS test_tiered_distributed_bad_args SYNC; + +SELECT 'Create Hybrid table with remote() and constant predicate (explicit column list)'; +DROP TABLE IF EXISTS test_tiered_distributed SYNC; +CREATE TABLE test_tiered_distributed(`dummy` UInt8) ENGINE = Hybrid(remote('localhost:9000'), 1); +SHOW CREATE TABLE test_tiered_distributed; +DESCRIBE TABLE test_tiered_distributed; +SELECT * FROM test_tiered_distributed; +SELECT database, name, engine, create_table_query, engine_full FROM system.tables WHERE table = 'test_tiered_distributed' FORMAT Vertical; +DROP TABLE IF EXISTS test_tiered_distributed SYNC; + +SELECT 'Create Hybrid table with remote table function and predicate (inference)'; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_range SYNC; +CREATE TABLE test_tiered_distributed_numbers_range ENGINE = Hybrid(remote('localhost:9000', system.numbers), number < 5); +SHOW CREATE TABLE test_tiered_distributed_numbers_range; +SELECT * FROM test_tiered_distributed_numbers_range ORDER BY number; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_range SYNC; + +SELECT 'Create Hybrid table with two remote segments as table'; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_dual SYNC; +CREATE TABLE test_tiered_distributed_numbers_dual ENGINE = Hybrid( + remote('localhost:9000', system.numbers), number < 5, + remote('localhost:9000', system.numbers), number BETWEEN 10 AND 15 +) AS system.numbers; + +SHOW CREATE TABLE test_tiered_distributed_numbers_dual; +SELECT * FROM test_tiered_distributed_numbers_dual ORDER BY number SETTINGS enable_analyzer = 0; +SELECT * FROM test_tiered_distributed_numbers_dual ORDER BY number SETTINGS enable_analyzer = 1; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_dual SYNC; + +SELECT 'Create Hybrid table combining remote function and local table'; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_mixed SYNC; +CREATE TABLE test_tiered_distributed_numbers_mixed +( + `number` UInt64 +) ENGINE = Hybrid( + remote('localhost:9000', system.numbers), number < 5, + system.numbers, number BETWEEN 10 AND 15 +); +SELECT * FROM test_tiered_distributed_numbers_mixed ORDER BY number; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_mixed SYNC; + +SELECT 'Verify Hybrid skips segment with always false predicate on the first segment'; 
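+-- A constant predicate of 0 is always false, so the corresponding segment is expected to
+-- contribute no rows: the next two cases should return only the rows selected by the other
+-- segment's predicate (10..15 when the first segment is disabled, 0..2 when the second is).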
+DROP TABLE IF EXISTS test_tiered_distributed_numbers_skip_first SYNC; +CREATE TABLE test_tiered_distributed_numbers_skip_first +( + `number` UInt64 +) ENGINE = Hybrid( + remote('localhost:9000', system.numbers), 0, + system.numbers, number BETWEEN 10 AND 15 +); +SELECT * FROM test_tiered_distributed_numbers_skip_first ORDER BY number; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_skip_first SYNC; + +SELECT 'Verify Hybrid skips segment with always false predicate on the second segment'; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_skip_second SYNC; +CREATE TABLE test_tiered_distributed_numbers_skip_second +( + `number` UInt64 +) ENGINE = Hybrid( + remote('localhost:9000', system.numbers), number < 3, + system.numbers, 0 +); +SELECT * FROM test_tiered_distributed_numbers_skip_second ORDER BY number; +DROP TABLE IF EXISTS test_tiered_distributed_numbers_skip_second SYNC; + +SELECT 'Hybrid raises when a segment is missing a column used by the base schema'; +DROP TABLE IF EXISTS test_hybrid_segment_full SYNC; +DROP TABLE IF EXISTS test_hybrid_segment_partial SYNC; +DROP TABLE IF EXISTS test_hybrid_missing_column SYNC; + +CREATE TABLE test_hybrid_segment_full +( + `id` UInt32, + `value` UInt32 +) +ENGINE = MergeTree() +ORDER BY id; + +CREATE TABLE test_hybrid_segment_partial +( + `id` UInt32 +) +ENGINE = MergeTree() +ORDER BY id; + +INSERT INTO test_hybrid_segment_full VALUES (1, 10), (2, 20); +INSERT INTO test_hybrid_segment_partial VALUES (3), (4); + +CREATE TABLE test_hybrid_missing_column ENGINE = Hybrid( + remote('localhost:9000', currentDatabase(), 'test_hybrid_segment_full'), id < 3, + remote('localhost:9000', currentDatabase(), 'test_hybrid_segment_partial'), id >= 3 +); -- { serverError BAD_ARGUMENTS } + +DROP TABLE IF EXISTS test_hybrid_missing_column SYNC; +DROP TABLE IF EXISTS test_hybrid_segment_partial SYNC; +DROP TABLE IF EXISTS test_hybrid_segment_full SYNC; + +----------------------------- + +SELECT 'Prepare local MergeTree table for multi-segment tests'; +DROP TABLE IF EXISTS test_tiered_local_data SYNC; +CREATE TABLE test_tiered_local_data +( + `id` UInt32, + `name` String, + `event_time` DateTime, + `value` Float64 +) ENGINE = MergeTree() +ORDER BY id; + +SELECT 'Populate local table with sample data'; +INSERT INTO test_tiered_local_data VALUES + (0, 'Invalid', '2022-01-01 10:00:00', 0.5), + (1, 'Alice', '2022-01-01 10:00:00', 100.5), + (2, 'Bob', '2022-01-02 11:00:00', 200.3), + (3, 'Charlie', '2022-01-03 12:00:00', 150.7), + (4, 'David', '2022-01-04 13:00:00', 300.2), + (5, 'Eve', '2022-01-05 14:00:00', 250.1); + +SELECT 'Create Hybrid table with three segment pairs'; +DROP TABLE IF EXISTS test_tiered_multi_segment SYNC; + +CREATE TABLE test_tiered_multi_segment +( + `id` UInt32, + `name` String, + `event_time` DateTime, + `value` Float64 +) +ENGINE = Hybrid( + remote('127.0.0.2:9000', currentDatabase(), 'test_tiered_local_data'), + id <= 2, + cluster('test_shard_localhost', currentDatabase(), 'test_tiered_local_data'), + id = 3, + remoteSecure('127.0.0.1:9440', currentDatabase(), 'test_tiered_local_data'), + id > 3 +); + +SELECT 'Count rows across all segments'; +SELECT count() FROM test_tiered_multi_segment; +SELECT 'Count rows from segments with id > 4'; +SELECT count() FROM test_tiered_multi_segment WHERE id > 4; +SELECT 'Count rows where value > 200'; +SELECT count() FROM test_tiered_multi_segment WHERE value > 200; +SELECT 'Count rows named Alice'; +SELECT count() AS alice_rows FROM test_tiered_multi_segment WHERE name = 'Alice'; + +SELECT 'Select 
rows ordered by value descending (id > 2)'; +SELECT id, name, value FROM test_tiered_multi_segment WHERE id > 2 ORDER BY value DESC; +SELECT 'Limit results ordered by id'; +SELECT * FROM test_tiered_multi_segment ORDER BY id LIMIT 3; +SELECT 'Explain plan for filter on value'; +EXPLAIN SELECT * FROM test_tiered_multi_segment WHERE value > 150 SETTINGS prefer_localhost_replica=0, enable_analyzer=0; +EXPLAIN SELECT * FROM test_tiered_multi_segment WHERE value > 150 SETTINGS prefer_localhost_replica=0, enable_analyzer=1; +EXPLAIN SELECT * FROM test_tiered_multi_segment WHERE value > 150 SETTINGS prefer_localhost_replica=1, enable_analyzer=0; +EXPLAIN SELECT * FROM test_tiered_multi_segment WHERE value > 150 SETTINGS prefer_localhost_replica=1, enable_analyzer=1; + +SELECT 'Aggregate values across name when filtering by event_time'; +SELECT + name, + count() AS count, + avg(value) AS avg_value +FROM test_tiered_multi_segment +WHERE event_time >= '2022-01-02' +GROUP BY name +ORDER BY avg_value DESC; + +SELECT 'Verify additional_table_filters works consistently (legacy analyser)'; +SELECT id, name, value +FROM test_tiered_multi_segment +WHERE id < 3 +ORDER BY id +SETTINGS additional_table_filters = {'test_tiered_multi_segment' : 'id > 1'}, allow_experimental_analyzer = 0; + +SELECT 'Verify additional_table_filters works consistently (new analyser)'; +SELECT id, name, value +FROM test_tiered_multi_segment +WHERE id < 3 +ORDER BY id +SETTINGS additional_table_filters = {'test_tiered_multi_segment' : 'id > 1'}, allow_experimental_analyzer = 1; + + +SELECT 'Clean up Hybrid table with three segment pairs'; +DROP TABLE IF EXISTS test_tiered_multi_segment SYNC; +SELECT 'Clean up local helper table'; +DROP TABLE IF EXISTS test_tiered_local_data SYNC; + +--------------------------------- + +-- Test Hybrid engine predicate filtering functionality + +SELECT 'Drop predicate filtering fixtures if they exist'; +DROP TABLE IF EXISTS test_tiered_watermark_after SYNC; +DROP TABLE IF EXISTS test_tiered_watermark_before SYNC; +DROP TABLE IF EXISTS test_tiered_watermark SYNC; + +SELECT 'Create local tables representing before/after watermark partitions'; +CREATE TABLE test_tiered_watermark_after +( + `id` UInt32, + `name` String, + `date` Date, + `value` UInt64 +) +ENGINE = MergeTree() +ORDER BY id; + +SELECT 'Create second local table with different value type'; +CREATE TABLE test_tiered_watermark_before +( + `id` Int32, + `name` Nullable(String), + `date` Date, + `value` Decimal128(0) +) +ENGINE = MergeTree() +ORDER BY id; + +SELECT 'Insert rows before watermark into both tables'; +INSERT INTO test_tiered_watermark_after VALUES + (11, 'Alice', '2025-08-15', 100), + (12, 'Bob', '2025-08-20', 200), + (13, 'Charlie', '2025-08-25', 300); +INSERT INTO test_tiered_watermark_before VALUES + (21, 'Alice', '2025-08-15', 100), + (22, 'Bob', '2025-08-20', 200), + (23, 'Charlie', '2025-08-25', 300); + +SELECT 'Insert rows after watermark into both tables'; +INSERT INTO test_tiered_watermark_after VALUES + (14, 'David', '2025-09-05', 400), + (15, 'Eve', '2025-09-10', 500), + (16, 'Frank', '2025-09-15', 600); +INSERT INTO test_tiered_watermark_before VALUES + (24, 'David', '2025-09-05', 400), + (25, 'Eve', '2025-09-10', 500), + (26, 'Frank', '2025-09-15', 600); + + +SELECT 'Create Hybrid table with analyzer disabled during reads'; +CREATE TABLE test_tiered_watermark +( + `id` UInt32, + `name` String, + `date` Date, + `value` UInt32 +) +ENGINE = Hybrid( + remote('127.0.0.1:9000', currentDatabase(), 
'test_tiered_watermark_after'), + date >= '2025-09-01', + remote('127.0.0.1:9000', currentDatabase(), 'test_tiered_watermark_before'), + date < '2025-09-01' +); + +SELECT 'Insert row via Hybrid table (should go to first segment)'; +INSERT INTO test_tiered_watermark SETTINGS distributed_foreground_insert = 1 +VALUES (17, 'John', '2025-09-25', 400); + +SELECT 'Verify that inserted row landed in first table'; +SELECT * FROM test_tiered_watermark_after WHERE id = 17 ORDER BY id; +SELECT 'Verify that second table did not receive the inserted row'; +SELECT count() FROM test_tiered_watermark_before WHERE id = 17; + + +SELECT 'Read predicate-filtered data with analyzer disabled and no localhost preference'; +SELECT * FROM test_tiered_watermark ORDER BY id SETTINGS enable_analyzer = 0, prefer_localhost_replica = 0; +SELECT 'Read predicate-filtered data with analyzer enabled and no localhost preference'; +SELECT * FROM test_tiered_watermark ORDER BY id SETTINGS enable_analyzer = 1, prefer_localhost_replica = 0; +SELECT 'Read predicate-filtered data with analyzer disabled and prefer localhost replica'; +SELECT * FROM test_tiered_watermark ORDER BY id SETTINGS enable_analyzer = 0, prefer_localhost_replica = 1; +SELECT 'Read predicate-filtered data with analyzer enabled and prefer localhost replica'; +SELECT * FROM test_tiered_watermark ORDER BY id SETTINGS enable_analyzer = 1, prefer_localhost_replica = 1; + +-- other combinations of settings work, but give a bit different content in the query_log +-- See the problem around is_initial_query described in https://github.com/Altinity/ClickHouse/issues/1077 +SELECT 'Check if the subqueries were recorded in query_log (hybrid_table_auto_cast_columns = 0)'; + +SELECT * FROM test_tiered_watermark ORDER BY id DESC SETTINGS enable_analyzer = 1, hybrid_table_auto_cast_columns = 0, prefer_localhost_replica = 0, log_queries=1, serialize_query_plan=0, log_comment = 'test_tiered_watermark1', max_threads=1 FORMAT Null; +SYSTEM FLUSH LOGS; +SELECT + type, + query_id = initial_query_id AS is_initial_query2, + arraySort(arrayMap(x -> replaceAll(x, currentDatabase(), 'db'), tables)) as tbl, + replaceAll(query, currentDatabase(), 'db') as qry, + log_comment +FROM system.query_log +WHERE + event_time > now() - 300 AND type = 'QueryFinish' AND + initial_query_id IN ( + SELECT initial_query_id + FROM system.query_log + WHERE + event_time > now() - 300 + and log_comment = 'test_tiered_watermark1' + and current_database = currentDatabase() + and query_id = initial_query_id ) +ORDER BY tbl, event_time_microseconds +FORMAT Vertical; + +SELECT 'Check if the subqueries were recorded in query_log (hybrid_table_auto_cast_columns = 1)'; + +SELECT * FROM test_tiered_watermark ORDER BY id DESC SETTINGS enable_analyzer = 1, hybrid_table_auto_cast_columns = 1, prefer_localhost_replica = 0, log_queries=1, serialize_query_plan=0, log_comment = 'test_tiered_watermark2', max_threads=1 FORMAT Null; +SYSTEM FLUSH LOGS; +SELECT + type, + query_id = initial_query_id AS is_initial_query2, + arraySort(arrayMap(x -> replaceAll(x, currentDatabase(), 'db'), tables)) as tbl, + replaceAll(query, currentDatabase(), 'db') as qry, + log_comment +FROM system.query_log +WHERE + event_time > now() - 300 AND type = 'QueryFinish' AND + initial_query_id IN ( + SELECT initial_query_id + FROM system.query_log + WHERE + event_time > now() - 300 + and log_comment = 'test_tiered_watermark2' + and current_database = currentDatabase() + and query_id = initial_query_id ) +ORDER BY tbl, event_time_microseconds +FORMAT 
Vertical; + + +SELECT 'Clean up predicate filtering tables'; +DROP TABLE IF EXISTS test_tiered_watermark SYNC; +DROP TABLE IF EXISTS test_tiered_watermark_after SYNC; +DROP TABLE IF EXISTS test_tiered_watermark_before SYNC; + +-- TODO: - addressed by 03644_hybrid_auto_cast.sql +-- Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Conversion from AggregateFunction(sum, Decimal(38, 0)) to AggregateFunction(sum, UInt32) is not supported: while converting source column `sum(__table1.value)` to destination column `sum(__table1.value)`. (CANNOT_CONVERT_TYPE) +-- SELECT sum(value) FROM test_tiered_watermark; + +-- TODO: +-- Code: 47. DB::Exception: Received from localhost:9000. DB::Exception: Received from 127.0.0.2:9000. DB::Exception: Identifier '__table1._database' cannot be resolved from table with name __table1. In scope SELECT __table1._database AS _database, __table1._table AS row_count FROM default.test_tiered_watermark_after AS __table1 WHERE __table1.date >= '2025-09-01'. Maybe you meant: ['__table1._table']. (UNKNOWN_IDENTIFIER) +-- SELECT _database, _table, count() AS row_count FROM test_tiered_watermark GROUP BY _database, _table ORDER BY _database, _table; + +-- Other things which may need attention: +-- complex combinations? (overview / over Merge) +-- prefer_localhost_replica +-- threads versus local subquery pipeline part +-- ALTER support + +-- TODO +-- SELECT _table_index, count() AS row_count FROM test_debug_tiered GROUP BY _table_index ORDER BY _table_index; + +-- TODO +-- 1. Integration tests (similar to tests/queries/0_stateless) +-- - Base SELECT with date split: part in Distributed, part in S3 -> results should match a manual UNION ALL (with correct ORDER BY/aggregation). +-- - GROUP BY / ORDER BY / LIMIT: confirm the stage is selected correctly, finalization happens at the top, rows_before_limit_at_least is correct (createLocalPlan already keeps LIMIT). +-- - JOIN: with a small table on the initiator; check GLOBAL JOIN scenarios. Ensure remote segments behave the same as remote shard subqueries created through createLocalPlan. +-- - skipUnusedShards: with analyzer ensure segment conditions are respected (where FILTER DAG is available). +-- - Constants: hostName()/now() in SELECT across several segments -> ensure no discrepancies. +-- - EXPLAIN PLAN/PIPELINE: show child plans for segments and remote plans. +-- - Subqueries in logs. +-- - Different column sets/types: supertype in snapshot, converting actions on read. +-- - Object columns: same as Distributed — use ColumnsDescriptionByShardNum for segments if needed (optional for local segments; already implemented for Distributed). + +-- Condition with dictGet('a1_watermarks_dict', ...) 
+ +-- access rights check + + +-- TODO: +-- test for distributed_aggregation_memory_efficient & enable_memory_bound_merging_of_aggregation_results +-- to avoid UNKNOWN_AGGREGATED_DATA_VARIANT when mixing different aggregation variants +-- from remote shards (with memory_bound) and local segments (without memory_bound) diff --git a/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.reference b/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.reference new file mode 100644 index 000000000000..1444d39d9578 --- /dev/null +++ b/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.reference @@ -0,0 +1,2 @@ +localhost 9000 0 0 0 +localhost 9000 0 0 0 diff --git a/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.sh b/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.sh new file mode 100755 index 000000000000..8f40316ac5d1 --- /dev/null +++ b/tests/queries/0_stateless/03643_system_drop_filesystem_cache_on_cluster.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-object-storage, no-random-settings + +# set -x + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +disk_name="${CLICKHOUSE_TEST_UNIQUE_NAME}" +$CLICKHOUSE_CLIENT -m --query """ +DROP TABLE IF EXISTS test; +CREATE TABLE test (a Int32, b String) +ENGINE = MergeTree() ORDER BY tuple() +SETTINGS disk = disk(name = '$disk_name', type = cache, max_size = '100Ki', path = ${CLICKHOUSE_TEST_UNIQUE_NAME}, disk = s3_disk); + +INSERT INTO test SELECT 1, 'test'; +""" + +$CLICKHOUSE_CLIENT --query """ +SYSTEM SYNC FILESYSTEM CACHE '$disk_name' ON CLUSTER 'test_shard_localhost'; +""" + +$CLICKHOUSE_CLIENT --query """ +SYSTEM DROP FILESYSTEM CACHE '$disk_name' ON CLUSTER 'test_shard_localhost'; +""" + +$CLICKHOUSE_CLIENT --query """ +DROP TABLE IF EXISTS test; +""" diff --git a/tests/queries/0_stateless/03644_hybrid_auto_cast.reference b/tests/queries/0_stateless/03644_hybrid_auto_cast.reference new file mode 100644 index 000000000000..869ac32216b6 --- /dev/null +++ b/tests/queries/0_stateless/03644_hybrid_auto_cast.reference @@ -0,0 +1,13 @@ +hybrid_table_auto_cast_columns = 0, enable_analyzer = 1 (headers mismatch) +hybrid_table_auto_cast_columns = 0, enable_analyzer = 0 (headers mismatch) +1 +hybrid_table_auto_cast_columns = 0, enable_analyzer = 1 manual cast +600 +1 +hybrid_table_auto_cast_columns = 0, enable_analyzer = 0 manual cast +600 +1 +hybrid_table_auto_cast_columns = 1, enable_analyzer = 1 +600 +1 +hybrid_table_auto_cast_columns = 1, enable_analyzer = 0 (analizer required) diff --git a/tests/queries/0_stateless/03644_hybrid_auto_cast.sql b/tests/queries/0_stateless/03644_hybrid_auto_cast.sql new file mode 100644 index 000000000000..7248dd9a55ef --- /dev/null +++ b/tests/queries/0_stateless/03644_hybrid_auto_cast.sql @@ -0,0 +1,85 @@ +SET allow_experimental_hybrid_table = 1, + prefer_localhost_replica = 0; + +DROP TABLE IF EXISTS test_tiered_watermark_after; +DROP TABLE IF EXISTS test_tiered_watermark_before; +DROP TABLE IF EXISTS test_tiered_watermark; + +CREATE TABLE test_tiered_watermark_after +( + `id` UInt32, + `name` String, + `date` Date, + `value` UInt64, + `categories` Array(UInt32) +) +ENGINE = MergeTree() +ORDER BY id; + +CREATE TABLE test_tiered_watermark_before +( + `id` Int32, + `name` Nullable(String), + `date` Date, + `value` Decimal128(0), + `categories` Array(Int64) +) +ENGINE = MergeTree() +ORDER BY id; + 
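+-- The two segment tables above deliberately disagree on column types (UInt32 vs Int32 id,
+-- String vs Nullable(String) name, UInt64 vs Decimal128(0) value, Array(UInt32) vs
+-- Array(Int64) categories). The queries below first reproduce the resulting header-mismatch
+-- errors with hybrid_table_auto_cast_columns = 0, then apply the manual ::UInt32 /
+-- ::Array(UInt32) cast workaround, and finally rely on hybrid_table_auto_cast_columns = 1,
+-- which (with the analyzer enabled) should add the equivalent casts automatically.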
+INSERT INTO test_tiered_watermark_after VALUES + (11, 'Alice', '2025-08-15', 100, [100, 10]), + (12, 'Bob', '2025-08-20', 200, [200, 20]), + (13, 'Charlie', '2025-08-25', 300, [300, 30]), + (14, 'David', '2025-09-05', 400, [400, 40]), + (15, 'Eve', '2025-09-10', 500, [500, 50]), + (16, 'Frank', '2025-09-15', 600, [600, 60]); + +INSERT INTO test_tiered_watermark_before VALUES + (21, 'Alice', '2025-08-15', 100, [100, 10]), + (22, 'Bob', '2025-08-20', 200, [200, 20]), + (23, 'Charlie', '2025-08-25', 300, [300, 30]), + (24, 'David', '2025-09-05', 400, [400, 40]), + (25, 'Eve', '2025-09-10', 500, [500, 50]), + (26, 'Frank', '2025-09-15', 600, [600, 60]); + +CREATE TABLE test_tiered_watermark +ENGINE = Hybrid( + remote('127.0.0.1:9000', currentDatabase(), 'test_tiered_watermark_after'), + date >= '2025-09-01', + remote('127.0.0.1:9000', currentDatabase(), 'test_tiered_watermark_before'), + date < '2025-09-01' +); + +-- the problem +SELECT 'hybrid_table_auto_cast_columns = 0, enable_analyzer = 1 (headers mismatch)'; +SET hybrid_table_auto_cast_columns = 0, enable_analyzer = 1; +SELECT max(value) FROM test_tiered_watermark; -- { serverError CANNOT_CONVERT_TYPE } +SELECT sum(if(arrayExists(x -> (x IN (10)), categories), 1, 0)) AS x FROM test_tiered_watermark; -- { serverError THERE_IS_NO_COLUMN } + +SELECT 'hybrid_table_auto_cast_columns = 0, enable_analyzer = 0 (headers mismatch)'; +SET hybrid_table_auto_cast_columns = 0, enable_analyzer = 0; +SELECT max(value) FROM test_tiered_watermark; -- { serverError CANNOT_CONVERT_TYPE } +SELECT sum(if(arrayExists(x -> (x IN (10)), categories), 1, 0)) AS x FROM test_tiered_watermark; -- works w/o analyzer + +-- workaround - explicit cast +SELECT 'hybrid_table_auto_cast_columns = 0, enable_analyzer = 1 manual cast'; +SET hybrid_table_auto_cast_columns = 0, enable_analyzer = 1; +SELECT max(value::UInt32) FROM test_tiered_watermark; +SELECT sum(if(arrayExists(x -> (x IN (10)), categories::Array(UInt32)), 1, 0)) AS x FROM test_tiered_watermark; + +SELECT 'hybrid_table_auto_cast_columns = 0, enable_analyzer = 0 manual cast'; +SET hybrid_table_auto_cast_columns = 0, enable_analyzer = 0; +SELECT max(value::UInt32) FROM test_tiered_watermark; +SELECT sum(if(arrayExists(x -> (x IN (10)), categories::Array(UInt32)), 1, 0)) AS x FROM test_tiered_watermark; + +-- feature to add casts automatically +SELECT 'hybrid_table_auto_cast_columns = 1, enable_analyzer = 1'; +SET hybrid_table_auto_cast_columns = 1, enable_analyzer = 1; +SELECT max(value) FROM test_tiered_watermark; +SELECT sum(if(arrayExists(x -> (x IN (10)), categories), 1, 0)) AS x FROM test_tiered_watermark; + +SELECT 'hybrid_table_auto_cast_columns = 1, enable_analyzer = 0 (analizer required)'; +SET hybrid_table_auto_cast_columns = 1, enable_analyzer = 0; +SELECT max(value) FROM test_tiered_watermark; -- { serverError CANNOT_CONVERT_TYPE } + diff --git a/tests/queries/0_stateless/03644_hybrid_unqualified_table.reference b/tests/queries/0_stateless/03644_hybrid_unqualified_table.reference new file mode 100644 index 000000000000..98bae7afae05 --- /dev/null +++ b/tests/queries/0_stateless/03644_hybrid_unqualified_table.reference @@ -0,0 +1,3 @@ +Hybrid allows unqualified local tables by default +3 +1 diff --git a/tests/queries/0_stateless/03644_hybrid_unqualified_table.sql b/tests/queries/0_stateless/03644_hybrid_unqualified_table.sql new file mode 100644 index 000000000000..672beea1afc3 --- /dev/null +++ b/tests/queries/0_stateless/03644_hybrid_unqualified_table.sql @@ -0,0 +1,33 @@ +SET 
allow_experimental_hybrid_table = 1; + +SELECT 'Hybrid allows unqualified local tables by default'; + +DROP TABLE IF EXISTS test_hybrid_unqualified_segment SYNC; +DROP TABLE IF EXISTS test_hybrid_unqualified SYNC; + +CREATE TABLE test_hybrid_unqualified_segment +( + `number` UInt64 +) +ENGINE = MergeTree() +ORDER BY tuple(); + +INSERT INTO test_hybrid_unqualified_segment VALUES (10), (20); + +CREATE TABLE test_hybrid_unqualified +( + `number` UInt64 +) +ENGINE = Hybrid( + remote('localhost:9000', system.numbers), number = 0, + test_hybrid_unqualified_segment, number >= 10 +); + +SELECT count() FROM test_hybrid_unqualified; + +SELECT positionCaseInsensitive(engine_full, concat(currentDatabase(), '.test_hybrid_unqualified_segment')) > 0 +FROM system.tables +WHERE database = currentDatabase() AND name = 'test_hybrid_unqualified'; + +DROP TABLE IF EXISTS test_hybrid_unqualified SYNC; +DROP TABLE IF EXISTS test_hybrid_unqualified_segment SYNC; diff --git a/tests/queries/0_stateless/data_minio/field_ids_complex_test/metadata/v1.metadata.json b/tests/queries/0_stateless/data_minio/field_ids_complex_test/metadata/v1.metadata.json index 8d367d20f041..a983881af8f0 100644 --- a/tests/queries/0_stateless/data_minio/field_ids_complex_test/metadata/v1.metadata.json +++ b/tests/queries/0_stateless/data_minio/field_ids_complex_test/metadata/v1.metadata.json @@ -1,7 +1,7 @@ { "format-version" : 2, "table-uuid" : "d4b695ca-ceeb-4537-8a2a-eee90dc6e313", - "location" : "s3a://test/field_ids_struct_test/metadata/field_ids_complex_test", + "location" : "s3a://test/field_ids_complex_test", "last-sequence-number" : 1, "last-updated-ms" : 1757661733693, "last-column-id" : 9, @@ -96,7 +96,7 @@ "total-position-deletes" : "0", "total-equality-deletes" : "0" }, - "manifest-list" : "s3a://test/field_ids_struct_test/metadata/field_ids_complex_test/metadata/snap-607752583403487091-1-140c8dff-1d83-4841-bc40-9aa85205b555.avro", + "manifest-list" : "s3a://test/field_ids_complex_test/metadata/snap-607752583403487091-1-140c8dff-1d83-4841-bc40-9aa85205b555.avro", "schema-id" : 0 } ], "statistics" : [ ], diff --git a/tests/queries/0_stateless/data_minio/field_ids_struct_test/metadata/v1.metadata.json b/tests/queries/0_stateless/data_minio/field_ids_struct_test/metadata/v1.metadata.json index 2d149abb44e7..d6c9079228ac 100644 --- a/tests/queries/0_stateless/data_minio/field_ids_struct_test/metadata/v1.metadata.json +++ b/tests/queries/0_stateless/data_minio/field_ids_struct_test/metadata/v1.metadata.json @@ -1,7 +1,7 @@ { "format-version" : 2, "table-uuid" : "149ecc15-7afc-4311-86b3-3a4c8d4ec08e", - "location" : "s3a://test/field_ids_struct_test/metadata/field_ids_struct_test", + "location" : "s3a://test/field_ids_struct_test", "last-sequence-number" : 1, "last-updated-ms" : 1753959190403, "last-column-id" : 6, @@ -84,7 +84,7 @@ "total-position-deletes" : "0", "total-equality-deletes" : "0" }, - "manifest-list" : "s3a://test/field_ids_struct_test/metadata/field_ids_struct_test/metadata/snap-2512638186869817292-1-ec467367-15a4-4610-8ea8-cf76797afb03.avro", + "manifest-list" : "s3a://test/field_ids_struct_test/metadata/snap-2512638186869817292-1-ec467367-15a4-4610-8ea8-cf76797afb03.avro", "schema-id" : 0 } ], "statistics" : [ ], diff --git a/tests/queries/0_stateless/data_minio/field_ids_table_test/metadata/v1.metadata.json b/tests/queries/0_stateless/data_minio/field_ids_table_test/metadata/v1.metadata.json index 32225eb618ad..1ddc3492cc82 100644 --- 
a/tests/queries/0_stateless/data_minio/field_ids_table_test/metadata/v1.metadata.json +++ b/tests/queries/0_stateless/data_minio/field_ids_table_test/metadata/v1.metadata.json @@ -1,7 +1,7 @@ { "format-version" : 2, "table-uuid" : "8f1f9ae2-18bb-421e-b640-ec2f85e67bce", - "location" : "s3a://test/field_ids_table_test/metadata/field_ids_table_test", + "location" : "s3a://test/field_ids_table_test", "last-sequence-number" : 1, "last-updated-ms" : 1752481476160, "last-column-id" : 1, @@ -56,7 +56,7 @@ "total-position-deletes" : "0", "total-equality-deletes" : "0" }, - "manifest-list" : "s3a://test/field_ids_table_test/metadata/field_ids_table_test/metadata/snap-2811410366534688344-1-3b002f99-b012-4041-9a97-db477fcc7115.avro", + "manifest-list" : "s3a://test/field_ids_table_test/metadata/snap-2811410366534688344-1-3b002f99-b012-4041-9a97-db477fcc7115.avro", "schema-id" : 0 } ], "statistics" : [ ],