From 6a2ea461aa1eae6a6a9fc7448c894b8ac119e963 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 20 Oct 2025 22:10:54 +0000
Subject: [PATCH 1/2] Merge pull request #87020 from ianton-ru/iceberg_table_name_encode

Fix table name encoding in data lake rest catalog
---
 src/Databases/DataLake/RestCatalog.cpp         | 10 ++++++--
 .../integration/test_database_iceberg/test.py | 24 +++++++++++++++++++
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/Databases/DataLake/RestCatalog.cpp b/src/Databases/DataLake/RestCatalog.cpp
index 8b0d643f3ec1..dd71315fa6f9 100644
--- a/src/Databases/DataLake/RestCatalog.cpp
+++ b/src/Databases/DataLake/RestCatalog.cpp
@@ -263,7 +263,8 @@ DB::ReadWriteBufferFromHTTPPtr RestCatalog::createReadBuffer(
 {
     const auto & context = getContext();
 
-    Poco::URI url(base_url / endpoint);
+    /// enable_url_encoding=false to allow use tables with encoded sequences in names like 'foo%2Fbar'
+    Poco::URI url(base_url / endpoint, /* enable_url_encoding */ false);
     if (!params.empty())
         url.setQueryParameters(params);
 
@@ -496,7 +497,12 @@ DB::Names RestCatalog::parseTables(DB::ReadBuffer & buf, const std::string & bas
     for (size_t i = 0; i < identifiers_object->size(); ++i)
     {
         const auto current_table_json = identifiers_object->get(static_cast(i)).extract();
-        const auto table_name = current_table_json->get("name").extract();
+        /// If table has encoded sequence (like 'foo%2Fbar')
+        /// catalog returns decoded character instead of sequence ('foo/bar')
+        /// Here name encoded back to 'foo%2Fbar' format
+        const auto table_name_raw = current_table_json->get("name").extract();
+        std::string table_name;
+        Poco::URI::encode(table_name_raw, "/", table_name);
         tables.push_back(base_namespace + "." + table_name);
 
         if (limit && tables.size() >= limit)
diff --git a/tests/integration/test_database_iceberg/test.py b/tests/integration/test_database_iceberg/test.py
index 373f98cedc1c..c602f97ae07e 100644
--- a/tests/integration/test_database_iceberg/test.py
+++ b/tests/integration/test_database_iceberg/test.py
@@ -384,3 +384,27 @@ def record(key):
 
     assert 'aaa\naaa\naaa' == node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{namespace}.{table_name}`").strip()
     assert 'bbb\nbbb\nbbb' == node.query(f"SELECT symbol FROM {CATALOG_NAME}.`{namespace}.{table_name_2}`").strip()
+
+
+def test_table_with_slash(started_cluster):
+    node = started_cluster.instances["node1"]
+
+    # pyiceberg at current moment (version 0.9.1) has a bug with table names with slashes
+    # see https://github.com/apache/iceberg-python/issues/2462
+    # so we need to encode it manually
+    table_raw_suffix = "table/foo"
+    table_encoded_suffix = "table%2Ffoo"
+
+    test_ref = f"test_list_tables_{uuid.uuid4()}"
+    table_name = f"{test_ref}_{table_raw_suffix}"
+    table_encoded_name = f"{test_ref}_{table_encoded_suffix}"
+    root_namespace = f"{test_ref}_namespace"
+
+    catalog = load_catalog_impl(started_cluster)
+    catalog.create_namespace(root_namespace)
+
+    create_table(catalog, root_namespace, table_name, DEFAULT_SCHEMA, PartitionSpec(), DEFAULT_SORT_ORDER)
+
+    create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)
+    node.query(f"INSERT INTO {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}` VALUES (NULL, 'AAPL', 193.24, 193.31, tuple('bot'));", settings={"allow_experimental_insert_into_iceberg": 1, 'write_full_path_in_iceberg_metadata': 1})
+    assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n"
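The change in RestCatalog::parseTables above re-encodes the slash that the REST catalog returns in decoded form, so a table stored as 'foo%2Fbar' is listed under that name again instead of as 'foo/bar'. Below is a minimal sketch of the same round trip in Python, using urllib.parse purely for illustration (the server-side code uses Poco::URI::encode, and the names here are hypothetical):

from urllib.parse import quote, unquote

# Hypothetical table name as it comes back in catalog listing responses (decoded).
raw_name = "table/foo"

# Re-encode the slash, mirroring Poco::URI::encode(table_name_raw, "/", table_name):
# this is the identifier form ClickHouse expects, e.g. `namespace.table%2Ffoo`.
encoded_name = quote(raw_name, safe="")
assert encoded_name == "table%2Ffoo"

# The catalog hands the name back decoded, which is why parseTables must encode it again.
assert unquote(encoded_name) == raw_name
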
From f3f830d092da94ff88dfc0ae3cf53ab07b298f7a Mon Sep 17 00:00:00 2001
From: Anton Ivashkin
Date: Thu, 6 Nov 2025 13:38:41 +0100
Subject: [PATCH 2/2] Fix test, write with pyiceberg

---
 tests/integration/test_database_iceberg/test.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_database_iceberg/test.py b/tests/integration/test_database_iceberg/test.py
index c602f97ae07e..4d492e62572b 100644
--- a/tests/integration/test_database_iceberg/test.py
+++ b/tests/integration/test_database_iceberg/test.py
@@ -404,7 +404,18 @@ def test_table_with_slash(started_cluster):
     catalog.create_namespace(root_namespace)
 
     create_table(catalog, root_namespace, table_name, DEFAULT_SCHEMA, PartitionSpec(), DEFAULT_SORT_ORDER)
+    table = catalog.load_table(f"{root_namespace}.{table_encoded_name}")
+    data = [
+        {
+            "datetime": datetime.strptime("2025-01-01 12:00:00", "%Y-%m-%d %H:%M:%S"),
+            "symbol": "AAPL",
+            "bid": 193.24,
+            "ask": 193.31,
+            "details": {"created_by": "bot"},
+        }
+    ]
+    df = pa.Table.from_pylist(data)
+    table.append(df)
 
     create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)
-    node.query(f"INSERT INTO {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}` VALUES (NULL, 'AAPL', 193.24, 193.31, tuple('bot'));", settings={"allow_experimental_insert_into_iceberg": 1, 'write_full_path_in_iceberg_metadata': 1})
-    assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "\\N\tAAPL\t193.24\t193.31\t('bot')\n"
+    assert node.query(f"SELECT * FROM {CATALOG_NAME}.`{root_namespace}.{table_encoded_name}`") == "2025-01-01 12:00:00.000000\tAAPL\t193.24\t193.31\t('bot')\n"
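The second patch replaces the experimental ClickHouse-side INSERT with a write performed directly through pyiceberg, so the test no longer depends on allow_experimental_insert_into_iceberg. A standalone sketch of that write path follows, with a hypothetical catalog URI, credentials, and table identifier (the real test obtains the catalog via load_catalog_impl and uses the encoded table name created earlier):

from datetime import datetime

import pyarrow as pa
from pyiceberg.catalog import load_catalog

# Hypothetical REST catalog endpoint and object storage settings.
catalog = load_catalog(
    "demo",
    **{
        "uri": "http://localhost:8181",
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "minio",
        "s3.secret-access-key": "minio123",
    },
)

# Load an existing table by its identifier (hypothetical names).
table = catalog.load_table("my_namespace.my_table")

# Build a one-row Arrow table matching the Iceberg schema and append it,
# the same pattern the updated test uses instead of a ClickHouse INSERT.
rows = [
    {
        "datetime": datetime(2025, 1, 1, 12, 0, 0),
        "symbol": "AAPL",
        "bid": 193.24,
        "ask": 193.31,
        "details": {"created_by": "bot"},
    }
]
table.append(pa.Table.from_pylist(rows))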