diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index 3046e25626..1bec186ca8 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -551,23 +551,32 @@ def commit_table( if hive_table and current_table: # Table exists, update it. - new_parameters = _construct_parameters( + + # Note on table properties: + # - Iceberg table properties are stored in both HMS and Iceberg metadata JSON. + # - Updates are reflected in both locations + # - Existing HMS table properties (set by external systems like Hive/Spark) are preserved. + # + # While it is possible to modify HMS table properties through this API, it is not recommended: + # - Mixing HMS-specific properties in Iceberg metadata can cause confusion + # - New/updated HMS table properties will also be stored in Iceberg metadata (even though it is HMS-specific) + # - HMS-native properties (set outside Iceberg) cannot be deleted since they are not visible to Iceberg + # (However, if you first SET an HMS property via Iceberg, it becomes tracked in Iceberg metadata, + # and can then be deleted via Iceberg - which removes it from both Iceberg metadata and HMS) + new_iceberg_properties = _construct_parameters( metadata_location=updated_staged_table.metadata_location, previous_metadata_location=current_table.metadata_location, metadata_properties=updated_staged_table.properties, ) - # Detect properties that were removed from Iceberg metadata - removed_keys = current_table.properties.keys() - updated_staged_table.properties.keys() - - # Sync HMS parameters: Iceberg metadata is the source of truth, HMS parameters are - # a projection of Iceberg state plus any HMS-only properties. - # Start with existing HMS params, remove deleted Iceberg properties, then apply Iceberg values. - merged_params = dict(hive_table.parameters or {}) - for key in removed_keys: - merged_params.pop(key, None) - merged_params.update(new_parameters) - hive_table.parameters = merged_params + deleted_iceberg_properties = current_table.properties.keys() - updated_staged_table.properties.keys() + + # Merge: preserve HMS-native properties, remove deleted Iceberg properties, apply new Iceberg properties + existing_hms_parameters = dict(hive_table.parameters or {}) + for key in deleted_iceberg_properties: + existing_hms_parameters.pop(key, None) + existing_hms_parameters.update(new_iceberg_properties) + hive_table.parameters = existing_hms_parameters # Update hive's schema and properties hive_table.sd = _construct_hive_storage_descriptor( diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 8de1925eec..8811330a8a 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -286,6 +286,66 @@ def test_hive_critical_properties_always_from_iceberg(catalog: Catalog) -> None: assert new_metadata_location != original_metadata_location, "metadata_location should be updated!" +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive")]) +def test_hive_native_properties_cannot_be_deleted_via_iceberg(catalog: Catalog) -> None: + """Test that HMS-native properties (set outside Iceberg) cannot be deleted via Iceberg. + + HMS-native properties are not visible to Iceberg, so remove_properties fails with KeyError. + However, if you first SET an HMS property via Iceberg (making it tracked in Iceberg metadata), + it can then be deleted via Iceberg. + """ + table = create_table(catalog) + hive_client: _HiveClient = _HiveClient(catalog.properties["uri"]) + + # Set an HMS-native property directly (not through Iceberg) + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + hive_table.parameters["hms_native_prop"] = "native_value" + open_client.alter_table(TABLE_NAME[0], TABLE_NAME[1], hive_table) + + # Verify the HMS-native property exists in HMS + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("hms_native_prop") == "native_value" + + # Refresh the Iceberg table to get the latest state + table.refresh() + + # Verify the HMS-native property is NOT visible in Iceberg + assert "hms_native_prop" not in table.properties + + # Attempt to remove the HMS-native property via Iceberg - this should fail + # because the property is not tracked in Iceberg metadata (not visible to Iceberg) + with pytest.raises(KeyError): + table.transaction().remove_properties("hms_native_prop").commit_transaction() + + # HMS-native property should still exist (cannot be deleted via Iceberg) + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("hms_native_prop") == "native_value", ( + "HMS-native property should still exist since Iceberg removal failed!" + ) + + # Now SET the same property via Iceberg (this makes it tracked in Iceberg metadata) + table.transaction().set_properties({"hms_native_prop": "iceberg_value"}).commit_transaction() + + # Verify it's updated in both places + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("hms_native_prop") == "iceberg_value" + + # Now we CAN delete it via Iceberg (because it's now tracked in Iceberg metadata) + table.transaction().remove_properties("hms_native_prop").commit_transaction() + + # Property should be deleted from HMS + with hive_client as open_client: + hive_table = open_client.get_table(*TABLE_NAME) + assert hive_table.parameters.get("hms_native_prop") is None, ( + "Property should be deletable after being SET via Iceberg (making it tracked)!" + ) + + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_table_properties_dict(catalog: Catalog) -> None: