From 91695c09caf4dfab6ad376e0e5f416b1c6a5e164 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Sun, 25 Jan 2026 12:22:15 -0500
Subject: [PATCH 1/4] remove E501 from ruff ignore

---
 ruff.toml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ruff.toml b/ruff.toml
index 1ce052288e..8033da29d0 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -57,19 +57,19 @@ select = [
     "I",  # isort
     "UP", # pyupgrade
 ]
-ignore = [
-    "E501"
-]
+ignore = []
 
 # Allow autofix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
 unfixable = []
 
-per-file-ignores = {}
-
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
+[lint.per-file-ignores]
+"tests/integration/test_inspect_table.py" = ["E501"]
+"tests/io/test_pyarrow.py" = ["E501"]
+
 [lint.pyupgrade]
 # Preserve types, even if a file imports `from __future__ import annotations`.
 keep-runtime-typing = true

From 92ff522c4fc87fa3da4fa0b1dcf8dd857f9ebcfd Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Sun, 25 Jan 2026 12:22:48 -0500
Subject: [PATCH 2/4] ignore E501 in tests

---
 tests/catalog/test_hive.py                  |  2 +-
 tests/catalog/test_rest.py                  | 24 +++++++++----------
 tests/cli/test_console.py                   |  4 ++--
 tests/conftest.py                           | 14 +++++------
 tests/expressions/test_evaluator.py         |  2 +-
 tests/expressions/test_visitors.py          | 15 ++++++------
 tests/integration/test_add_files.py         |  6 ++---
 tests/integration/test_inspect_table.py     |  2 ++
 tests/integration/test_partitioning_key.py  | 12 +++++-----
 tests/integration/test_reads.py             |  2 +-
 .../test_writes/test_partitioned_writes.py  |  6 ++---
 tests/io/test_fsspec.py                     | 14 +++++------
 tests/io/test_pyarrow.py                    | 20 +++++++++-------
 tests/io/test_pyarrow_visitor.py            |  8 +++----
 tests/table/test_name_mapping.py            |  4 ++--
 tests/table/test_partitioning.py            |  4 ++--
 tests/table/test_refs.py                    |  2 +-
 tests/table/test_snapshots.py               | 20 ++++++++--------
 tests/table/test_sorting.py                 |  6 ++---
 tests/table/test_upsert.py                  |  2 +-
 tests/test_conversions.py                   |  3 ++-
 tests/test_schema.py                        | 14 +++++------
 tests/test_types.py                         |  4 ++--
 tests/utils/test_manifest.py                |  6 ++---
 24 files changed, 101 insertions(+), 95 deletions(-)

diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py
index 1a3978a045..dcfc812378 100644
--- a/tests/catalog/test_hive.py
+++ b/tests/catalog/test_hive.py
@@ -970,7 +970,7 @@ def test_rename_table_to_namespace_does_not_exists() -> None:
 
     catalog._client = MagicMock()
     catalog._client.__enter__().alter_table_with_environment_context.side_effect = InvalidOperationException(
-        message="Unable to change partition or table. Database default does not exist Check metastore logs for detailed stack.does_not_exists"
+        message="Unable to change partition or table. Database default does not exist Check metastore logs for detailed stack.does_not_exists"  # noqa: E501
     )
 
     with pytest.raises(NoSuchNamespaceError) as exc_info:
diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py
index 71341c8b39..763ff46e03 100644
--- a/tests/catalog/test_rest.py
+++ b/tests/catalog/test_rest.py
@@ -156,7 +156,7 @@ def test_no_uri_supplied() -> None:
 
 
 @pytest.mark.filterwarnings(
-    "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning"
+    "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_200(rest_mock: Mocker) -> None: rest_mock.post( @@ -179,7 +179,7 @@ def test_token_200(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: rest_mock.post( @@ -198,7 +198,7 @@ def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -223,7 +223,7 @@ def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -246,7 +246,7 @@ def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_with_default_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -267,7 +267,7 @@ def test_token_with_default_scope(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_with_custom_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -289,7 +289,7 @@ def test_token_with_custom_scope(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_200_w_oauth2_server_uri(rest_mock: Mocker) -> None: rest_mock.post( @@ -314,7 +314,7 @@ def test_token_200_w_oauth2_server_uri(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_config_200(requests_mock: Mocker) -> None: requests_mock.get( @@ -402,7 +402,7 @@ def test_config_sets_headers(requests_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_400(rest_mock: Mocker) -> None: rest_mock.post( @@ -418,7 +418,7 @@ def test_token_400(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_token_401(rest_mock: Mocker) -> None: message = "invalid_client" @@ -664,7 +664,7 @@ def test_list_namespace_with_parent_404(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) @pytest.mark.parametrize("status_code", [401, 419]) def test_list_namespaces_token_expired_success_on_retries(rest_mock: Mocker, status_code: int) -> None: @@ -2017,7 +2017,7 @@ def test_custom_namespace_separator(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 ) def test_auth_header(rest_mock: Mocker) -> None: mock_request = rest_mock.post( diff --git a/tests/cli/test_console.py b/tests/cli/test_console.py index a713975ec9..a1a84201ef 100644 --- a/tests/cli/test_console.py +++ b/tests/cli/test_console.py @@ -629,7 +629,7 @@ def test_json_schema(catalog: InMemoryCatalog) -> None: assert result.exit_code == 0 assert ( result.output - == """{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}\n""" + == """{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}\n""" # noqa: E501 ) @@ -821,7 +821,7 @@ def test_json_properties_get_table_specific_property_that_doesnt_exist(catalog: assert result.exit_code == 1 assert ( result.output - == """{"type": "NoSuchPropertyException", "message": "Could not find property doesnotexist on table default.my_table"}\n""" + == """{"type": "NoSuchPropertyException", "message": "Could not find property doesnotexist on table default.my_table"}\n""" # noqa: E501 ) diff --git a/tests/conftest.py b/tests/conftest.py index 9d80f48972..928690791a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1188,7 +1188,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "status": 1, "snapshot_id": 8744736658442914487, "data_file": { - "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", + "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", # noqa: E501 "file_format": "PARQUET", "partition": {"VendorID": 1, "tpep_pickup_day": 1925}, "record_count": 19513, @@ -1308,7 +1308,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "status": 1, "snapshot_id": 8744736658442914487, "data_file": { - "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", + "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", # noqa: E501 "file_format": "PARQUET", "partition": {"VendorID": 1, "tpep_pickup_datetime": None}, "record_count": 95050, @@ -2146,7 +2146,7 @@ def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FsspecFileIO azurite_url = request.config.getoption("--adls.endpoint") azurite_account_name = request.config.getoption("--adls.account-name") azurite_account_key = request.config.getoption("--adls.account-key") - azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" + azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" # noqa: E501 properties = { "adls.connection-string": azurite_connection_string, "adls.account-name": azurite_account_name, @@ -2183,7 +2183,7 @@ def pyarrow_fileio_adls(request: pytest.FixtureRequest) -> Generator[Any, None, 
azurite_account_name = request.config.getoption("--adls.account-name") azurite_account_key = request.config.getoption("--adls.account-key") - azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" + azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" # noqa: E501 properties = { ADLS_ACCOUNT_NAME: azurite_account_name, ADLS_ACCOUNT_KEY: azurite_account_key, @@ -2623,7 +2623,7 @@ def spark() -> "SparkSession": # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow - # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], + # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], # noqa: E501 "binary": [b"\01", None, b"\22"], "fixed": [ uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, @@ -2676,7 +2676,7 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table": "long": [1, None, 9], "float": [0.0, None, 0.9], "double": [0.0, None, 0.9], - # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields + # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields # noqa: E501 "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], "timestamptz": [ datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), @@ -2687,7 +2687,7 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table": # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow - # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], + # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], # noqa: E501 "binary": [b"\01", None, b"\22"], "fixed": [ uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 5be3e92be8..09e0d49670 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -1427,7 +1427,7 @@ def test_strict_integer_in(strict_data_file_schema: Schema, strict_data_file_1: def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None: - # should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24})).eval(strict_data_file_1) + # should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24})).eval(strict_data_file_1) # noqa: E501 # assert should_read, "Should match: all values != 5 and != 6" should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE})).eval( diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py index 798e9f641e..b52ecd70cc 100644 --- a/tests/expressions/test_visitors.py +++ b/tests/expressions/test_visitors.py @@ -92,10 +92,10 @@ class ExampleVisitor(BooleanExpressionVisitor[list[str]]): - """A test implementation of a 
BooleanExpressionVisitor + """A test implementation of a BooleanExpressionVisitor. - As this visitor visits each node, it appends an element to a `visit_history` list. This enables testing that a given expression is - visited in an expected order by the `visit` method. + As this visitor visits each node, it appends an element to a `visit_history` list. + This enables testing that a given expression is visited in an expected order by the `visit` method. """ def __init__(self) -> None: @@ -131,9 +131,10 @@ def visit_bound_predicate(self, predicate: BoundPredicate) -> list[str]: class FooBoundBooleanExpressionVisitor(BoundBooleanExpressionVisitor[list[str]]): - """A test implementation of a BoundBooleanExpressionVisitor - As this visitor visits each node, it appends an element to a `visit_history` list. This enables testing that a given bound expression is - visited in an expected order by the `visit` method. + """A test implementation of a BoundBooleanExpressionVisitor. + + As this visitor visits each node, it appends an element to a `visit_history` list. + This enables testing that a given bound expression is visited in an expected order by the `visit` method. """ def __init__(self) -> None: @@ -260,7 +261,7 @@ def test_bind_visitor_already_bound(table_schema_simple: Schema) -> None: with pytest.raises(TypeError) as exc_info: visit(bound, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) assert ( - "Found already bound predicate: BoundEqualTo(term=BoundReference(field=NestedField(field_id=1, name='foo', field_type=StringType(), required=False), accessor=Accessor(position=0,inner=None)), literal=literal('hello'))" + "Found already bound predicate: BoundEqualTo(term=BoundReference(field=NestedField(field_id=1, name='foo', field_type=StringType(), required=False), accessor=Accessor(position=0,inner=None)), literal=literal('hello'))" # noqa: E501 == str(exc_info.value) ) diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 86ef05e5f4..13f66bf1bb 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -470,7 +470,7 @@ def test_add_files_to_bucket_partitioned_table_fails(spark: SparkSession, sessio with pytest.raises(ValueError) as exc_info: tbl.add_files(file_paths=file_paths) assert ( - "Cannot infer partition value from parquet metadata for a non-linear Partition Field: baz_bucket_3 with transform bucket[3]" + "Cannot infer partition value from parquet metadata for a non-linear Partition Field: baz_bucket_3 with transform bucket[3]" # noqa: E501 in str(exc_info.value) ) @@ -518,7 +518,7 @@ def test_add_files_to_partitioned_table_fails_with_lower_and_upper_mismatch( with pytest.raises(ValueError) as exc_info: tbl.add_files(file_paths=file_paths) assert ( - "Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: baz. lower_value=123, upper_value=124" + "Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: baz. lower_value=123, upper_value=124" # noqa: E501 in str(exc_info.value) ) @@ -754,7 +754,7 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v exception_cause = exc_info.value.__cause__ assert isinstance(exception_cause, TypeError) assert ( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." 
+ "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 in exception_cause.args[0] ) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index ea0cca9bc5..bfffa34956 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -1096,11 +1096,13 @@ def test_inspect_files_format_version_3(spark: SparkSession, session_catalog: Ca }, ) + # fmt: off insert_data_sql = f"""INSERT INTO {identifier} VALUES (false, 'a', 'aaaaaaaaaaaaaaaaaaaaaa', 1, 1, 0.0, 0.0, TIMESTAMP('2023-01-01 19:25:00'), TIMESTAMP('2023-01-01 19:25:00+00:00'), DATE('2023-01-01'), X'01', X'00000000000000000000000000000000'), (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), (true, 'z', 'zzzzzzzzzzzzzzzzzzzzzz', 9, 9, 0.9, 0.9, TIMESTAMP('2023-03-01 19:25:00'), TIMESTAMP('2023-03-01 19:25:00+00:00'), DATE('2023-03-01'), X'12', X'11111111111111111111111111111111'); """ + # fmt: on spark.sql(insert_data_sql) spark.sql(insert_data_sql) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 0419fcf23a..82158002e1 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -77,7 +77,7 @@ @pytest.mark.parametrize( - "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", + "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", # noqa: E501 [ # Identity Transform ( @@ -161,7 +161,7 @@ [3.14], Record(3.14), "float_field=3.14", - # spark writes differently as pyiceberg, Record[float_field=3.140000104904175], path:float_field=3.14 (Record has difference) + # spark writes differently as pyiceberg, Record[float_field=3.140000104904175], path:float_field=3.14 (Record has difference) # noqa: E501 # so justification (compare expected value with spark behavior) would fail. None, None, @@ -184,7 +184,7 @@ [6.282], Record(6.282), "double_field=6.282", - # spark writes differently as pyiceberg, Record[double_field=6.2820000648498535] path:double_field=6.282 (Record has difference) + # spark writes differently as pyiceberg, Record[double_field=6.2820000648498535] path:double_field=6.282 (Record has difference) # noqa: E501 # so justification (compare expected value with spark behavior) would fail. None, None, @@ -247,7 +247,7 @@ "timestamp_field=2023-01-01T12%3A00%3A00", # Spark writes differently as pyiceberg, so justification (compare expected value with spark behavior) would fail # AssertionError: assert 'timestamp_field=2023-01-01T12%3A00%3A00' in 's3://warehouse/default/test_table/data/timestamp_field=2023-01-01T12%3A00/00000-5-f9dca69a-9fb7-4830-9ef6-62d3d7afc09e-00001.parquet' - # TLDR: CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ) becomes 2023-01-01T12:00 in the hive partition path when spark writes it (without the seconds). + # TLDR: CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ) becomes 2023-01-01T12:00 in the hive partition path when spark writes it (without the seconds). 
# noqa: E501 None, None, # f"""CREATE TABLE {identifier} ( @@ -271,7 +271,7 @@ "timestamptz_field=2023-01-01T09%3A00%3A01.000999%2B00%3A00", # Spark writes differently as pyiceberg, so justification (compare expected value with spark behavior) would fail # AssertionError: assert 'timestamptz_field=2023-01-01T09%3A00%3A01.000999%2B00%3A00' in 's3://warehouse/default/test_table/data/timestamptz_field=2023-01-01T09%3A00%3A01.000999Z/00000-5-b710fc4d-66b6-47f1-b8ae-6208f8aaa2d4-00001.parquet' - # TLDR: CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP) becomes 2023-01-01T09:00:01.000999Z in the hive partition path when spark writes it (while iceberg: timestamptz_field=2023-01-01T09:00:01.000999+00:00). + # TLDR: CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP) becomes 2023-01-01T09:00:01.000999Z in the hive partition path when spark writes it (while iceberg: timestamptz_field=2023-01-01T09:00:01.000999+00:00). # noqa: E501 None, None, # f"""CREATE TABLE {identifier} ( @@ -285,7 +285,7 @@ # """, # f"""INSERT INTO {identifier} # VALUES - # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') + # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # noqa: E501 # """ ), ( diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 8de1925eec..250c58bccd 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -1247,7 +1247,7 @@ def test_scan_source_field_missing_in_spec(catalog: Catalog, spark: SparkSession spark.sql(f"DROP TABLE IF EXISTS {identifier}") spark.sql(f"CREATE TABLE {identifier} (foo int, bar int, jaz string) USING ICEBERG PARTITIONED BY (foo, bar)") spark.sql( - f"INSERT INTO {identifier} (foo, bar, jaz) VALUES (1, 1, 'dummy data'), (1, 2, 'dummy data again'), (2, 1, 'another partition')" + f"INSERT INTO {identifier} (foo, bar, jaz) VALUES (1, 1, 'dummy data'), (1, 2, 'dummy data again'), (2, 1, 'another partition')" # noqa: E501 ) spark.sql(f"ALTER TABLE {identifier} DROP PARTITION FIELD foo") spark.sql(f"ALTER TABLE {identifier} DROP COLUMN foo") diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index d194669bd3..ba64434d5e 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -372,7 +372,7 @@ def test_dynamic_partition_overwrite_non_identity_transform( ) with pytest.raises( ValueError, - match="For now dynamic overwrite does not support a table with non-identity-transform field in the latest partition spec: *", + match="For now dynamic overwrite does not support a table with non-identity-transform field in the latest partition spec: *", # noqa: E501 ): tbl.dynamic_partition_overwrite(arrow_table_with_null.slice(0, 1)) @@ -591,7 +591,7 @@ def test_data_files_with_table_partitioned_with_null( # the first snapshot generates M3 with 6 delete data entries collected from M1 and M2. # ML3 = [M3] # - # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delete entries it does not lint to it. + # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delete entries it does not lint to it. # noqa: E501 # ML4 = [M4] # Append : Append generates M5 with new data entries and links to all previous manifests which is M4 . 
@@ -603,7 +603,7 @@ def test_data_files_with_table_partitioned_with_null( # then it generates M7 as remaining existing entries from M1 and M8 from M2 # ML6 = [M6, M7, M8] # - # The second snapshot generates M9 with 3 appended data entries and it also looks at manifests in ML6 (previous manifests) + # The second snapshot generates M9 with 3 appended data entries and it also looks at manifests in ML6 (previous manifests) # noqa: E501 # it ignores M6 since it only has delete entries but it links to M7 and M8. # ML7 = [M9, M7, M8] diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py index 9e4a1da52e..900fc6193f 100644 --- a/tests/io/test_fsspec.py +++ b/tests/io/test_fsspec.py @@ -812,7 +812,7 @@ def test_s3v4_rest_signer(requests_mock: Mocker) -> None: "uri": new_uri, "headers": { "Authorization": [ - "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" # noqa: E501 ], "Host": ["bucket.s3.us-west-2.amazonaws.com"], "User-Agent": ["Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0"], @@ -849,11 +849,11 @@ def test_s3v4_rest_signer(requests_mock: Mocker) -> None: assert request.url == new_uri assert dict(request.headers) == { - "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", + "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", # noqa: E501 "Host": "bucket.s3.us-west-2.amazonaws.com", "User-Agent": "Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0", "X-Amz-Date": "20221017T102940Z", - "X-Amz-Security-Token": "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", + "X-Amz-Security-Token": 
"YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", # noqa: E501 "x-amz-content-sha256": "UNSIGNED-PAYLOAD", "X-Custom-Header": "value", } @@ -868,7 +868,7 @@ def test_s3v4_rest_signer_endpoint(requests_mock: Mocker) -> None: "uri": new_uri, "headers": { "Authorization": [ - "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" # noqa: E501 ], "Host": ["bucket.s3.us-west-2.amazonaws.com"], "User-Agent": ["Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0"], @@ -904,11 +904,11 @@ def test_s3v4_rest_signer_endpoint(requests_mock: Mocker) -> None: assert request.url == new_uri assert dict(request.headers) == { - "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", + "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", # noqa: E501 "Host": "bucket.s3.us-west-2.amazonaws.com", "User-Agent": "Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0", "X-Amz-Date": "20221017T102940Z", - "X-Amz-Security-Token": "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", + "X-Amz-Security-Token": 
"YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", # noqa: E501 "x-amz-content-sha256": "UNSIGNED-PAYLOAD", } @@ -946,7 +946,7 @@ def test_s3v4_rest_signer_forbidden(requests_mock: Mocker) -> None: signer(request) assert ( - """Failed to sign request 401: {'method': 'HEAD', 'region': 'us-west-2', 'uri': 'https://bucket/metadata/snap-8048355899640248710-1-a5c8ea2d-aa1f-48e8-89f4-1fa69db8c742.avro', 'headers': {'User-Agent': ['Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0']}}""" + """Failed to sign request 401: {'method': 'HEAD', 'region': 'us-west-2', 'uri': 'https://bucket/metadata/snap-8048355899640248710-1-a5c8ea2d-aa1f-48e8-89f4-1fa69db8c742.avro', 'headers': {'User-Agent': ['Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0']}}""" # noqa: E501 in str(exc_info.value) ) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index f1ed109d14..1de23ddc28 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -1195,7 +1195,7 @@ def test_projection_concat_files(schema_int: Schema, file_int: str) -> None: def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCatalog) -> None: - # Test by adding a non-partitioned data file to a partitioned table, verifying partition value projection from manifest metadata. + # Test by adding a non-partitioned data file to a partitioned table, verifying partition value projection from manifest metadata. # noqa: E501 # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs. # (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875) @@ -1264,7 +1264,7 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa def test_identity_transform_columns_projection(tmp_path: str, catalog: InMemoryCatalog) -> None: - # Test by adding a non-partitioned data file to a multi-partitioned table, verifying partition value projection from manifest metadata. + # Test by adding a non-partitioned data file to a multi-partitioned table, verifying partition value projection from manifest metadata. # noqa: E501 # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs. 
# (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875) schema = Schema( @@ -1542,6 +1542,7 @@ def test_projection_maps_of_structs(schema_map_of_structs: Schema, file_map_of_s strict=True, ): assert actual.as_py() == expected + # fmt: off assert ( repr(result_table.schema) == """locations: map> @@ -1552,6 +1553,7 @@ def test_projection_maps_of_structs(schema_map_of_structs: Schema, file_map_of_s child 1, longitude: double not null child 2, altitude: double""" ) + # fmt: on def test_projection_nested_struct_different_parent_id(file_struct: str) -> None: @@ -2753,7 +2755,7 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception( def test_pyarrow_file_io_fs_by_scheme_cache() -> None: - # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` + # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` # noqa: E501 # Refer to: https://github.com/apache/arrow/issues/43713 pyarrow_file_io = PyArrowFileIO() @@ -2787,7 +2789,7 @@ def test_pyarrow_file_io_fs_by_scheme_cache() -> None: def test_pyarrow_io_new_input_multi_region(caplog: Any) -> None: - # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` + # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` # noqa: E501 # Refer to: https://github.com/apache/arrow/issues/43713 user_provided_region = "ap-southeast-1" bucket_regions = [ @@ -2998,7 +3000,7 @@ def test_iceberg_read_orc(tmp_path: Path) -> None: partition_specs=[PartitionSpec()], properties={ "write.format.default": "parquet", # This doesn't matter for reading - "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}]', # Add name mapping for ORC files without field IDs + "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}]', # Add name mapping for ORC files without field IDs # noqa: E501 }, ) io = PyArrowFileIO() @@ -3098,7 +3100,7 @@ def test_orc_row_filtering_predicate_pushdown(tmp_path: Path) -> None: schemas=[schema], partition_specs=[PartitionSpec()], properties={ - "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}, {"field-id": 3, "names": ["age"]}, {"field-id": 4, "names": ["active"]}]', + "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}, {"field-id": 3, "names": ["age"]}, {"field-id": 4, "names": ["active"]}]', # noqa: E501 }, ) io = PyArrowFileIO() @@ -4378,7 +4380,7 @@ def test_orc_stripe_size_batch_size_compression_interaction(tmp_path: Path) -> N # Assert batching behavior assert len(batches) > 0, f"Should have at least one batch for {description}" assert len(batches) == actual_stripes, ( - f"Number of batches should match number of stripes for {description}: {len(batches)} batches vs {actual_stripes} stripes" + f"Number of batches should match number of stripes for {description}: {len(batches)} batches vs {actual_stripes} stripes" # noqa: E501 ) # Assert data integrity @@ -4448,7 +4450,7 @@ def test_orc_near_perfect_stripe_size_mapping(tmp_path: Path) -> None: { "id": pa.array( [ - f"very_long_string_value_{i:06d}_with_lots_of_padding_to_make_it_harder_to_compress_{i * 
7919 % 100000:05d}_more_padding_{i * 7919 % 100000:05d}" + f"very_long_string_value_{i:06d}_with_lots_of_padding_to_make_it_harder_to_compress_{i * 7919 % 100000:05d}_more_padding_{i * 7919 % 100000:05d}" # noqa: E501 for i in range(1, 50001) ] ) # 50K rows @@ -4523,7 +4525,7 @@ def test_orc_near_perfect_stripe_size_mapping(tmp_path: Path) -> None: # Assert batching behavior assert len(batches) > 0, f"Should have at least one batch for stripe_size={stripe_size}" assert len(batches) == actual_stripes, ( - f"Number of batches should match number of stripes for stripe_size={stripe_size}: {len(batches)} batches vs {actual_stripes} stripes" + f"Number of batches should match number of stripes for stripe_size={stripe_size}: {len(batches)} batches vs {actual_stripes} stripes" # noqa: E501 ) # Assert data integrity diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 59a4857699..5f40f7f374 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -190,7 +190,7 @@ def test_pyarrow_timestamp_invalid_units() -> None: with pytest.raises( TypeError, match=re.escape( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 ), ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) @@ -212,7 +212,7 @@ def test_pyarrow_timestamp_tz_invalid_units() -> None: with pytest.raises( TypeError, match=re.escape( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 ), ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) @@ -832,8 +832,8 @@ def test_expression_to_complementary_pyarrow( Not(bound_is_null_double_field), ) result = _expression_to_complementary_pyarrow(bound_expr) - # Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr. + # Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr. 
# noqa: E501 assert ( repr(result) - == """ 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>""" + == """ 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>""" # noqa: E501 ) diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index c567f3ffb4..ab60537c9c 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ -194,14 +194,14 @@ def test_json_mapped_field_no_field_id_serialization() -> None: assert ( table_name_mapping_nested_no_field_id.model_dump_json() - == """[{"names":["foo"],"field-id":1},{"names":["bar"]},{"names":["qux"],"field-id":2,"fields":[{"names":["element"]}]}]""" + == """[{"names":["foo"],"field-id":1},{"names":["bar"]},{"names":["qux"],"field-id":2,"fields":[{"names":["element"]}]}]""" # noqa: E501 ) def test_json_serialization(table_name_mapping_nested: NameMapping) -> None: assert ( table_name_mapping_nested.model_dump_json() - == """[{"names":["foo"],"field-id":1},{"names":["bar"],"field-id":2},{"names":["baz"],"field-id":3},{"names":["qux"],"field-id":4,"fields":[{"names":["element"],"field-id":5}]},{"names":["quux"],"field-id":6,"fields":[{"names":["key"],"field-id":7},{"names":["value"],"field-id":8,"fields":[{"names":["key"],"field-id":9},{"names":["value"],"field-id":10}]}]},{"names":["location"],"field-id":11,"fields":[{"names":["element"],"field-id":12,"fields":[{"names":["latitude"],"field-id":13},{"names":["longitude"],"field-id":14}]}]},{"names":["person"],"field-id":15,"fields":[{"names":["name"],"field-id":16},{"names":["age"],"field-id":17}]}]""" + == """[{"names":["foo"],"field-id":1},{"names":["bar"],"field-id":2},{"names":["baz"],"field-id":3},{"names":["qux"],"field-id":4,"fields":[{"names":["element"],"field-id":5}]},{"names":["quux"],"field-id":6,"fields":[{"names":["key"],"field-id":7},{"names":["value"],"field-id":8,"fields":[{"names":["key"],"field-id":9},{"names":["value"],"field-id":10}]}]},{"names":["location"],"field-id":11,"fields":[{"names":["element"],"field-id":12,"fields":[{"names":["latitude"],"field-id":13},{"names":["longitude"],"field-id":14}]}]},{"names":["person"],"field-id":15,"fields":[{"names":["name"],"field-id":16},{"names":["age"],"field-id":17}]}]""" # noqa: E501 ) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 576297c6f2..7b95d2e9ac 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -122,7 +122,7 @@ def test_serialize_partition_spec() -> None: ) assert ( partitioned.model_dump_json() - == """{"spec-id":3,"fields":[{"source-id":1,"field-id":1000,"transform":"truncate[19]","name":"str_truncate"},{"source-id":2,"field-id":1001,"transform":"bucket[25]","name":"int_bucket"}]}""" + == """{"spec-id":3,"fields":[{"source-id":1,"field-id":1000,"transform":"truncate[19]","name":"str_truncate"},{"source-id":2,"field-id":1001,"transform":"bucket[25]","name":"int_bucket"}]}""" # noqa: E501 ) @@ -134,7 +134,7 @@ def test_deserialize_unpartition_spec() -> None: def test_deserialize_partition_spec() -> None: - json_partition_spec = """{"spec-id": 3, "fields": [{"source-id": 1, "field-id": 1000, 
"transform": "truncate[19]", "name": "str_truncate"}, {"source-id": 2, "field-id": 1001, "transform": "bucket[25]", "name": "int_bucket"}]}""" + json_partition_spec = """{"spec-id": 3, "fields": [{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}, {"source-id": 2, "field-id": 1001, "transform": "bucket[25]", "name": "int_bucket"}]}""" # noqa: E501 spec = PartitionSpec.model_validate_json(json_partition_spec) diff --git a/tests/table/test_refs.py b/tests/table/test_refs.py index e6b7006a99..63a9e34ec8 100644 --- a/tests/table/test_refs.py +++ b/tests/table/test_refs.py @@ -33,7 +33,7 @@ def test_snapshot_with_properties_repr() -> None: assert ( repr(snapshot_ref) - == """SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)""" + == """SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)""" # noqa: E501 ) assert snapshot_ref == eval(repr(snapshot_ref)) diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index 4aa9521b78..d450f78379 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -81,7 +81,7 @@ def test_serialize_summary_with_properties() -> None: def test_serialize_snapshot(snapshot: Snapshot) -> None: assert ( snapshot.model_dump_json() - == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" + == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" # noqa: E501 ) @@ -96,14 +96,14 @@ def test_serialize_snapshot_without_sequence_number() -> None: schema_id=3, ) actual = snapshot.model_dump_json() - expected = """{"snapshot-id":25,"parent-snapshot-id":19,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" + expected = """{"snapshot-id":25,"parent-snapshot-id":19,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" # noqa: E501 assert actual == expected def test_serialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None: assert ( snapshot_with_properties.model_dump_json() - == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" + == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" # noqa: E501 ) @@ -119,20 +119,20 @@ def test_deserialize_summary_with_properties() -> None: def test_deserialize_snapshot(snapshot: Snapshot) -> None: - payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {"operation": "append"}, "schema-id": 3}""" + payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {"operation": "append"}, "schema-id": 3}""" # noqa: E501 actual = Snapshot.model_validate_json(payload) assert actual == snapshot 
def test_deserialize_snapshot_without_operation(snapshot: Snapshot) -> None: - payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {}, "schema-id": 3}""" + payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {}, "schema-id": 3}""" # noqa: E501 with pytest.warns(UserWarning, match="Encountered invalid snapshot summary: operation is missing, defaulting to overwrite"): actual = Snapshot.model_validate_json(payload) assert actual.summary.operation == Operation.OVERWRITE def test_deserialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None: - payload = """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" + payload = """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" # noqa: E501 snapshot = Snapshot.model_validate_json(payload) assert snapshot == snapshot_with_properties @@ -140,7 +140,7 @@ def test_deserialize_snapshot_with_properties(snapshot_with_properties: Snapshot def test_snapshot_repr(snapshot: Snapshot) -> None: assert ( repr(snapshot) - == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND), schema_id=3)""" + == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND), schema_id=3)""" # noqa: E501 ) assert snapshot == eval(repr(snapshot)) @@ -148,7 +148,7 @@ def test_snapshot_repr(snapshot: Snapshot) -> None: def test_snapshot_with_properties_repr(snapshot_with_properties: Snapshot) -> None: assert ( repr(snapshot_with_properties) - == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)""" + == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)""" # noqa: E501 ) assert snapshot_with_properties == eval(repr(snapshot_with_properties)) @@ -226,7 +226,7 @@ def test_snapshot_summary_collector_with_partition() -> None: "deleted-records": "300", "changed-partition-count": "2", "partition-summaries-included": "true", - "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", + "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", # noqa: E501 "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } @@ -262,7 +262,7 @@ def test_snapshot_summary_collector_with_partition_limit_in_constructor() -> Non "deleted-records": "300", "changed-partition-count": "2", "partition-summaries-included": "true", - "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", + 
"partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", # noqa: E501 "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } diff --git a/tests/table/test_sorting.py b/tests/table/test_sorting.py index cb7a2c187a..b9855380a2 100644 --- a/tests/table/test_sorting.py +++ b/tests/table/test_sorting.py @@ -53,12 +53,12 @@ def test_serialize_sort_order_unsorted() -> None: def test_serialize_sort_order(sort_order: SortOrder) -> None: - expected = '{"order-id":22,"fields":[{"source-id":19,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":25,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"},{"source-id":22,"transform":"void","direction":"asc","null-order":"nulls-first"}]}' + expected = '{"order-id":22,"fields":[{"source-id":19,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":25,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"},{"source-id":22,"transform":"void","direction":"asc","null-order":"nulls-first"}]}' # noqa: E501 assert sort_order.model_dump_json() == expected def test_deserialize_sort_order(sort_order: SortOrder) -> None: - payload = '{"order-id": 22, "fields": [{"source-id": 19, "transform": "identity", "direction": "asc", "null-order": "nulls-first"}, {"source-id": 25, "transform": "bucket[4]", "direction": "desc", "null-order": "nulls-last"}, {"source-id": 22, "transform": "void", "direction": "asc", "null-order": "nulls-first"}]}' + payload = '{"order-id": 22, "fields": [{"source-id": 19, "transform": "identity", "direction": "asc", "null-order": "nulls-first"}, {"source-id": 25, "transform": "bucket[4]", "direction": "desc", "null-order": "nulls-last"}, {"source-id": 22, "transform": "void", "direction": "asc", "null-order": "nulls-first"}]}' # noqa: E501 assert SortOrder.model_validate_json(payload) == sort_order @@ -90,7 +90,7 @@ def test_sorting_to_string(sort_order: SortOrder) -> None: def test_sorting_to_repr(sort_order: SortOrder) -> None: - expected = """SortOrder(SortField(source_id=19, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), SortField(source_id=25, transform=BucketTransform(num_buckets=4), direction=SortDirection.DESC, null_order=NullOrder.NULLS_LAST), SortField(source_id=22, transform=VoidTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), order_id=22)""" + expected = """SortOrder(SortField(source_id=19, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), SortField(source_id=25, transform=BucketTransform(num_buckets=4), direction=SortDirection.DESC, null_order=NullOrder.NULLS_LAST), SortField(source_id=22, transform=VoidTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), order_id=22)""" # noqa: E501 assert repr(sort_order) == expected diff --git a/tests/table/test_upsert.py b/tests/table/test_upsert.py index 9bc61799e4..9ff5b9d9f4 100644 --- a/tests/table/test_upsert.py +++ b/tests/table/test_upsert.py @@ -120,7 +120,7 @@ def assert_upsert_result(res: UpsertResult, expected_updated: int, expected_inse @pytest.mark.parametrize( - "join_cols, src_start_row, src_end_row, target_start_row, target_end_row, when_matched_update_all, when_not_matched_insert_all, expected_updated, expected_inserted", + "join_cols, src_start_row, src_end_row, target_start_row, target_end_row, when_matched_update_all, 
when_not_matched_insert_all, expected_updated, expected_inserted", # noqa: E501 [ (["order_id"], 1, 2, 2, 3, True, True, 1, 1), # single row (["order_id"], 5001, 15000, 1, 10000, True, True, 5000, 5000), # 10k rows diff --git a/tests/test_conversions.py b/tests/test_conversions.py index b366551cf2..e38bdbd6f2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -66,7 +66,8 @@ - Stored directly - 'Z' -> 90 Decimal: - - Stored as unscaled values in the form of two's-complement big-endian binary using the minimum number of bytes for the values + - Stored as unscaled values in the form of two's-complement big-endian binary + using the minimum number of bytes for the values - 345 is 0...1|01011001 in binary - 00000001 -> 1, 01011001 -> 89 Float: diff --git a/tests/test_schema.py b/tests/test_schema.py index 0c006879ea..d63d498e8d 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -95,7 +95,7 @@ def test_schema_str(table_schema_simple: Schema) -> None: def test_schema_repr_single_field() -> None: """Test schema representation""" actual = repr(Schema(NestedField(field_id=1, name="foo", field_type=StringType()), schema_id=1)) - expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), schema_id=1, identifier_field_ids=[])" + expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), schema_id=1, identifier_field_ids=[])" # noqa: E501 assert expected == actual @@ -108,7 +108,7 @@ def test_schema_repr_two_fields() -> None: schema_id=1, ) ) - expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), NestedField(field_id=2, name='bar', field_type=IntegerType(), required=False), schema_id=1, identifier_field_ids=[])" + expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), NestedField(field_id=2, name='bar', field_type=IntegerType(), required=False), schema_id=1, identifier_field_ids=[])" # noqa: E501 assert expected == actual @@ -428,13 +428,13 @@ def __getitem__(self, pos: int) -> Any: def test_serialize_schema(table_schema_with_full_nested_fields: Schema) -> None: actual = table_schema_with_full_nested_fields.model_dump_json() - expected = """{"type":"struct","fields":[{"id":1,"name":"foo","type":"string","required":false,"doc":"foo doc","initial-default":"foo initial","write-default":"foo write"},{"id":2,"name":"bar","type":"int","required":true,"doc":"bar doc","initial-default":42,"write-default":43},{"id":3,"name":"baz","type":"boolean","required":false,"doc":"baz doc","initial-default":true,"write-default":false}],"schema-id":1,"identifier-field-ids":[2]}""" + expected = """{"type":"struct","fields":[{"id":1,"name":"foo","type":"string","required":false,"doc":"foo doc","initial-default":"foo initial","write-default":"foo write"},{"id":2,"name":"bar","type":"int","required":true,"doc":"bar doc","initial-default":42,"write-default":43},{"id":3,"name":"baz","type":"boolean","required":false,"doc":"baz doc","initial-default":true,"write-default":false}],"schema-id":1,"identifier-field-ids":[2]}""" # noqa: E501 assert actual == expected def test_deserialize_schema(table_schema_with_full_nested_fields: Schema) -> None: actual = Schema.model_validate_json( - """{"type": "struct", "fields": [{"id": 1, "name": "foo", "type": "string", "required": false, "doc": "foo doc", "initial-default": "foo initial", "write-default": "foo write"}, {"id": 2, "name": "bar", "type": "int", "required": true, "doc": "bar doc", 
"initial-default": 42, "write-default": 43}, {"id": 3, "name": "baz", "type": "boolean", "required": false, "doc": "baz doc", "initial-default": true, "write-default": false}], "schema-id": 1, "identifier-field-ids": [2]}""" + """{"type": "struct", "fields": [{"id": 1, "name": "foo", "type": "string", "required": false, "doc": "foo doc", "initial-default": "foo initial", "write-default": "foo write"}, {"id": 2, "name": "bar", "type": "int", "required": true, "doc": "bar doc", "initial-default": 42, "write-default": 43}, {"id": 3, "name": "baz", "type": "boolean", "required": false, "doc": "baz doc", "initial-default": true, "write-default": false}], "schema-id": 1, "identifier-field-ids": [2]}""" # noqa: E501 ) expected = table_schema_with_full_nested_fields assert actual == expected @@ -885,21 +885,21 @@ def test_identifier_fields_fails(table_schema_nested_with_struct_key_map: Schema with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[23]) assert ( - f"Cannot add field zip as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('location')}" + f"Cannot add field zip as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('location')}" # noqa: E501 in str(exc_info.value) ) with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[26]) assert ( - f"Cannot add field x as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('points')}" + f"Cannot add field x as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('points')}" # noqa: E501 in str(exc_info.value) ) with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[17]) assert ( - f"Cannot add field age as an identifier field: must not be nested in an optional field {table_schema_nested_with_struct_key_map.find_field('person')}" + f"Cannot add field age as an identifier field: must not be nested in an optional field {table_schema_nested_with_struct_key_map.find_field('person')}" # noqa: E501 in str(exc_info.value) ) diff --git a/tests/test_types.py b/tests/test_types.py index 707deb160e..7911ba6494 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -535,7 +535,7 @@ def test_repr_nested_field_default_nones_should_not_appear() -> None: ) assert ( repr(NestedField(1, "required_field", StringType(), required=False, initial_default="hello", write_default="bye")) - == "NestedField(field_id=1, name='required_field', field_type=StringType(), required=False, initial_default='hello', write_default='bye')" + == "NestedField(field_id=1, name='required_field', field_type=StringType(), required=False, initial_default='hello', write_default='bye')" # noqa: E501 ) @@ -635,7 +635,7 @@ def test_str_struct(simple_struct: StructType) -> None: def test_repr_struct(simple_struct: StructType) -> None: assert ( repr(simple_struct) - == "StructType(fields=(NestedField(field_id=1, name='required_field', field_type=StringType(), required=True), NestedField(field_id=2, name='optional_field', field_type=IntegerType(), required=False),))" + == "StructType(fields=(NestedField(field_id=1, name='required_field', field_type=StringType(), required=True), NestedField(field_id=2, name='optional_field', field_type=IntegerType(), required=False),))" # noqa: E501 ) diff 
--git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index d12019c9e2..7ba8a7e084 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -81,7 +81,7 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: assert data_file.content == DataFileContent.DATA assert ( data_file.file_path - == "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" + == "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" # noqa: E501 ) assert data_file.file_format == FileFormat.PARQUET assert repr(data_file.partition) == "Record[1, 1925]" @@ -396,7 +396,7 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""", + "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""", # noqa: E501 "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } @@ -418,7 +418,7 @@ def test_write_manifest( assert data_file.content == DataFileContent.DATA assert ( data_file.file_path - == "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" + == "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" # noqa: E501 ) assert data_file.file_format == FileFormat.PARQUET assert data_file.partition == Record(1, 1925) From 408c53c07888cc1624cd95e0cf0431a09c0257b6 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 25 Jan 2026 12:23:54 -0500 Subject: [PATCH 3/4] fix E501 in source --- pyiceberg/catalog/glue.py | 3 ++- pyiceberg/catalog/rest/auth.py | 6 +++-- pyiceberg/catalog/sql.py | 3 ++- pyiceberg/conversions.py | 5 +++-- pyiceberg/expressions/__init__.py | 12 ++++++---- pyiceberg/expressions/visitors.py | 3 ++- pyiceberg/io/__init__.py | 3 ++- pyiceberg/io/pyarrow.py | 33 +++++++++++++++++++--------- pyiceberg/manifest.py | 3 ++- pyiceberg/schema.py | 27 +++++++++++++++++------ pyiceberg/table/__init__.py | 20 +++++++++++------ pyiceberg/table/locations.py | 3 ++- pyiceberg/table/metadata.py | 3 ++- pyiceberg/table/snapshots.py | 6 +++-- pyiceberg/table/sorting.py | 3 ++- pyiceberg/table/update/__init__.py | 24 +++++++++++++------- pyiceberg/table/update/snapshot.py | 3 ++- pyiceberg/utils/schema_conversion.py | 6 +++-- 18 files changed, 113 insertions(+), 53 deletions(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 7260b29447..e55257af66 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -406,7 +406,8 @@ def _update_glue_table(self, database_name: str, table_name: str, table_input: " raise NoSuchTableError(f"Table does not exist: {database_name}.{table_name} (Glue table version {version_id})") from e except self.glue.exceptions.ConcurrentModificationException as e: raise CommitFailedException( - f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update to table version {version_id}" + f"Cannot commit {database_name}.{table_name} because Glue detected concurrent update " + f"to table version {version_id}" ) from e def _get_glue_table(self, 
database_name: str, table_name: str) -> "TableTypeDef": diff --git a/pyiceberg/catalog/rest/auth.py b/pyiceberg/catalog/rest/auth.py index 4f67ab9039..72235d8760 100644 --- a/pyiceberg/catalog/rest/auth.py +++ b/pyiceberg/catalog/rest/auth.py @@ -181,7 +181,8 @@ def _refresh_token(self) -> None: expires_in = result.get("expires_in", self.expires_in) if expires_in is None: raise ValueError( - "The expiration time of the Token must be provided by the Server in the Access Token Response in `expires_in` field, or by the PyIceberg Client." + "The expiration time of the Token must be provided by the Server in the Access Token Response " + "in `expires_in` field, or by the PyIceberg Client." ) self._expires_at = time.monotonic() + expires_in - self.refresh_margin @@ -249,8 +250,9 @@ def auth_header(self) -> str: class AuthManagerAdapter(AuthBase): - """A `requests.auth.AuthBase` adapter that integrates an `AuthManager` into a `requests.Session` to automatically attach the appropriate Authorization header to every request. + """A `requests.auth.AuthBase` adapter for integrating an `AuthManager` into a `requests.Session`. + This adapter automatically attaches the appropriate Authorization header to every request. This adapter is useful when working with `requests.Session.auth` and allows reuse of authentication strategies defined by `AuthManager`. This AuthManagerAdapter is only intended to be used against the REST Catalog diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 2b6fa74517..a2704c3d5d 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -109,7 +109,8 @@ class SqlCatalog(MetastoreCatalog): And you can have as many levels as you want, but you need at least one. The `SqlCatalog` honors the same convention. In the `JDBCCatalog` implementation, a `TableIdentifier` is composed of an optional `Namespace` and a table name. - When a `Namespace` is present, the full name will be `'ns1.ns2.ns3.table'`. A valid `TableIdentifier` could be `'name'` (no namespace). + When a `Namespace` is present, the full name will be `'ns1.ns2.ns3.table'`. + A valid `TableIdentifier` could be `'name'` (no namespace). The `SqlCatalog` has a different convention where a `TableIdentifier` requires a `Namespace`. """ diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 8739a1ab08..d599b55513 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -188,8 +188,9 @@ def to_bytes( ) -> bytes: """Convert a built-in python value to bytes. - This conversion follows the serialization scheme for storing single values as individual binary values defined in the Iceberg specification that - can be found at https://iceberg.apache.org/spec/#appendix-d-single-value-serialization + This conversion follows the serialization scheme for storing single values as individual binary values + defined in the Iceberg specification that can be found at + https://iceberg.apache.org/spec/#appendix-d-single-value-serialization Args: primitive_type (PrimitiveType): An implementation of the PrimitiveType base class. 
diff --git a/pyiceberg/expressions/__init__.py b/pyiceberg/expressions/__init__.py index 14ee9328c1..3910a146c7 100644 --- a/pyiceberg/expressions/__init__.py +++ b/pyiceberg/expressions/__init__.py @@ -690,12 +690,14 @@ def bind(self, schema: Schema, case_sensitive: bool = True) -> BoundSetPredicate def __str__(self) -> str: """Return the string representation of the SetPredicate class.""" # Sort to make it deterministic - return f"{str(self.__class__.__name__)}({str(self.term)}, {{{', '.join(sorted([str(literal) for literal in self.literals]))}}})" + literals_str = ", ".join(sorted([str(literal) for literal in self.literals])) + return f"{str(self.__class__.__name__)}({str(self.term)}, {{{literals_str}}})" def __repr__(self) -> str: """Return the string representation of the SetPredicate class.""" # Sort to make it deterministic - return f"{str(self.__class__.__name__)}({repr(self.term)}, {{{', '.join(sorted([repr(literal) for literal in self.literals]))}}})" + literals_repr = ", ".join(sorted([repr(literal) for literal in self.literals])) + return f"{str(self.__class__.__name__)}({repr(self.term)}, {{{literals_repr}}})" def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the SetPredicate class.""" @@ -725,12 +727,14 @@ def value_set(self) -> set[Any]: def __str__(self) -> str: """Return the string representation of the BoundSetPredicate class.""" # Sort to make it deterministic - return f"{str(self.__class__.__name__)}({str(self.term)}, {{{', '.join(sorted([str(literal) for literal in self.literals]))}}})" + literals_str = ", ".join(sorted([str(literal) for literal in self.literals])) + return f"{str(self.__class__.__name__)}({str(self.term)}, {{{literals_str}}})" def __repr__(self) -> str: """Return the string representation of the BoundSetPredicate class.""" # Sort to make it deterministic - return f"{str(self.__class__.__name__)}({repr(self.term)}, {{{', '.join(sorted([repr(literal) for literal in self.literals]))}}})" + literals_repr = ", ".join(sorted([repr(literal) for literal in self.literals])) + return f"{str(self.__class__.__name__)}({repr(self.term)}, {{{literals_repr}}})" def __eq__(self, other: Any) -> bool: """Return the equality of two instances of the BoundSetPredicate class.""" diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index e4ab3befa3..0beb0f3df0 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -139,7 +139,8 @@ def visit(obj: BooleanExpression, visitor: BooleanExpressionVisitor[T]) -> T: Args: obj (BooleanExpression): An instance of a BooleanExpression. - visitor (BooleanExpressionVisitor[T]): An instance of an implementation of the generic BooleanExpressionVisitor base class. + visitor (BooleanExpressionVisitor[T]): An instance of an implementation of the generic + BooleanExpressionVisitor base class. Raises: NotImplementedError: If attempting to visit an unsupported expression. diff --git a/pyiceberg/io/__init__.py b/pyiceberg/io/__init__.py index 87d155a0fd..85bd402d60 100644 --- a/pyiceberg/io/__init__.py +++ b/pyiceberg/io/__init__.py @@ -368,5 +368,6 @@ def load_file_io(properties: Properties = EMPTY_DICT, location: str | None = Non return PyArrowFileIO(properties) except ModuleNotFoundError as e: raise ModuleNotFoundError( - 'Could not load a FileIO, please consider installing one: pip3 install "pyiceberg[pyarrow]", for more options refer to the docs.' 
+ "Could not load a FileIO, please consider installing one: " + 'pip3 install "pyiceberg[pyarrow]", for more options refer to the docs.' ) from e diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 55ecc7ac93..0fea901f43 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -248,7 +248,9 @@ def open_output_stream(self, path: str, *args: Any, **kwargs: Any) -> pyarrow.Na class PyArrowFile(InputFile, OutputFile): - """A combined InputFile and OutputFile implementation that uses a pyarrow filesystem to generate pyarrow.lib.NativeFile instances. + """A combined InputFile and OutputFile implementation using pyarrow filesystem. + + This class generates pyarrow.lib.NativeFile instances. Args: location (str): A URI or a path to a local file. @@ -645,8 +647,9 @@ def delete(self, location: str | InputFile | OutputFile) -> None: """Delete the file at the given location. Args: - location (Union[str, InputFile, OutputFile]): The URI to the file--if an InputFile instance or an OutputFile instance is provided, - the location attribute for that instance is used as the location to delete. + location (Union[str, InputFile, OutputFile]): The URI to the file--if an InputFile instance or + an OutputFile instance is provided, the location attribute for that instance is used as + the location to delete. Raises: FileNotFoundError: When the file at the provided location does not exist. @@ -1014,7 +1017,10 @@ def collect( self, expr: BooleanExpression, ) -> None: - """Collect the bound references categorized by having at least one is_null or is_not_null in the expr and the remaining.""" + """Collect bound references categorized by null predicates. + + Categorizes by having at least one is_null or is_not_null in the expr and the remaining. + """ boolean_expression_visit(expr, self) @@ -1035,7 +1041,8 @@ def expression_to_pyarrow(expr: BooleanExpression, schema: Schema | None = None) def _expression_to_complementary_pyarrow(expr: BooleanExpression, schema: Schema | None = None) -> pc.Expression: """Complementary filter conversion function of expression_to_pyarrow. - Could not use expression_to_pyarrow(Not(expr)) to achieve this complementary effect because ~ in pyarrow.compute.Expression does not handle null. + Could not use expression_to_pyarrow(Not(expr)) to achieve this complementary effect because + ~ in pyarrow.compute.Expression does not handle null. """ collector = _NullNaNUnmentionedTermsCollector() collector.collect(expr) @@ -1417,7 +1424,9 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: return TimestampNanoType() else: raise TypeError( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write.", + "Iceberg does not yet support 'ns' timestamp precision. " + "Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically " + "downcast 'ns' to 'us' on write.", ) else: raise TypeError(f"Unsupported precision for timestamp type: {primitive.unit}") @@ -1580,7 +1589,8 @@ def _task_to_record_batches( fragment = arrow_format.make_fragment(fin) physical_schema = fragment.physical_schema - # For V1 and V2, we only support Timestamp 'us' in Iceberg Schema, therefore it is reasonable to always cast 'ns' timestamp to 'us' on read. + # For V1 and V2, we only support Timestamp 'us' in Iceberg Schema, + # therefore it is reasonable to always cast 'ns' timestamp to 'us' on read. 
# For V3 this has to set explicitly to avoid nanosecond timestamp to be down-casted by default downcast_ns_timestamp_to_us = ( downcast_ns_timestamp_to_us if downcast_ns_timestamp_to_us is not None else format_version <= 2 @@ -2441,7 +2451,8 @@ def _partition_value(self, partition_field: PartitionField, schema: Schema) -> A if not iceberg_transform.preserves_order: raise ValueError( - f"Cannot infer partition value from parquet metadata for a non-linear Partition Field: {partition_field.name} with transform {partition_field.transform}" + f"Cannot infer partition value from parquet metadata for a non-linear Partition Field: " + f"{partition_field.name} with transform {partition_field.transform}" ) transform_func = iceberg_transform.transform(source_field.field_type) @@ -2462,7 +2473,8 @@ def _partition_value(self, partition_field: PartitionField, schema: Schema) -> A ) if lower_value != upper_value: raise ValueError( - f"Cannot infer partition value from parquet metadata as there are more than one partition values for Partition Field: {partition_field.name}. {lower_value=}, {upper_value=}" + f"Cannot infer partition value from parquet metadata as there are more than one partition values " + f"for Partition Field: {partition_field.name}. {lower_value=}, {upper_value=}" ) return lower_value @@ -2729,7 +2741,8 @@ def _check_pyarrow_schema_compatible( ) additional_names = set(provided_schema._name_to_id.keys()) - set(requested_schema._name_to_id.keys()) raise ValueError( - f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)." + f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. " + "Update the schema first (hint, use union_by_name)." ) from e _check_schema_compatible(requested_schema, provided_schema) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 0afa16666e..bd83075bdf 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -1319,7 +1319,8 @@ def prepare_manifest(self, manifest_file: ManifestFile) -> ManifestFile: # To validate this, check that the snapshot id matches the current commit if self._commit_snapshot_id != wrapped_manifest_file.added_snapshot_id: raise ValueError( - f"Found unassigned sequence number for a manifest from snapshot: {self._commit_snapshot_id} != {wrapped_manifest_file.added_snapshot_id}" + f"Found unassigned sequence number for a manifest from snapshot: " + f"{self._commit_snapshot_id} != {wrapped_manifest_file.added_snapshot_id}" ) wrapped_manifest_file.sequence_number = self._sequence_number diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 5896e7e1eb..94a7ba59c5 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -104,7 +104,8 @@ def __str__(self) -> str: def __repr__(self) -> str: """Return the string representation of the Schema class.""" - return f"Schema({', '.join(repr(column) for column in self.columns)}, schema_id={self.schema_id}, identifier_field_ids={self.identifier_field_ids})" + columns_repr = ", ".join(repr(column) for column in self.columns) + return f"Schema({columns_repr}, schema_id={self.schema_id}, identifier_field_ids={self.identifier_field_ids})" def __len__(self) -> int: """Return the length of an instance of the Literal class.""" @@ -374,7 +375,8 @@ def check_format_version_compatibility(self, format_version: int) -> None: for field in self._lazy_id_to_field.values(): if format_version < field.field_type.minimum_format_version(): raise ValueError( - f"{field.field_type} is only supported in 
{field.field_type.minimum_format_version()} or higher. Current format version is: {format_version}" + f"{field.field_type} is only supported in {field.field_type.minimum_format_version()} or higher. " + f"Current format version is: {format_version}" ) @@ -1530,7 +1532,8 @@ def field(self, field: NestedField, field_result: IcebergType | None) -> Iceberg else: if not field.field_type.is_primitive: raise ValueError( - f"Cannot explicitly project List or Map types, {field.field_id}:{field.name} of type {field.field_type} was selected" + f"Cannot explicitly project List or Map types, " + f"{field.field_id}:{field.name} of type {field.field_type} was selected" ) # Selected non-struct field return field.field_type @@ -1550,7 +1553,8 @@ def list(self, list_type: ListType, element_result: IcebergType | None) -> Icebe else: if not list_type.element_type.is_primitive: raise ValueError( - f"Cannot explicitly project List or Map types, {list_type.element_id} of type {list_type.element_type} was selected" + f"Cannot explicitly project List or Map types, " + f"{list_type.element_id} of type {list_type.element_type} was selected" ) return list_type elif element_result is not None: @@ -1567,7 +1571,8 @@ def map(self, map_type: MapType, key_result: IcebergType | None, value_result: I return self._project_map(map_type, projected_struct) if not map_type.value_type.is_primitive: raise ValueError( - f"Cannot explicitly project List or Map types, Map value {map_type.value_id} of type {map_type.value_type} was selected" + f"Cannot explicitly project List or Map types, " + f"Map value {map_type.value_id} of type {map_type.value_type} was selected" ) return map_type elif value_result is not None: @@ -1764,9 +1769,17 @@ def _is_field_compatible(self, lhs: NestedField) -> bool: # UnknownType can only be promoted to Primitive types if isinstance(rhs.field_type, UnknownType): if not isinstance(lhs.field_type, PrimitiveType): - error_msg = f"Null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. UnknownType can only be promoted to primitive types (string, int, boolean, etc.) in V3+ tables." + error_msg = ( + f"Null type (UnknownType) cannot be promoted to non-primitive type {lhs.field_type}. " + "UnknownType can only be promoted to primitive types (string, int, boolean, etc.) " + "in V3+ tables." + ) else: - error_msg = f"Null type (UnknownType) cannot be promoted to {lhs.field_type}. This may be due to table format version limitations (V1/V2 tables don't support UnknownType promotion)." + error_msg = ( + f"Null type (UnknownType) cannot be promoted to {lhs.field_type}. " + "This may be due to table format version limitations " + "(V1/V2 tables don't support UnknownType promotion)." 
+ ) self.rich_table.add_row("❌", str(lhs), f"{str(rhs)} - {error_msg}") else: self.rich_table.add_row("❌", str(lhs), str(rhs)) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index ec49d482b9..a6244916b7 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -560,7 +560,8 @@ def dynamic_partition_overwrite( for field in self.table_metadata.spec().fields: if not isinstance(field.transform, IdentityTransform): raise ValueError( - f"For now dynamic overwrite does not support a table with non-identity-transform field in the latest partition spec: {field}" + f"For now dynamic overwrite does not support a table with non-identity-transform field " + f"in the latest partition spec: {field}" ) downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False @@ -769,8 +770,10 @@ def upsert( df: The input dataframe to upsert with the table's data. join_cols: Columns to join on, if not provided, it will use the identifier-field-ids. - when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing - when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table + when_matched_update_all: Bool indicating to update rows that are matched but require an update + due to a value in a non-key column changing + when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any + existing rows in the table case_sensitive: Bool indicating if the match should be case-sensitive branch: Branch Reference to run the upsert operation snapshot_properties: Custom properties to be added to the snapshot summary @@ -860,8 +863,9 @@ def upsert( rows = pa.Table.from_batches([batch]) if when_matched_update_all: - # function get_rows_to_update is doing a check on non-key columns to see if any of the values have actually changed - # we don't want to do just a blanket overwrite for matched rows if the actual non-key column data hasn't changed + # function get_rows_to_update is doing a check on non-key columns to see if any of the + # values have actually changed. We don't want to do just a blanket overwrite for matched + # rows if the actual non-key column data hasn't changed. # this extra step avoids unnecessary IO and writes rows_to_update = upsert_util.get_rows_to_update(df, rows, join_cols) @@ -1368,8 +1372,10 @@ def upsert( df: The input dataframe to upsert with the table's data. join_cols: Columns to join on, if not provided, it will use the identifier-field-ids. 
- when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing - when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table + when_matched_update_all: Bool indicating to update rows that are matched but require an update + due to a value in a non-key column changing + when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any + existing rows in the table case_sensitive: Bool indicating if the match should be case-sensitive branch: Branch Reference to run the upsert operation snapshot_properties: Custom properties to be added to the snapshot summary diff --git a/pyiceberg/table/locations.py b/pyiceberg/table/locations.py index 3aeaaf228b..771c6b5a0f 100644 --- a/pyiceberg/table/locations.py +++ b/pyiceberg/table/locations.py @@ -172,7 +172,8 @@ def _import_location_provider( from pyiceberg.table import TableProperties raise ValueError( - f"{TableProperties.WRITE_PY_LOCATION_PROVIDER_IMPL} should be full path (module.CustomLocationProvider), got: {location_provider_impl}" + f"{TableProperties.WRITE_PY_LOCATION_PROVIDER_IMPL} should be full path " + f"(module.CustomLocationProvider), got: {location_provider_impl}" ) module_name, class_name = ".".join(path_parts[:-1]), path_parts[-1] module = importlib.import_module(module_name) diff --git a/pyiceberg/table/metadata.py b/pyiceberg/table/metadata.py index 8a55f77b11..07960ad7d5 100644 --- a/pyiceberg/table/metadata.py +++ b/pyiceberg/table/metadata.py @@ -679,7 +679,8 @@ def parse_obj(data: dict[str, Any]) -> TableMetadata: def _construct_without_validation(table_metadata: TableMetadata) -> TableMetadata: """Construct table metadata from an existing table without performing validation. - This method is useful during a sequence of table updates when the model needs to be re-constructed but is not yet ready for validation. + This method is useful during a sequence of table updates when the model needs to be + re-constructed but is not yet ready for validation. """ if table_metadata.format_version is None: raise ValidationError(f"Missing format-version in TableMetadata: {table_metadata}") diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 797093f90b..a64cf7bd21 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -70,9 +70,11 @@ class Operation(Enum): Possible operation values are: - append: Only data files were added and no files were removed. - - replace: Data and delete files were added and removed without changing table data; i.e., compaction, changing the data file format, or relocating data files. + - replace: Data and delete files were added and removed without changing table data; + i.e., compaction, changing the data file format, or relocating data files. - overwrite: Data and delete files were added and removed in a logical overwrite operation. - - delete: Data files were removed and their contents logically deleted and/or delete files were added to delete rows. + - delete: Data files were removed and their contents logically deleted and/or delete files + were added to delete rows. 
""" APPEND = "append" diff --git a/pyiceberg/table/sorting.py b/pyiceberg/table/sorting.py index 5243d7b184..62fc555417 100644 --- a/pyiceberg/table/sorting.py +++ b/pyiceberg/table/sorting.py @@ -67,7 +67,8 @@ class SortField(IcebergBaseModel): transform (str): Transform that is used to produce values to be sorted on from the source column. This is the same transform as described in partition transforms. direction (SortDirection): Sort direction, that can only be either asc or desc. - null_order (NullOrder): Null order that describes the order of null values when sorted. Can only be either nulls-first or nulls-last. + null_order (NullOrder): Null order that describes the order of null values when sorted. + Can only be either nulls-first or nulls-last. """ def __init__( diff --git a/pyiceberg/table/update/__init__.py b/pyiceberg/table/update/__init__.py index 68e3d9f753..07032bd888 100644 --- a/pyiceberg/table/update/__init__.py +++ b/pyiceberg/table/update/__init__.py @@ -176,7 +176,8 @@ class SetStatisticsUpdate(IcebergBaseModel): snapshot_id: int | None = Field( None, alias="snapshot-id", - description="snapshot-id is **DEPRECATED for REMOVAL** since it contains redundant information. Use `statistics.snapshot-id` field instead.", + description="snapshot-id is **DEPRECATED for REMOVAL** since it contains redundant information. " + "Use `statistics.snapshot-id` field instead.", ) @model_validator(mode="before") @@ -450,7 +451,8 @@ def _(update: AddSnapshotUpdate, base_metadata: TableMetadata, context: _TableMe and update.snapshot.first_row_id < base_metadata.next_row_id ): raise ValueError( - f"Cannot add a snapshot with first row id smaller than the table's next-row-id {update.snapshot.first_row_id} < {base_metadata.next_row_id}" + f"Cannot add a snapshot with first row id smaller than the table's next-row-id " + f"{update.snapshot.first_row_id} < {base_metadata.next_row_id}" ) context.add_update(update) @@ -812,7 +814,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException(f"Requirement failed: {ref_type} {self.ref} was created concurrently") elif self.snapshot_id != snapshot_ref.snapshot_id: raise CommitFailedException( - f"Requirement failed: {ref_type} {self.ref} has changed: expected id {self.snapshot_id}, found {snapshot_ref.snapshot_id}" + f"Requirement failed: {ref_type} {self.ref} has changed: " + f"expected id {self.snapshot_id}, found {snapshot_ref.snapshot_id}" ) elif self.snapshot_id is not None: raise CommitFailedException(f"Requirement failed: branch or tag {self.ref} is missing, expected {self.snapshot_id}") @@ -829,7 +832,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException("Requirement failed: current table metadata is missing") elif base_metadata.last_column_id != self.last_assigned_field_id: raise CommitFailedException( - f"Requirement failed: last assigned field id has changed: expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" + f"Requirement failed: last assigned field id has changed: " + f"expected {self.last_assigned_field_id}, found {base_metadata.last_column_id}" ) @@ -844,7 +848,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException("Requirement failed: current table metadata is missing") elif self.current_schema_id != base_metadata.current_schema_id: raise CommitFailedException( - f"Requirement failed: current schema id has changed: expected {self.current_schema_id}, found 
{base_metadata.current_schema_id}" + f"Requirement failed: current schema id has changed: " + f"expected {self.current_schema_id}, found {base_metadata.current_schema_id}" ) @@ -859,7 +864,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException("Requirement failed: current table metadata is missing") elif base_metadata.last_partition_id != self.last_assigned_partition_id: raise CommitFailedException( - f"Requirement failed: last assigned partition id has changed: expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" + f"Requirement failed: last assigned partition id has changed: " + f"expected {self.last_assigned_partition_id}, found {base_metadata.last_partition_id}" ) @@ -874,7 +880,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException("Requirement failed: current table metadata is missing") elif self.default_spec_id != base_metadata.default_spec_id: raise CommitFailedException( - f"Requirement failed: default spec id has changed: expected {self.default_spec_id}, found {base_metadata.default_spec_id}" + f"Requirement failed: default spec id has changed: " + f"expected {self.default_spec_id}, found {base_metadata.default_spec_id}" ) @@ -889,7 +896,8 @@ def validate(self, base_metadata: TableMetadata | None) -> None: raise CommitFailedException("Requirement failed: current table metadata is missing") elif self.default_sort_order_id != base_metadata.default_sort_order_id: raise CommitFailedException( - f"Requirement failed: default sort order id has changed: expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" + f"Requirement failed: default sort order id has changed: " + f"expected {self.default_sort_order_id}, found {base_metadata.default_sort_order_id}" ) diff --git a/pyiceberg/table/update/snapshot.py b/pyiceberg/table/update/snapshot.py index b7c863d84a..4e3b9232ca 100644 --- a/pyiceberg/table/update/snapshot.py +++ b/pyiceberg/table/update/snapshot.py @@ -437,7 +437,8 @@ def _compute_deletes(self) -> tuple[list[ManifestFile], list[ManifestEntry], boo def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry: return ManifestEntry.from_args( status=status, - # When a file is replaced or deleted from the dataset, its manifest entry fields store the snapshot ID in which the file was deleted and status 2 (deleted). + # When a file is replaced or deleted from the dataset, its manifest entry fields store the + # snapshot ID in which the file was deleted and status 2 (deleted). snapshot_id=self.snapshot_id if status == ManifestEntryStatus.DELETED else entry.snapshot_id, sequence_number=entry.sequence_number, file_sequence_number=entry.file_sequence_number, diff --git a/pyiceberg/utils/schema_conversion.py b/pyiceberg/utils/schema_conversion.py index 66e57d5d9f..f08f5395de 100644 --- a/pyiceberg/utils/schema_conversion.py +++ b/pyiceberg/utils/schema_conversion.py @@ -107,10 +107,12 @@ def avro_to_iceberg(self, avro_schema: dict[str, Any]) -> Schema: ... }) >>> iceberg_schema = Schema( ... NestedField( - ... field_id=500, name="manifest_path", field_type=StringType(), required=False, doc="Location URI with FS scheme" + ... field_id=500, name="manifest_path", field_type=StringType(), + ... required=False, doc="Location URI with FS scheme" ... ), ... NestedField( - ... field_id=501, name="manifest_length", field_type=LongType(), required=False, doc="Total file size in bytes" + ... 
field_id=501, name="manifest_length", field_type=LongType(), + ... required=False, doc="Total file size in bytes" ... ), ... schema_id=1 ... ) From 4b9df27b6f02ee31c6465c97fad39bf6ae238d39 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sun, 25 Jan 2026 12:39:50 -0500 Subject: [PATCH 4/4] fix tests --- ruff.toml | 6 +-- tests/catalog/test_hive.py | 5 +- tests/catalog/test_rest.py | 36 ++++++++----- tests/cli/test_console.py | 12 ++--- tests/conftest.py | 29 +++++++--- tests/expressions/test_evaluator.py | 4 +- tests/expressions/test_visitors.py | 7 +-- tests/integration/test_add_files.py | 12 ++--- tests/integration/test_inspect_table.py | 19 +++++-- tests/integration/test_partitioning_key.py | 26 ++++++--- tests/integration/test_reads.py | 3 +- .../test_writes/test_partitioned_writes.py | 11 ++-- tests/io/test_fsspec.py | 49 ++++++++++++++--- tests/io/test_pyarrow.py | 54 ++++++++++++------- tests/io/test_pyarrow_visitor.py | 19 ++++--- tests/table/test_name_mapping.py | 16 +++--- tests/table/test_partitioning.py | 13 +++-- tests/table/test_refs.py | 6 +-- tests/table/test_snapshots.py | 54 ++++++++++++------- tests/table/test_sorting.py | 25 +++++++-- tests/table/test_upsert.py | 5 +- tests/test_schema.py | 43 +++++++++++---- tests/test_types.py | 12 ++--- tests/utils/test_manifest.py | 17 +++--- 24 files changed, 333 insertions(+), 150 deletions(-) diff --git a/ruff.toml b/ruff.toml index 8033da29d0..257556a463 100644 --- a/ruff.toml +++ b/ruff.toml @@ -63,13 +63,11 @@ ignore = [] fixable = ["ALL"] unfixable = [] +per-file-ignores = {} + # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -[lint.per-file-ignores] -"tests/integration/test_inspect_table.py" = ["E501"] -"tests/io/test_pyarrow.py" = ["E501"] - [lint.pyupgrade] # Preserve types, even if a file imports `from __future__ import annotations`. keep-runtime-typing = true diff --git a/tests/catalog/test_hive.py b/tests/catalog/test_hive.py index dcfc812378..88b653e44f 100644 --- a/tests/catalog/test_hive.py +++ b/tests/catalog/test_hive.py @@ -970,7 +970,10 @@ def test_rename_table_to_namespace_does_not_exists() -> None: catalog._client = MagicMock() catalog._client.__enter__().alter_table_with_environment_context.side_effect = InvalidOperationException( - message="Unable to change partition or table. Database default does not exist Check metastore logs for detailed stack.does_not_exists" # noqa: E501 + message=( + "Unable to change partition or table. Database default does not exist " + "Check metastore logs for detailed stack.does_not_exists" + ) ) with pytest.raises(NoSuchNamespaceError) as exc_info: diff --git a/tests/catalog/test_rest.py b/tests/catalog/test_rest.py index 763ff46e03..9fb1fa9af5 100644 --- a/tests/catalog/test_rest.py +++ b/tests/catalog/test_rest.py @@ -156,7 +156,8 @@ def test_no_uri_supplied() -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_200(rest_mock: Mocker) -> None: rest_mock.post( @@ -179,7 +180,8 @@ def test_token_200(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: rest_mock.post( @@ -198,7 +200,8 @@ def test_token_200_without_optional_fields(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -223,7 +226,8 @@ def test_token_with_optional_oauth_params(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -246,7 +250,8 @@ def test_token_with_optional_oauth_params_as_empty(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_with_default_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -267,7 +272,8 @@ def test_token_with_default_scope(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_with_custom_scope(rest_mock: Mocker) -> None: mock_request = rest_mock.post( @@ -289,7 +295,8 @@ def test_token_with_custom_scope(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_200_w_oauth2_server_uri(rest_mock: Mocker) -> None: rest_mock.post( @@ -314,7 +321,8 @@ def test_token_200_w_oauth2_server_uri(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_config_200(requests_mock: Mocker) -> None: requests_mock.get( @@ -402,7 +410,8 @@ def test_config_sets_headers(requests_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. 
" + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_400(rest_mock: Mocker) -> None: rest_mock.post( @@ -418,7 +427,8 @@ def test_token_400(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_token_401(rest_mock: Mocker) -> None: message = "invalid_client" @@ -664,7 +674,8 @@ def test_list_namespace_with_parent_404(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) @pytest.mark.parametrize("status_code", [401, 419]) def test_list_namespaces_token_expired_success_on_retries(rest_mock: Mocker, status_code: int) -> None: @@ -2017,7 +2028,8 @@ def test_custom_namespace_separator(rest_mock: Mocker) -> None: @pytest.mark.filterwarnings( - "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" # noqa: E501 + "ignore:Deprecated in 0.8.0, will be removed in 1.0.0. " + "Iceberg REST client is missing the OAuth2 server URI:DeprecationWarning" ) def test_auth_header(rest_mock: Mocker) -> None: mock_request = rest_mock.post( diff --git a/tests/cli/test_console.py b/tests/cli/test_console.py index a1a84201ef..f4b343f683 100644 --- a/tests/cli/test_console.py +++ b/tests/cli/test_console.py @@ -627,9 +627,10 @@ def test_json_schema(catalog: InMemoryCatalog) -> None: runner = CliRunner() result = runner.invoke(run, ["--output=json", "schema", "default.my_table"]) assert result.exit_code == 0 - assert ( - result.output - == """{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}\n""" # noqa: E501 + assert result.output == ( + '{"type":"struct","fields":[{"id":1,"name":"x","type":"long","required":true},' + '{"id":2,"name":"y","type":"long","required":true,"doc":"comment"},' + '{"id":3,"name":"z","type":"long","required":true}],"schema-id":0,"identifier-field-ids":[]}\n' ) @@ -819,9 +820,8 @@ def test_json_properties_get_table_specific_property_that_doesnt_exist(catalog: runner = CliRunner() result = runner.invoke(run, ["--output=json", "properties", "get", "table", "default.my_table", "doesnotexist"]) assert result.exit_code == 1 - assert ( - result.output - == """{"type": "NoSuchPropertyException", "message": "Could not find property doesnotexist on table default.my_table"}\n""" # noqa: E501 + assert result.output == ( + '{"type": "NoSuchPropertyException", "message": "Could not find property doesnotexist on table default.my_table"}\n' ) diff --git a/tests/conftest.py b/tests/conftest.py index 928690791a..ca9175f292 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1188,7 +1188,10 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "status": 1, "snapshot_id": 8744736658442914487, "data_file": { - "file_path": 
"/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", # noqa: E501 + "file_path": ( + "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/" + "00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" + ), "file_format": "PARQUET", "partition": {"VendorID": 1, "tpep_pickup_day": 1925}, "record_count": 19513, @@ -1308,7 +1311,10 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "status": 1, "snapshot_id": 8744736658442914487, "data_file": { - "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", # noqa: E501 + "file_path": ( + "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/" + "00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet" + ), "file_format": "PARQUET", "partition": {"VendorID": 1, "tpep_pickup_datetime": None}, "record_count": 95050, @@ -2146,7 +2152,10 @@ def adls_fsspec_fileio(request: pytest.FixtureRequest) -> Generator[FsspecFileIO azurite_url = request.config.getoption("--adls.endpoint") azurite_account_name = request.config.getoption("--adls.account-name") azurite_account_key = request.config.getoption("--adls.account-key") - azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" # noqa: E501 + azurite_connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};" + f"AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" + ) properties = { "adls.connection-string": azurite_connection_string, "adls.account-name": azurite_account_name, @@ -2183,7 +2192,10 @@ def pyarrow_fileio_adls(request: pytest.FixtureRequest) -> Generator[Any, None, azurite_account_name = request.config.getoption("--adls.account-name") azurite_account_key = request.config.getoption("--adls.account-key") - azurite_connection_string = f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" # noqa: E501 + azurite_connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName={azurite_account_name};" + f"AccountKey={azurite_account_key};BlobEndpoint={azurite_url}/{azurite_account_name};" + ) properties = { ADLS_ACCOUNT_NAME: azurite_account_name, ADLS_ACCOUNT_KEY: azurite_account_key, @@ -2623,7 +2635,8 @@ def spark() -> "SparkSession": # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow - # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], # noqa: E501 + # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, + # uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], "binary": [b"\01", None, b"\22"], "fixed": [ uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, @@ -2676,7 +2689,8 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table": "long": [1, None, 9], "float": [0.0, None, 0.9], "double": [0.0, None, 0.9], - # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields # noqa: E501 + # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight + # Spark does not support time fields "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, 
datetime(2023, 3, 1, 19, 25, 00)], "timestamptz": [ datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), @@ -2687,7 +2701,8 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table": # Not supported by Spark # 'time': [time(1, 22, 0), None, time(19, 25, 0)], # Not natively supported by Arrow - # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], # noqa: E501 + # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, + # uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], "binary": [b"\01", None, b"\22"], "fixed": [ uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 09e0d49670..03a33e4c78 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -1427,7 +1427,9 @@ def test_strict_integer_in(strict_data_file_schema: Schema, strict_data_file_1: def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file_1: DataFile) -> None: - # should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24})).eval(strict_data_file_1) # noqa: E501 + # should_read = _StrictMetricsEvaluator( + # strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 25, INT_MIN_VALUE - 24}) + # ).eval(strict_data_file_1) # assert should_read, "Should match: all values != 5 and != 6" should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("id", {INT_MIN_VALUE - 1, INT_MIN_VALUE})).eval( diff --git a/tests/expressions/test_visitors.py b/tests/expressions/test_visitors.py index b52ecd70cc..7687d2e5a0 100644 --- a/tests/expressions/test_visitors.py +++ b/tests/expressions/test_visitors.py @@ -261,9 +261,10 @@ def test_bind_visitor_already_bound(table_schema_simple: Schema) -> None: with pytest.raises(TypeError) as exc_info: visit(bound, visitor=BindVisitor(schema=table_schema_simple, case_sensitive=True)) assert ( - "Found already bound predicate: BoundEqualTo(term=BoundReference(field=NestedField(field_id=1, name='foo', field_type=StringType(), required=False), accessor=Accessor(position=0,inner=None)), literal=literal('hello'))" # noqa: E501 - == str(exc_info.value) - ) + "Found already bound predicate: BoundEqualTo(term=BoundReference(" + "field=NestedField(field_id=1, name='foo', field_type=StringType(), required=False), " + "accessor=Accessor(position=0,inner=None)), literal=literal('hello'))" + ) == str(exc_info.value) def test_visit_bound_visitor_unknown_predicate() -> None: diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 13f66bf1bb..5c529e6efb 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -470,8 +470,8 @@ def test_add_files_to_bucket_partitioned_table_fails(spark: SparkSession, sessio with pytest.raises(ValueError) as exc_info: tbl.add_files(file_paths=file_paths) assert ( - "Cannot infer partition value from parquet metadata for a non-linear Partition Field: baz_bucket_3 with transform bucket[3]" # noqa: E501 - in str(exc_info.value) + "Cannot infer partition value from parquet metadata for a non-linear Partition Field: " + "baz_bucket_3 with transform bucket[3]" in str(exc_info.value) ) @@ -518,8 +518,8 @@ def test_add_files_to_partitioned_table_fails_with_lower_and_upper_mismatch( with pytest.raises(ValueError) as exc_info: tbl.add_files(file_paths=file_paths) assert ( - "Cannot infer partition value 
from parquet metadata as there are more than one partition values for Partition Field: baz. lower_value=123, upper_value=124" # noqa: E501 - in str(exc_info.value) + "Cannot infer partition value from parquet metadata as there are more than one partition values " + "for Partition Field: baz. lower_value=123, upper_value=124" in str(exc_info.value) ) @@ -754,8 +754,8 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v exception_cause = exc_info.value.__cause__ assert isinstance(exception_cause, TypeError) assert ( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 - in exception_cause.args[0] + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' " + "configuration property to automatically downcast 'ns' to 'us' on write." in exception_cause.args[0] ) diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index bfffa34956..b40ba7e176 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -1096,13 +1096,22 @@ def test_inspect_files_format_version_3(spark: SparkSession, session_catalog: Ca }, ) - # fmt: off + row1 = ( + "(false, 'a', 'aaaaaaaaaaaaaaaaaaaaaa', 1, 1, 0.0, 0.0, " + "TIMESTAMP('2023-01-01 19:25:00'), TIMESTAMP('2023-01-01 19:25:00+00:00'), " + "DATE('2023-01-01'), X'01', X'00000000000000000000000000000000')" + ) + row2 = "(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)" + row3 = ( + "(true, 'z', 'zzzzzzzzzzzzzzzzzzzzzz', 9, 9, 0.9, 0.9, " + "TIMESTAMP('2023-03-01 19:25:00'), TIMESTAMP('2023-03-01 19:25:00+00:00'), " + "DATE('2023-03-01'), X'12', X'11111111111111111111111111111111')" + ) insert_data_sql = f"""INSERT INTO {identifier} VALUES - (false, 'a', 'aaaaaaaaaaaaaaaaaaaaaa', 1, 1, 0.0, 0.0, TIMESTAMP('2023-01-01 19:25:00'), TIMESTAMP('2023-01-01 19:25:00+00:00'), DATE('2023-01-01'), X'01', X'00000000000000000000000000000000'), - (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL), - (true, 'z', 'zzzzzzzzzzzzzzzzzzzzzz', 9, 9, 0.9, 0.9, TIMESTAMP('2023-03-01 19:25:00'), TIMESTAMP('2023-03-01 19:25:00+00:00'), DATE('2023-03-01'), X'12', X'11111111111111111111111111111111'); + {row1}, + {row2}, + {row3}; """ - # fmt: on spark.sql(insert_data_sql) spark.sql(insert_data_sql) diff --git a/tests/integration/test_partitioning_key.py b/tests/integration/test_partitioning_key.py index 82158002e1..f91efc5aca 100644 --- a/tests/integration/test_partitioning_key.py +++ b/tests/integration/test_partitioning_key.py @@ -77,7 +77,10 @@ @pytest.mark.parametrize( - "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification", # noqa: E501 + ( + "partition_fields, partition_values, expected_partition_record, expected_hive_partition_path_slice, " + "spark_create_table_sql_for_justification, spark_data_insert_sql_for_justification" + ), [ # Identity Transform ( @@ -161,7 +164,8 @@ [3.14], Record(3.14), "float_field=3.14", - # spark writes differently as pyiceberg, Record[float_field=3.140000104904175], path:float_field=3.14 (Record has difference) # noqa: E501 + # spark writes differently as pyiceberg, Record[float_field=3.140000104904175], + # path:float_field=3.14 (Record has difference) # so justification (compare expected value with 
spark behavior) would fail. None, None, @@ -184,7 +188,8 @@ [6.282], Record(6.282), "double_field=6.282", - # spark writes differently as pyiceberg, Record[double_field=6.2820000648498535] path:double_field=6.282 (Record has difference) # noqa: E501 + # spark writes differently as pyiceberg, Record[double_field=6.2820000648498535] + # path:double_field=6.282 (Record has difference) # so justification (compare expected value with spark behavior) would fail. None, None, @@ -246,8 +251,10 @@ Record(1672574400000000), "timestamp_field=2023-01-01T12%3A00%3A00", # Spark writes differently as pyiceberg, so justification (compare expected value with spark behavior) would fail - # AssertionError: assert 'timestamp_field=2023-01-01T12%3A00%3A00' in 's3://warehouse/default/test_table/data/timestamp_field=2023-01-01T12%3A00/00000-5-f9dca69a-9fb7-4830-9ef6-62d3d7afc09e-00001.parquet' - # TLDR: CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ) becomes 2023-01-01T12:00 in the hive partition path when spark writes it (without the seconds). # noqa: E501 + # AssertionError: assert 'timestamp_field=2023-01-01T12%3A00%3A00' in + # 's3://warehouse/default/test_table/data/timestamp_field=2023-01-01T12%3A00/...-00001.parquet' + # TLDR: CAST('2023-01-01 12:00:00' AS TIMESTAMP_NTZ) becomes 2023-01-01T12:00 in the hive partition path + # when spark writes it (without the seconds). None, None, # f"""CREATE TABLE {identifier} ( @@ -270,8 +277,10 @@ Record(1672563601000999), "timestamptz_field=2023-01-01T09%3A00%3A01.000999%2B00%3A00", # Spark writes differently as pyiceberg, so justification (compare expected value with spark behavior) would fail - # AssertionError: assert 'timestamptz_field=2023-01-01T09%3A00%3A01.000999%2B00%3A00' in 's3://warehouse/default/test_table/data/timestamptz_field=2023-01-01T09%3A00%3A01.000999Z/00000-5-b710fc4d-66b6-47f1-b8ae-6208f8aaa2d4-00001.parquet' - # TLDR: CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP) becomes 2023-01-01T09:00:01.000999Z in the hive partition path when spark writes it (while iceberg: timestamptz_field=2023-01-01T09:00:01.000999+00:00). # noqa: E501 + # AssertionError: assert 'timestamptz_field=2023-01-01T09%3A00%3A01.000999%2B00%3A00' in + # 's3://warehouse/default/test_table/data/timestamptz_field=2023-01-01T09%3A00%3A01.000999Z/...-00001.parquet' + # TLDR: CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP) becomes 2023-01-01T09:00:01.000999Z in the hive + # partition path when spark writes it (while iceberg: timestamptz_field=2023-01-01T09:00:01.000999+00:00). 
None, None, # f"""CREATE TABLE {identifier} ( @@ -285,7 +294,8 @@ # """, # f"""INSERT INTO {identifier} # VALUES - # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # noqa: E501 + # (CAST('2023-01-01 12:00:01.000999+03:00' AS TIMESTAMP), + # 'Associated string value for timestamp 2023-01-01 12:00:01.000999+03:00') # """ ), ( diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 250c58bccd..5f059a9e66 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -1247,7 +1247,8 @@ def test_scan_source_field_missing_in_spec(catalog: Catalog, spark: SparkSession spark.sql(f"DROP TABLE IF EXISTS {identifier}") spark.sql(f"CREATE TABLE {identifier} (foo int, bar int, jaz string) USING ICEBERG PARTITIONED BY (foo, bar)") spark.sql( - f"INSERT INTO {identifier} (foo, bar, jaz) VALUES (1, 1, 'dummy data'), (1, 2, 'dummy data again'), (2, 1, 'another partition')" # noqa: E501 + f"INSERT INTO {identifier} (foo, bar, jaz) VALUES " + f"(1, 1, 'dummy data'), (1, 2, 'dummy data again'), (2, 1, 'another partition')" ) spark.sql(f"ALTER TABLE {identifier} DROP PARTITION FIELD foo") spark.sql(f"ALTER TABLE {identifier} DROP COLUMN foo") diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index ba64434d5e..2bc4985609 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -372,7 +372,9 @@ def test_dynamic_partition_overwrite_non_identity_transform( ) with pytest.raises( ValueError, - match="For now dynamic overwrite does not support a table with non-identity-transform field in the latest partition spec: *", # noqa: E501 + match=( + "For now dynamic overwrite does not support a table with non-identity-transform field in the latest partition spec: *" + ), ): tbl.dynamic_partition_overwrite(arrow_table_with_null.slice(0, 1)) @@ -591,7 +593,8 @@ def test_data_files_with_table_partitioned_with_null( # the first snapshot generates M3 with 6 delete data entries collected from M1 and M2. # ML3 = [M3] # - # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) only has delete entries it does not lint to it. # noqa: E501 + # The second snapshot generates M4 with 3 appended data entries and since M3 (previous manifests) + # only has delete entries it does not lint to it. # ML4 = [M4] # Append : Append generates M5 with new data entries and links to all previous manifests which is M4 . @@ -603,8 +606,8 @@ def test_data_files_with_table_partitioned_with_null( # then it generates M7 as remaining existing entries from M1 and M8 from M2 # ML6 = [M6, M7, M8] # - # The second snapshot generates M9 with 3 appended data entries and it also looks at manifests in ML6 (previous manifests) # noqa: E501 - # it ignores M6 since it only has delete entries but it links to M7 and M8. + # The second snapshot generates M9 with 3 appended data entries and it also looks at manifests + # in ML6 (previous manifests) it ignores M6 since it only has delete entries but it links to M7 and M8. 
# ML7 = [M9, M7, M8] # tldr: diff --git a/tests/io/test_fsspec.py b/tests/io/test_fsspec.py index 900fc6193f..94eff1cb58 100644 --- a/tests/io/test_fsspec.py +++ b/tests/io/test_fsspec.py @@ -812,7 +812,9 @@ def test_s3v4_rest_signer(requests_mock: Mocker) -> None: "uri": new_uri, "headers": { "Authorization": [ - "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" # noqa: E501 + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, " + "SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, " + "Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" ], "Host": ["bucket.s3.us-west-2.amazonaws.com"], "User-Agent": ["Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0"], @@ -849,11 +851,25 @@ def test_s3v4_rest_signer(requests_mock: Mocker) -> None: assert request.url == new_uri assert dict(request.headers) == { - "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", # noqa: E501 + "Authorization": ( + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, " + "SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, " + "Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" + ), "Host": "bucket.s3.us-west-2.amazonaws.com", "User-Agent": "Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0", "X-Amz-Date": "20221017T102940Z", - "X-Amz-Security-Token": "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", # noqa: E501 + "X-Amz-Security-Token": ( + "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd" + "/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8Z" + "jK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0" + "gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293v" + "pbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01e" + "QAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3" + "ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwRE" + "f1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/" + "5Gw=" + ), "x-amz-content-sha256": "UNSIGNED-PAYLOAD", "X-Custom-Header": "value", } @@ -868,7 +884,9 @@ def test_s3v4_rest_signer_endpoint(requests_mock: Mocker) -> None: "uri": 
new_uri, "headers": { "Authorization": [ - "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" # noqa: E501 + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, " + "SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, " + "Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" ], "Host": ["bucket.s3.us-west-2.amazonaws.com"], "User-Agent": ["Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0"], @@ -904,11 +922,25 @@ def test_s3v4_rest_signer_endpoint(requests_mock: Mocker) -> None: assert request.url == new_uri assert dict(request.headers) == { - "Authorization": "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02", # noqa: E501 + "Authorization": ( + "AWS4-HMAC-SHA256 Credential=ASIAQPRZZYGHUT57DL3I/20221017/us-west-2/s3/aws4_request, " + "SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-security-token, " + "Signature=430582a17d61ab02c272896fa59195f277af4bdf2121c441685e589f044bbe02" + ), "Host": "bucket.s3.us-west-2.amazonaws.com", "User-Agent": "Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0", "X-Amz-Date": "20221017T102940Z", - "X-Amz-Security-Token": "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8ZjK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293vpbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01eQAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwREf1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/5Gw=", # noqa: E501 + "X-Amz-Security-Token": ( + "YQoJb3JpZ2luX2VjEDoaCXVzLXdlc3QtMiJGMEQCID/fFxZP5oaEgQmcwP6XhZa0xSq9lmLSx8ffaWbySfUPAiAesa7sjd" + "/WV4uwRTO0S03y/MWVtgpH+/NyZQ4bZgLVriqrAggTEAEaDDAzMzQwNzIyMjE1OSIMOeFOWhZIurMmAqjsKogCxMCqxX8Z" + "jK0gacAkcDqBCyA7qTSLhdfKQIH/w7WpLBU1km+cRUWWCudan6gZsAq867DBaKEP7qI05DAWr9MChAkgUgyI8/G3Z23ET0" + "gAedf3GsJbakB0F1kklx8jPmj4BPCht9RcTiXiJ5DxTS/cRCcalIQXmPFbaJSqpBusVG2EkWnm1v7VQrNPE2Os2b2P293v" + "pbhwkyCEQiGRVva4Sw9D1sKvqSsK10QCRG+os6dFEOu1kARaXi6pStvR4OVmj7OYeAYjzaFchn7nz2CSae0M4IluiYQ01e" + "QAywbfRo9DpKSmDM/DnPZWJnD/woLhaaaCrCxSSEaFsvGOHFhLd3Rknw1v0jADMILUtJoGOp4BpqKqyMz0CY3kpKL0jfR3" + "ykTf/ge9wWVE0Alr7wRIkGCIURkhslGHqSyFRGoTqIXaxU+oPbwlw/0w/nYO7qQ6bTANOWye/wgw4h/NmJ6vU7wnZTXwRE" + "f1r6MF72++bE/fMk19LfVb8jN/qrUqAUXTc8gBAUxL5pgy8+oT/JnI2BkVrrLS4ilxEXP9Ahm+6GDUYXV4fBpqpZwdkzQ/" + "5Gw=" + ), "x-amz-content-sha256": "UNSIGNED-PAYLOAD", } @@ -946,8 +978,9 @@ def test_s3v4_rest_signer_forbidden(requests_mock: Mocker) -> None: signer(request) assert ( - """Failed to sign request 401: {'method': 'HEAD', 'region': 'us-west-2', 'uri': 'https://bucket/metadata/snap-8048355899640248710-1-a5c8ea2d-aa1f-48e8-89f4-1fa69db8c742.avro', 'headers': {'User-Agent': ['Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0']}}""" # noqa: E501 - in 
str(exc_info.value) + "Failed to sign request 401: {'method': 'HEAD', 'region': 'us-west-2', " + "'uri': 'https://bucket/metadata/snap-8048355899640248710-1-a5c8ea2d-aa1f-48e8-89f4-1fa69db8c742.avro', " + "'headers': {'User-Agent': ['Botocore/1.27.59 Python/3.10.7 Darwin/21.5.0']}}" in str(exc_info.value) ) diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 1de23ddc28..d9e007c021 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -1195,7 +1195,8 @@ def test_projection_concat_files(schema_int: Schema, file_int: str) -> None: def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCatalog) -> None: - # Test by adding a non-partitioned data file to a partitioned table, verifying partition value projection from manifest metadata. # noqa: E501 + # Test by adding a non-partitioned data file to a partitioned table, verifying partition value + # projection from manifest metadata. # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs. # (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875) @@ -1264,7 +1265,8 @@ def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCa def test_identity_transform_columns_projection(tmp_path: str, catalog: InMemoryCatalog) -> None: - # Test by adding a non-partitioned data file to a multi-partitioned table, verifying partition value projection from manifest metadata. # noqa: E501 + # Test by adding a non-partitioned data file to a multi-partitioned table, verifying partition value + # projection from manifest metadata. # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs. # (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875) schema = Schema( @@ -1542,18 +1544,19 @@ def test_projection_maps_of_structs(schema_map_of_structs: Schema, file_map_of_s strict=True, ): assert actual.as_py() == expected - # fmt: off - assert ( - repr(result_table.schema) - == """locations: map> - child 0, entries: struct not null - child 0, latitude: double not null - child 1, longitude: double not null - child 2, altitude: double""" + expected_schema_repr = ( + "locations: map>\n" + " child 0, entries: struct not null\n" + " child 0, latitude: double not null\n" + " child 1, longitude: double not null\n" + " child 2, altitude: double" ) - # fmt: on + assert repr(result_table.schema) == expected_schema_repr def test_projection_nested_struct_different_parent_id(file_struct: str) -> None: @@ -2755,7 +2758,8 @@ def test__to_requested_schema_timestamps_without_downcast_raises_exception( def test_pyarrow_file_io_fs_by_scheme_cache() -> None: - # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` # noqa: E501 + # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument + # becomes available for `resolve_s3_region` # Refer to: https://github.com/apache/arrow/issues/43713 pyarrow_file_io = PyArrowFileIO() @@ -2789,7 +2793,8 @@ def test_pyarrow_file_io_fs_by_scheme_cache() -> None: def test_pyarrow_io_new_input_multi_region(caplog: Any) -> None: - # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument becomes available for `resolve_s3_region` # noqa: E501 + # It's better to set up multi-region minio servers for an integration test once 
`endpoint_url` argument + # becomes available for `resolve_s3_region` # Refer to: https://github.com/apache/arrow/issues/43713 user_provided_region = "ap-southeast-1" bucket_regions = [ @@ -3000,7 +3005,8 @@ def test_iceberg_read_orc(tmp_path: Path) -> None: partition_specs=[PartitionSpec()], properties={ "write.format.default": "parquet", # This doesn't matter for reading - "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}]', # Add name mapping for ORC files without field IDs # noqa: E501 + # Add name mapping for ORC files without field IDs + "schema.name-mapping.default": ('[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}]'), }, ) io = PyArrowFileIO() @@ -3100,7 +3106,10 @@ def test_orc_row_filtering_predicate_pushdown(tmp_path: Path) -> None: schemas=[schema], partition_specs=[PartitionSpec()], properties={ - "schema.name-mapping.default": '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}, {"field-id": 3, "names": ["age"]}, {"field-id": 4, "names": ["active"]}]', # noqa: E501 + "schema.name-mapping.default": ( + '[{"field-id": 1, "names": ["id"]}, {"field-id": 2, "names": ["name"]}, ' + '{"field-id": 3, "names": ["age"]}, {"field-id": 4, "names": ["active"]}]' + ), }, ) io = PyArrowFileIO() @@ -4380,7 +4389,8 @@ def test_orc_stripe_size_batch_size_compression_interaction(tmp_path: Path) -> N # Assert batching behavior assert len(batches) > 0, f"Should have at least one batch for {description}" assert len(batches) == actual_stripes, ( - f"Number of batches should match number of stripes for {description}: {len(batches)} batches vs {actual_stripes} stripes" # noqa: E501 + f"Number of batches should match number of stripes for {description}: " + f"{len(batches)} batches vs {actual_stripes} stripes" ) # Assert data integrity @@ -4450,7 +4460,10 @@ def test_orc_near_perfect_stripe_size_mapping(tmp_path: Path) -> None: { "id": pa.array( [ - f"very_long_string_value_{i:06d}_with_lots_of_padding_to_make_it_harder_to_compress_{i * 7919 % 100000:05d}_more_padding_{i * 7919 % 100000:05d}" # noqa: E501 + ( + f"very_long_string_value_{i:06d}_with_lots_of_padding_to_make_it_harder_to_compress_" + f"{i * 7919 % 100000:05d}_more_padding_{i * 7919 % 100000:05d}" + ) for i in range(1, 50001) ] ) # 50K rows @@ -4525,7 +4538,8 @@ def test_orc_near_perfect_stripe_size_mapping(tmp_path: Path) -> None: # Assert batching behavior assert len(batches) > 0, f"Should have at least one batch for stripe_size={stripe_size}" assert len(batches) == actual_stripes, ( - f"Number of batches should match number of stripes for stripe_size={stripe_size}: {len(batches)} batches vs {actual_stripes} stripes" # noqa: E501 + f"Number of batches should match number of stripes for stripe_size={stripe_size}: " + f"{len(batches)} batches vs {actual_stripes} stripes" ) # Assert data integrity diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 5f40f7f374..433350c225 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -190,7 +190,8 @@ def test_pyarrow_timestamp_invalid_units() -> None: with pytest.raises( TypeError, match=re.escape( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 + "Iceberg does not yet support 'ns' timestamp precision. 
Use 'downcast-ns-timestamp-to-us-on-write' " + "configuration property to automatically downcast 'ns' to 'us' on write." ), ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) @@ -212,7 +213,8 @@ def test_pyarrow_timestamp_tz_invalid_units() -> None: with pytest.raises( TypeError, match=re.escape( - "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." # noqa: E501 + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' " + "configuration property to automatically downcast 'ns' to 'us' on write." ), ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) @@ -832,8 +834,13 @@ def test_expression_to_complementary_pyarrow( Not(bound_is_null_double_field), ) result = _expression_to_complementary_pyarrow(bound_expr) - # Notice an isNan predicate on a str column is automatically converted to always false and removed from Or and thus will not appear in the pc.expr. # noqa: E501 - assert ( - repr(result) - == """ 100)) or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) and invert(is_null(double_field, {nan_is_null=false})))) or is_null(float_field, {nan_is_null=false})) or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>""" # noqa: E501 + # Notice an isNan predicate on a str column is automatically converted to always false and removed from Or + # and thus will not appear in the pc.expr. + expected_repr = ( + ' 100)) ' + "or ((is_nan(float_field) and (double_field == 0)) or (float_field > 100))) " + "and invert(is_null(double_field, {nan_is_null=false})))) " + "or is_null(float_field, {nan_is_null=false})) " + "or is_null(string_field, {nan_is_null=false})) or is_nan(double_field))>" ) + assert repr(result) == expected_repr diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index ab60537c9c..e8de5f18e5 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ -192,16 +192,20 @@ def test_json_mapped_field_no_field_id_serialization() -> None: ] ) - assert ( - table_name_mapping_nested_no_field_id.model_dump_json() - == """[{"names":["foo"],"field-id":1},{"names":["bar"]},{"names":["qux"],"field-id":2,"fields":[{"names":["element"]}]}]""" # noqa: E501 + assert table_name_mapping_nested_no_field_id.model_dump_json() == ( + '[{"names":["foo"],"field-id":1},{"names":["bar"]},{"names":["qux"],"field-id":2,"fields":[{"names":["element"]}]}]' ) def test_json_serialization(table_name_mapping_nested: NameMapping) -> None: - assert ( - table_name_mapping_nested.model_dump_json() - == """[{"names":["foo"],"field-id":1},{"names":["bar"],"field-id":2},{"names":["baz"],"field-id":3},{"names":["qux"],"field-id":4,"fields":[{"names":["element"],"field-id":5}]},{"names":["quux"],"field-id":6,"fields":[{"names":["key"],"field-id":7},{"names":["value"],"field-id":8,"fields":[{"names":["key"],"field-id":9},{"names":["value"],"field-id":10}]}]},{"names":["location"],"field-id":11,"fields":[{"names":["element"],"field-id":12,"fields":[{"names":["latitude"],"field-id":13},{"names":["longitude"],"field-id":14}]}]},{"names":["person"],"field-id":15,"fields":[{"names":["name"],"field-id":16},{"names":["age"],"field-id":17}]}]""" # noqa: E501 + assert table_name_mapping_nested.model_dump_json() == ( + '[{"names":["foo"],"field-id":1},{"names":["bar"],"field-id":2},{"names":["baz"],"field-id":3},' + 
'{"names":["qux"],"field-id":4,"fields":[{"names":["element"],"field-id":5}]},' + '{"names":["quux"],"field-id":6,"fields":[{"names":["key"],"field-id":7},' + '{"names":["value"],"field-id":8,"fields":[{"names":["key"],"field-id":9},{"names":["value"],"field-id":10}]}]},' + '{"names":["location"],"field-id":11,"fields":[{"names":["element"],"field-id":12,"fields":' + '[{"names":["latitude"],"field-id":13},{"names":["longitude"],"field-id":14}]}]},' + '{"names":["person"],"field-id":15,"fields":[{"names":["name"],"field-id":16},{"names":["age"],"field-id":17}]}]' ) diff --git a/tests/table/test_partitioning.py b/tests/table/test_partitioning.py index 7b95d2e9ac..cfe3e2c12d 100644 --- a/tests/table/test_partitioning.py +++ b/tests/table/test_partitioning.py @@ -120,9 +120,10 @@ def test_serialize_partition_spec() -> None: PartitionField(source_id=2, field_id=1001, transform=BucketTransform(num_buckets=25), name="int_bucket"), spec_id=3, ) - assert ( - partitioned.model_dump_json() - == """{"spec-id":3,"fields":[{"source-id":1,"field-id":1000,"transform":"truncate[19]","name":"str_truncate"},{"source-id":2,"field-id":1001,"transform":"bucket[25]","name":"int_bucket"}]}""" # noqa: E501 + assert partitioned.model_dump_json() == ( + '{"spec-id":3,"fields":[' + '{"source-id":1,"field-id":1000,"transform":"truncate[19]","name":"str_truncate"},' + '{"source-id":2,"field-id":1001,"transform":"bucket[25]","name":"int_bucket"}]}' ) @@ -134,7 +135,11 @@ def test_deserialize_unpartition_spec() -> None: def test_deserialize_partition_spec() -> None: - json_partition_spec = """{"spec-id": 3, "fields": [{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}, {"source-id": 2, "field-id": 1001, "transform": "bucket[25]", "name": "int_bucket"}]}""" # noqa: E501 + json_partition_spec = ( + '{"spec-id": 3, "fields": [' + '{"source-id": 1, "field-id": 1000, "transform": "truncate[19]", "name": "str_truncate"}, ' + '{"source-id": 2, "field-id": 1001, "transform": "bucket[25]", "name": "int_bucket"}]}' + ) spec = PartitionSpec.model_validate_json(json_partition_spec) diff --git a/tests/table/test_refs.py b/tests/table/test_refs.py index 63a9e34ec8..dbd0fee13c 100644 --- a/tests/table/test_refs.py +++ b/tests/table/test_refs.py @@ -31,9 +31,9 @@ def test_snapshot_with_properties_repr() -> None: max_ref_age_ms=10000000, ) - assert ( - repr(snapshot_ref) - == """SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)""" # noqa: E501 + assert repr(snapshot_ref) == ( + "SnapshotRef(snapshot_id=3051729675574597004, snapshot_ref_type=SnapshotRefType.TAG, " + "min_snapshots_to_keep=None, max_snapshot_age_ms=None, max_ref_age_ms=10000000)" ) assert snapshot_ref == eval(repr(snapshot_ref)) diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index d450f78379..cfdc516227 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -79,9 +79,9 @@ def test_serialize_summary_with_properties() -> None: def test_serialize_snapshot(snapshot: Snapshot) -> None: - assert ( - snapshot.model_dump_json() - == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" # noqa: E501 + assert snapshot.model_dump_json() == ( + '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,' + 
'"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}' ) @@ -96,14 +96,17 @@ def test_serialize_snapshot_without_sequence_number() -> None: schema_id=3, ) actual = snapshot.model_dump_json() - expected = """{"snapshot-id":25,"parent-snapshot-id":19,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}""" # noqa: E501 + expected = ( + '{"snapshot-id":25,"parent-snapshot-id":19,"timestamp-ms":1602638573590,' + '"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append"},"schema-id":3}' + ) assert actual == expected def test_serialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None: - assert ( - snapshot_with_properties.model_dump_json() - == """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" # noqa: E501 + assert snapshot_with_properties.model_dump_json() == ( + '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,' + '"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}' ) @@ -119,36 +122,45 @@ def test_deserialize_summary_with_properties() -> None: def test_deserialize_snapshot(snapshot: Snapshot) -> None: - payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {"operation": "append"}, "schema-id": 3}""" # noqa: E501 + payload = ( + '{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, ' + '"manifest-list": "s3:/a/b/c.avro", "summary": {"operation": "append"}, "schema-id": 3}' + ) actual = Snapshot.model_validate_json(payload) assert actual == snapshot def test_deserialize_snapshot_without_operation(snapshot: Snapshot) -> None: - payload = """{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, "manifest-list": "s3:/a/b/c.avro", "summary": {}, "schema-id": 3}""" # noqa: E501 + payload = ( + '{"snapshot-id": 25, "parent-snapshot-id": 19, "sequence-number": 200, "timestamp-ms": 1602638573590, ' + '"manifest-list": "s3:/a/b/c.avro", "summary": {}, "schema-id": 3}' + ) with pytest.warns(UserWarning, match="Encountered invalid snapshot summary: operation is missing, defaulting to overwrite"): actual = Snapshot.model_validate_json(payload) assert actual.summary.operation == Operation.OVERWRITE def test_deserialize_snapshot_with_properties(snapshot_with_properties: Snapshot) -> None: - payload = """{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}""" # noqa: E501 + payload = ( + '{"snapshot-id":25,"parent-snapshot-id":19,"sequence-number":200,"timestamp-ms":1602638573590,' + '"manifest-list":"s3:/a/b/c.avro","summary":{"operation":"append","foo":"bar"},"schema-id":3}' + ) snapshot = Snapshot.model_validate_json(payload) assert snapshot == snapshot_with_properties def test_snapshot_repr(snapshot: Snapshot) -> None: - assert ( - repr(snapshot) - == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND), schema_id=3)""" # noqa: E501 + assert repr(snapshot) == ( + "Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, 
timestamp_ms=1602638573590, " + "manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND), schema_id=3)" ) assert snapshot == eval(repr(snapshot)) def test_snapshot_with_properties_repr(snapshot_with_properties: Snapshot) -> None: - assert ( - repr(snapshot_with_properties) - == """Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)""" # noqa: E501 + assert repr(snapshot_with_properties) == ( + "Snapshot(snapshot_id=25, parent_snapshot_id=19, sequence_number=200, timestamp_ms=1602638573590, " + "manifest_list='s3:/a/b/c.avro', summary=Summary(Operation.APPEND, **{'foo': 'bar'}), schema_id=3)" ) assert snapshot_with_properties == eval(repr(snapshot_with_properties)) @@ -226,7 +238,10 @@ def test_snapshot_summary_collector_with_partition() -> None: "deleted-records": "300", "changed-partition-count": "2", "partition-summaries-included": "true", - "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", # noqa: E501 + "partitions.int_field=1": ( + "added-files-size=1234,removed-files-size=1234,added-data-files=1," + "deleted-data-files=1,added-records=100,deleted-records=100" + ), "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } @@ -262,7 +277,10 @@ def test_snapshot_summary_collector_with_partition_limit_in_constructor() -> Non "deleted-records": "300", "changed-partition-count": "2", "partition-summaries-included": "true", - "partitions.int_field=1": "added-files-size=1234,removed-files-size=1234,added-data-files=1,deleted-data-files=1,added-records=100,deleted-records=100", # noqa: E501 + "partitions.int_field=1": ( + "added-files-size=1234,removed-files-size=1234,added-data-files=1," + "deleted-data-files=1,added-records=100,deleted-records=100" + ), "partitions.int_field=2": "removed-files-size=4321,deleted-data-files=1,deleted-records=200", } diff --git a/tests/table/test_sorting.py b/tests/table/test_sorting.py index b9855380a2..eaf6076a98 100644 --- a/tests/table/test_sorting.py +++ b/tests/table/test_sorting.py @@ -53,12 +53,22 @@ def test_serialize_sort_order_unsorted() -> None: def test_serialize_sort_order(sort_order: SortOrder) -> None: - expected = '{"order-id":22,"fields":[{"source-id":19,"transform":"identity","direction":"asc","null-order":"nulls-first"},{"source-id":25,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"},{"source-id":22,"transform":"void","direction":"asc","null-order":"nulls-first"}]}' # noqa: E501 + expected = ( + '{"order-id":22,"fields":[' + '{"source-id":19,"transform":"identity","direction":"asc","null-order":"nulls-first"},' + '{"source-id":25,"transform":"bucket[4]","direction":"desc","null-order":"nulls-last"},' + '{"source-id":22,"transform":"void","direction":"asc","null-order":"nulls-first"}]}' + ) assert sort_order.model_dump_json() == expected def test_deserialize_sort_order(sort_order: SortOrder) -> None: - payload = '{"order-id": 22, "fields": [{"source-id": 19, "transform": "identity", "direction": "asc", "null-order": "nulls-first"}, {"source-id": 25, "transform": "bucket[4]", "direction": "desc", "null-order": "nulls-last"}, {"source-id": 22, "transform": "void", "direction": "asc", "null-order": "nulls-first"}]}' # noqa: E501 + payload = ( + '{"order-id": 22, "fields": [' + '{"source-id": 19, "transform": "identity", "direction": 
"asc", "null-order": "nulls-first"}, ' + '{"source-id": 25, "transform": "bucket[4]", "direction": "desc", "null-order": "nulls-last"}, ' + '{"source-id": 22, "transform": "void", "direction": "asc", "null-order": "nulls-first"}]}' + ) assert SortOrder.model_validate_json(payload) == sort_order @@ -90,7 +100,16 @@ def test_sorting_to_string(sort_order: SortOrder) -> None: def test_sorting_to_repr(sort_order: SortOrder) -> None: - expected = """SortOrder(SortField(source_id=19, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), SortField(source_id=25, transform=BucketTransform(num_buckets=4), direction=SortDirection.DESC, null_order=NullOrder.NULLS_LAST), SortField(source_id=22, transform=VoidTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), order_id=22)""" # noqa: E501 + expected = ( + "SortOrder(" + "SortField(source_id=19, transform=IdentityTransform(), " + "direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), " + "SortField(source_id=25, transform=BucketTransform(num_buckets=4), " + "direction=SortDirection.DESC, null_order=NullOrder.NULLS_LAST), " + "SortField(source_id=22, transform=VoidTransform(), " + "direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST), " + "order_id=22)" + ) assert repr(sort_order) == expected diff --git a/tests/table/test_upsert.py b/tests/table/test_upsert.py index 9ff5b9d9f4..08f90c6600 100644 --- a/tests/table/test_upsert.py +++ b/tests/table/test_upsert.py @@ -120,7 +120,10 @@ def assert_upsert_result(res: UpsertResult, expected_updated: int, expected_inse @pytest.mark.parametrize( - "join_cols, src_start_row, src_end_row, target_start_row, target_end_row, when_matched_update_all, when_not_matched_insert_all, expected_updated, expected_inserted", # noqa: E501 + ( + "join_cols, src_start_row, src_end_row, target_start_row, target_end_row, " + "when_matched_update_all, when_not_matched_insert_all, expected_updated, expected_inserted" + ), [ (["order_id"], 1, 2, 2, 3, True, True, 1, 1), # single row (["order_id"], 5001, 15000, 1, 10000, True, True, 5000, 5000), # 10k rows diff --git a/tests/test_schema.py b/tests/test_schema.py index d63d498e8d..93ddc16202 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -95,7 +95,10 @@ def test_schema_str(table_schema_simple: Schema) -> None: def test_schema_repr_single_field() -> None: """Test schema representation""" actual = repr(Schema(NestedField(field_id=1, name="foo", field_type=StringType()), schema_id=1)) - expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), schema_id=1, identifier_field_ids=[])" # noqa: E501 + expected = ( + "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), " + "schema_id=1, identifier_field_ids=[])" + ) assert expected == actual @@ -108,7 +111,11 @@ def test_schema_repr_two_fields() -> None: schema_id=1, ) ) - expected = "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), NestedField(field_id=2, name='bar', field_type=IntegerType(), required=False), schema_id=1, identifier_field_ids=[])" # noqa: E501 + expected = ( + "Schema(NestedField(field_id=1, name='foo', field_type=StringType(), required=False), " + "NestedField(field_id=2, name='bar', field_type=IntegerType(), required=False), " + "schema_id=1, identifier_field_ids=[])" + ) assert expected == actual @@ -428,13 +435,29 @@ def __getitem__(self, pos: int) -> Any: def test_serialize_schema(table_schema_with_full_nested_fields: 
Schema) -> None: actual = table_schema_with_full_nested_fields.model_dump_json() - expected = """{"type":"struct","fields":[{"id":1,"name":"foo","type":"string","required":false,"doc":"foo doc","initial-default":"foo initial","write-default":"foo write"},{"id":2,"name":"bar","type":"int","required":true,"doc":"bar doc","initial-default":42,"write-default":43},{"id":3,"name":"baz","type":"boolean","required":false,"doc":"baz doc","initial-default":true,"write-default":false}],"schema-id":1,"identifier-field-ids":[2]}""" # noqa: E501 + expected = ( + '{"type":"struct","fields":[' + '{"id":1,"name":"foo","type":"string","required":false,"doc":"foo doc",' + '"initial-default":"foo initial","write-default":"foo write"},' + '{"id":2,"name":"bar","type":"int","required":true,"doc":"bar doc",' + '"initial-default":42,"write-default":43},' + '{"id":3,"name":"baz","type":"boolean","required":false,"doc":"baz doc",' + '"initial-default":true,"write-default":false}],' + '"schema-id":1,"identifier-field-ids":[2]}' + ) assert actual == expected def test_deserialize_schema(table_schema_with_full_nested_fields: Schema) -> None: actual = Schema.model_validate_json( - """{"type": "struct", "fields": [{"id": 1, "name": "foo", "type": "string", "required": false, "doc": "foo doc", "initial-default": "foo initial", "write-default": "foo write"}, {"id": 2, "name": "bar", "type": "int", "required": true, "doc": "bar doc", "initial-default": 42, "write-default": 43}, {"id": 3, "name": "baz", "type": "boolean", "required": false, "doc": "baz doc", "initial-default": true, "write-default": false}], "schema-id": 1, "identifier-field-ids": [2]}""" # noqa: E501 + '{"type": "struct", "fields": [' + '{"id": 1, "name": "foo", "type": "string", "required": false, "doc": "foo doc", ' + '"initial-default": "foo initial", "write-default": "foo write"}, ' + '{"id": 2, "name": "bar", "type": "int", "required": true, "doc": "bar doc", ' + '"initial-default": 42, "write-default": 43}, ' + '{"id": 3, "name": "baz", "type": "boolean", "required": false, "doc": "baz doc", ' + '"initial-default": true, "write-default": false}], ' + '"schema-id": 1, "identifier-field-ids": [2]}' ) expected = table_schema_with_full_nested_fields assert actual == expected @@ -885,22 +908,22 @@ def test_identifier_fields_fails(table_schema_nested_with_struct_key_map: Schema with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[23]) assert ( - f"Cannot add field zip as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('location')}" # noqa: E501 - in str(exc_info.value) + f"Cannot add field zip as an identifier field: must not be nested in " + f"{table_schema_nested_with_struct_key_map.find_field('location')}" in str(exc_info.value) ) with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[26]) assert ( - f"Cannot add field x as an identifier field: must not be nested in {table_schema_nested_with_struct_key_map.find_field('points')}" # noqa: E501 - in str(exc_info.value) + f"Cannot add field x as an identifier field: must not be nested in " + f"{table_schema_nested_with_struct_key_map.find_field('points')}" in str(exc_info.value) ) with pytest.raises(ValueError) as exc_info: Schema(*table_schema_nested_with_struct_key_map.fields, schema_id=1, identifier_field_ids=[17]) assert ( - f"Cannot add field age as an identifier field: must not be nested in an 
optional field {table_schema_nested_with_struct_key_map.find_field('person')}" # noqa: E501 - in str(exc_info.value) + f"Cannot add field age as an identifier field: must not be nested in an optional field " + f"{table_schema_nested_with_struct_key_map.find_field('person')}" in str(exc_info.value) ) diff --git a/tests/test_types.py b/tests/test_types.py index 7911ba6494..64f3eb95a3 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -533,9 +533,9 @@ def test_repr_nested_field_default_nones_should_not_appear() -> None: repr(NestedField(1, "required_field", StringType(), required=False, initial_default="hello", write_default=None)) == "NestedField(field_id=1, name='required_field', field_type=StringType(), required=False, initial_default='hello')" ) - assert ( - repr(NestedField(1, "required_field", StringType(), required=False, initial_default="hello", write_default="bye")) - == "NestedField(field_id=1, name='required_field', field_type=StringType(), required=False, initial_default='hello', write_default='bye')" # noqa: E501 + assert repr(NestedField(1, "required_field", StringType(), required=False, initial_default="hello", write_default="bye")) == ( + "NestedField(field_id=1, name='required_field', field_type=StringType(), required=False, " + "initial_default='hello', write_default='bye')" ) @@ -633,9 +633,9 @@ def test_str_struct(simple_struct: StructType) -> None: def test_repr_struct(simple_struct: StructType) -> None: - assert ( - repr(simple_struct) - == "StructType(fields=(NestedField(field_id=1, name='required_field', field_type=StringType(), required=True), NestedField(field_id=2, name='optional_field', field_type=IntegerType(), required=False),))" # noqa: E501 + assert repr(simple_struct) == ( + "StructType(fields=(NestedField(field_id=1, name='required_field', field_type=StringType(), required=True), " + "NestedField(field_id=2, name='optional_field', field_type=IntegerType(), required=False),))" ) diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 7ba8a7e084..abd9878ea1 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -79,9 +79,9 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: data_file = manifest_entry.data_file assert data_file.content == DataFileContent.DATA - assert ( - data_file.file_path - == "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" # noqa: E501 + assert data_file.file_path == ( + "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/" + "00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" ) assert data_file.file_format == FileFormat.PARQUET assert repr(data_file.partition) == "Record[1, 1925]" @@ -396,7 +396,10 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""", # noqa: E501 + "partition-spec": ( + '[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},' + '{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]' + ), "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } @@ -416,9 +419,9 @@ def test_write_manifest( data_file = manifest_entry.data_file assert data_file.content == DataFileContent.DATA - assert ( - data_file.file_path - == 
"/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" # noqa: E501 + assert data_file.file_path == ( + "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/" + "00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet" ) assert data_file.file_format == FileFormat.PARQUET assert data_file.partition == Record(1, 1925)