67 changes: 67 additions & 0 deletions data/geospatial/README.md
@@ -0,0 +1,67 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->

# Geospatial Test Files


These test files cover the core functionality and corner cases of the
[Parquet Geospatial Types](https://github.com/apache/parquet-format/blob/master/Geospatial.md)
GEOMETRY and GEOGRAPHY.

- `geospatial.parquet`: Contains row groups with specific combinations of
geometry types to test statistics generation and geometry type coverage.
The file contains columns `group` (a string identifier of the group name),
`wkt` (the human-readable well-known text representation of the geometry),
and `geometry` (a Parquet GEOMETRY column). A human-readable version of
the file is available in `geospatial.yaml`. A short reading sketch follows
this list.

- `geospatial-with-nan.parquet`: Contains a single row group with a GEOMETRY
column containing two valid geometries and one invalid LINESTRING whose
coordinates are `NaN` in all dimensions. Such a geometry is not valid and its
behaviour is not defined; however, implementations should not generate
statistics that would cause the other (valid) geometries in the column chunk
to be filtered out during predicate pushdown. Notably, implementations should
*not* generate statistics that contain `NaN` in this case.

  Note that POINT EMPTY is by convention represented in well-known binary as
  a POINT whose coordinates are all `NaN`; this should be treated as a valid
  (but empty) geometry (see the WKB sketch after this list).

- `crs-default.parquet`: Contains a GEOMETRY column with the crs
omitted. This should be interpreted as OGC:CRS84 (i.e., longitude/latitude).

- `crs-geography.parquet`: Contains a GEOGRAPHY column with the crs
omitted. This should be interpreted as OGC:CRS84 (i.e., longitude/latitude).

- `crs-projjson.parquet`: Contains a GEOMETRY column with the crs parameter
set to `projjson:projjson_epsg_5070`, plus a file-level metadata field whose
key is `projjson_epsg_5070` and whose value is the PROJJSON representation
of EPSG:5070.

- `crs-srid.parquet`: Contains a GEOMETRY column with the crs parameter set
to `srid:5070`. The Parquet format does not reference the EPSG database in
any way, but SRID values that lack other context are commonly interpreted
as the corresponding EPSG:xxxx code. Producers of SRIDs that are not meant
as EPSG codes may wish to avoid values that are also valid EPSG codes, to
minimize the chance that consumers making this assumption misinterpret them.

- `crs-arbitrary-value.parquet`: Contains a GEOMETRY column with the crs
parameter set to an arbitrary string value. The Parquet format does not
restrict the value of the crs parameter, and implementations may attempt to
interpret the value or raise an error. A CRS-resolution sketch covering these
conventions follows this list.
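
As a quick orientation, the following is a minimal reading sketch (not part of
the test suite) that loads `geospatial.parquet` with pyarrow and lists each
group name alongside its WKT, assuming the file is in the current directory:

```python
import pyarrow.parquet as pq

# Read the test file and print each group name with its WKT representation.
table = pq.read_table("geospatial.parquet")
for group, wkt in zip(table["group"].to_pylist(), table["wkt"].to_pylist()):
    print(group, wkt)
```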
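
The POINT EMPTY convention mentioned above can be written out explicitly. The
sketch below hand-assembles the conventional well-known binary encoding
(little-endian byte order, geometry type 1, and a coordinate pair of `NaN`
values); the helper name is purely illustrative:

```python
import math
import struct


def point_empty_wkb() -> bytes:
    # 1 byte: little-endian marker; 4 bytes: geometry type 1 (Point);
    # 16 bytes: x and y ordinates, both NaN by convention for POINT EMPTY.
    return struct.pack("<BIdd", 1, 1, float("nan"), float("nan"))


wkb = point_empty_wkb()
_, _, x, y = struct.unpack("<BIdd", wkb)
assert math.isnan(x) and math.isnan(y)
```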
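
Putting the CRS conventions above together, a consumer might resolve the crs
value of a GEOMETRY or GEOGRAPHY column to a `pyproj.CRS` roughly as follows.
This is a sketch rather than a reference implementation: `resolve_crs` is a
hypothetical helper, and the file-level metadata is assumed to be the raw
key/value dictionary exposed by the Parquet reader.

```python
import json

import pyproj


def resolve_crs(crs, file_metadata):
    """Hypothetical helper mapping a Parquet geospatial crs value to pyproj.CRS.

    `crs` is the crs parameter from the GEOMETRY/GEOGRAPHY logical type (or
    None if omitted); `file_metadata` is the file-level key/value metadata
    as a dict mapping bytes keys to bytes values.
    """
    if not crs:
        # Omitted crs means longitude/latitude.
        return pyproj.CRS("OGC:CRS84")
    if crs.startswith("projjson:"):
        # The remainder names a file metadata key whose value is PROJJSON.
        key = crs[len("projjson:"):].encode()
        return pyproj.CRS.from_user_input(json.loads(file_metadata[key]))
    if crs.startswith("srid:"):
        # Out-of-context SRIDs are commonly (but not necessarily) EPSG codes.
        return pyproj.CRS.from_epsg(int(crs[len("srid:"):]))
    # Arbitrary values: attempt interpretation or let pyproj raise.
    return pyproj.CRS.from_user_input(crs)
```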
Binary file added data/geospatial/crs-arbitrary-value.parquet
Binary file not shown.
Binary file added data/geospatial/crs-default.parquet
Binary file not shown.
163 changes: 163 additions & 0 deletions data/geospatial/crs-gen.py
@@ -0,0 +1,163 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import json

from pathlib import Path

import pyarrow as pa
from pyarrow import parquet
import pyproj
import shapely

HERE = Path(__file__).parent


# Using Wyoming because it is the easiest state to inline into a Python file
WYOMING_LOWRES = (
    "POLYGON ((-111.0 45.0, -111.0 41.0, -104.0 41.0, -104.0 45.0, -111.0 45.0))"
)

# We densify the edges such that there is a point every 0.1 degrees to minimize
# the effect of the edge algorithm and coordinate transformation.
WYOMING_HIRES = shapely.from_wkt(WYOMING_LOWRES).segmentize(0.1).wkt


class WkbType(pa.ExtensionType):
    """Minimal geoarrow.wkb implementation"""

    def __init__(self, crs=None, edges=None, *, storage_type=pa.binary(), **kwargs):
        self.crs = crs
        self.edges = edges
        super().__init__(storage_type, "geoarrow.wkb")

    def __arrow_ext_serialize__(self):
        obj = {"crs": self.crs, "edges": self.edges}
        return json.dumps({k: v for k, v in obj.items() if v}).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        obj: dict = json.loads(serialized)
        return WkbType(**obj, storage_type=storage_type)


pa.register_extension_type(WkbType())


def write_crs(type, geometry, name, col_name="geometry", metadata=None):
    schema = pa.schema({"wkt": pa.utf8(), col_name: type})

    with parquet.ParquetWriter(
        HERE / name,
        schema,
        # Not sure if there's a way to write metadata without
        # storing the Arrow schema
        store_schema=metadata is not None,
        compression="none",
    ) as writer:
        batch = pa.record_batch(
            {
                "wkt": [geometry.wkt],
                col_name: type.wrap_array(pa.array([geometry.wkb])),
            }
        )
        writer.write_batch(batch)

        if metadata is not None:
            writer.add_key_value_metadata(metadata)


def write_crs_files():
    # Create the Shapely geometry
    geometry = shapely.from_wkt(WYOMING_HIRES)

    # A general purpose coordinate system for the United States
    crs_not_lonlat = pyproj.CRS("EPSG:5070")
    transformer = pyproj.Transformer.from_crs(
        "OGC:CRS84", crs_not_lonlat, always_xy=True
    )
    geometry_not_lonlat = shapely.transform(
        geometry, transformer.transform, interleaved=False
    )

    # Write with the default CRS (i.e., lon/lat)
    write_crs(WkbType(), geometry, "crs-default.parquet")

    # Write a Geography column with the default CRS
    write_crs(
        WkbType(edges="spherical"),
        geometry,
        "crs-geography.parquet",
        col_name="geography",
    )

    # Write a file with the projjson format in the specification
    # and the appropriate metadata key
    write_crs(
        WkbType(crs="projjson:projjson_epsg_5070"),
        geometry_not_lonlat,
        "crs-projjson.parquet",
        metadata={"projjson_epsg_5070": crs_not_lonlat.to_json()},
    )

    # Write a file with the srid format in the specification
    write_crs(WkbType(crs="srid:5070"), geometry_not_lonlat, "crs-srid.parquet")

    # Write a file with an arbitrary value (theoretically allowed by the format
    # and consumers may choose to error or attempt to interpret the value)
    write_crs(
        WkbType(crs=crs_not_lonlat.to_json_dict()),
        geometry_not_lonlat,
        "crs-arbitrary-value.parquet",
    )


def check_crs_schema(name, expected_col_type):
    file = parquet.ParquetFile(HERE / name)

    col = file.schema.column(1)
    col_dict = json.loads(col.logical_type.to_json())
    col_type = col_dict["Type"]
    if col_type != expected_col_type:
        raise ValueError(
            f"Expected '{expected_col_type}' logical type but got '{col_type}'"
        )


def check_crs_crs(name, expected_crs):
    expected_crs = pyproj.CRS(expected_crs)

    file = parquet.ParquetFile(HERE / name, arrow_extensions_enabled=True)
    ext_type = file.schema_arrow.field(1).type
    actual_crs = pyproj.CRS(ext_type.crs)
    if actual_crs != expected_crs:
        raise ValueError(f"Expected '{expected_crs}' crs but got '{actual_crs}'")


def check_crs(name, expected_col_type, expected_crs):
    check_crs_schema(name, expected_col_type)
    check_crs_crs(name, expected_crs)


if __name__ == "__main__":
write_crs_files()

check_crs("crs-default.parquet", "Geometry", "OGC:CRS84")
check_crs("crs-geography.parquet", "Geography", "OGC:CRS84")
check_crs("crs-projjson.parquet", "Geometry", "EPSG:5070")
check_crs("crs-srid.parquet", "Geometry", "EPSG:5070")
check_crs("crs-arbitrary-value.parquet", "Geometry", "EPSG:5070")
Binary file added data/geospatial/crs-geography.parquet
Binary file not shown.
Binary file added data/geospatial/crs-projjson.parquet
Binary file not shown.
Binary file added data/geospatial/crs-srid.parquet
Binary file not shown.