67 changes: 67 additions & 0 deletions data/geospatial/README.md
@@ -0,0 +1,67 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing,
~ software distributed under the License is distributed on an
~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
~ KIND, either express or implied. See the License for the
~ specific language governing permissions and limitations
~ under the License.
-->

# Geospatial Test Files


These test files cover the core functionality and corner cases of the
[Parquet Geospatial Types](https://github.com/apache/parquet-format/blob/master/Geospatial.md)
GEOMETRY and GEOGRAPHY.

- `geospatial.parquet`: Contains row groups with specific combinations of
geometry types to test statistics generation and geometry type coverage.
The file contains columns `group` (a string identifier of the group name),
`wkt` (the human-readable well-known text representation of the geometry),
and `geometry` (a Parquet GEOMETRY column). A human-readable version of
the file is available in `geospatial.yaml`. A short reading sketch follows
this list.

- `geospatial-with-nan.parquet`: Contains a single row group with a GEOMETRY
column containing two valid geometries and one invalid LINESTRING whose
coordinates are `NaN` in all dimensions. Such a geometry is not valid and its
behaviour is not defined; however, implementations should not generate
statistics that would cause the other (valid) geometries in the column chunk
to be filtered out during predicate pushdown. Notably, implementations should
*not* generate statistics that contain `NaN` in this case.

  Note that POINT EMPTY is by convention represented in well-known binary as
  a POINT whose coordinates are all `NaN`; this should be treated as a valid
  (but empty) geometry (see the WKB sketch after this list).

- `crs-default.parquet`: Contains a GEOMETRY column with the crs
omitted. This should be interpreted as OGC:CRS84 (i.e., longitude/latitude).

- `crs-geography.parquet`: Contains a GEOGRAPHY column with the crs
omitted. This should be interpreted as OGC:CRS84 (i.e., longitude/latitude).

- `crs-projjson.parquet`: Contains a GEOMETRY column with the crs parameter
set to `projjson:projjson_epsg_5070`, plus a file-level metadata field whose
key is `projjson_epsg_5070` and whose value is the PROJJSON representation
of EPSG:5070.

- `crs-srid.parquet`: Contains a GEOMETRY column with the crs parameter set
to `srid:5070`. The Parquet format does not reference the EPSG database in
any way, but SRID values that lack other context are commonly interpreted
as the corresponding EPSG:xxxx code. Producers of SRIDs that are not meant
as EPSG codes may wish to avoid values that are also valid EPSG codes, to
minimize the chance that consumers making this assumption misinterpret them.

- `crs-arbitrary-value.parquet`: Contains a GEOMETRY column with the crs
parameter set to an arbitrary string value. The Parquet format does not
restrict the value of the crs parameter, and implementations may attempt to
interpret the value or raise an error. A CRS-resolution sketch covering these
conventions follows this list.
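
As a quick orientation, the following is a minimal reading sketch (not part of
the test suite) that loads `geospatial.parquet` with pyarrow and lists each
group name alongside its WKT, assuming the file is in the current directory:

```python
import pyarrow.parquet as pq

# Read the test file and print each group name with its WKT representation.
table = pq.read_table("geospatial.parquet")
for group, wkt in zip(table["group"].to_pylist(), table["wkt"].to_pylist()):
    print(group, wkt)
```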
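
The POINT EMPTY convention mentioned above can be written out explicitly. The
sketch below hand-assembles the conventional well-known binary encoding
(little-endian byte order, geometry type 1, and a coordinate pair of `NaN`
values); the helper name is purely illustrative:

```python
import math
import struct


def point_empty_wkb() -> bytes:
    # 1 byte: little-endian marker; 4 bytes: geometry type 1 (Point);
    # 16 bytes: x and y ordinates, both NaN by convention for POINT EMPTY.
    return struct.pack("<BIdd", 1, 1, float("nan"), float("nan"))


wkb = point_empty_wkb()
_, _, x, y = struct.unpack("<BIdd", wkb)
assert math.isnan(x) and math.isnan(y)
```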
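
Putting the CRS conventions above together, a consumer might resolve the crs
value of a GEOMETRY or GEOGRAPHY column to a `pyproj.CRS` roughly as follows.
This is a sketch rather than a reference implementation: `resolve_crs` is a
hypothetical helper, and the file-level metadata is assumed to be the raw
key/value dictionary exposed by the Parquet reader.

```python
import json

import pyproj


def resolve_crs(crs, file_metadata):
    """Hypothetical helper mapping a Parquet geospatial crs value to pyproj.CRS.

    `crs` is the crs parameter from the GEOMETRY/GEOGRAPHY logical type (or
    None if omitted); `file_metadata` is the file-level key/value metadata
    as a dict mapping bytes keys to bytes values.
    """
    if not crs:
        # Omitted crs means longitude/latitude.
        return pyproj.CRS("OGC:CRS84")
    if crs.startswith("projjson:"):
        # The remainder names a file metadata key whose value is PROJJSON.
        key = crs[len("projjson:"):].encode()
        return pyproj.CRS.from_user_input(json.loads(file_metadata[key]))
    if crs.startswith("srid:"):
        # Out-of-context SRIDs are commonly (but not necessarily) EPSG codes.
        return pyproj.CRS.from_epsg(int(crs[len("srid:"):]))
    # Arbitrary values: attempt interpretation or let pyproj raise.
    return pyproj.CRS.from_user_input(crs)
```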
Binary file added data/geospatial/crs-arbitrary-value.parquet
Binary file not shown.
Binary file added data/geospatial/crs-default.parquet
Binary file not shown.
163 changes: 163 additions & 0 deletions data/geospatial/crs-gen.py
@@ -0,0 +1,163 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import json

from pathlib import Path

import pyarrow as pa
from pyarrow import parquet
import pyproj
import shapely

HERE = Path(__file__).parent


# Using Wyoming because it is the easiest state to inline into a Python file
WYOMING_LOWRES = (
    "POLYGON ((-111.0 45.0, -111.0 41.0, -104.0 41.0, -104.0 45.0, -111.0 45.0))"
)

# We densify the edges such that there is a point every 0.1 degrees to minimize
# the effect of the edge algorithm and coordinate transformation.
WYOMING_HIRES = shapely.from_wkt(WYOMING_LOWRES).segmentize(0.1).wkt


class WkbType(pa.ExtensionType):
    """Minimal geoarrow.wkb implementation"""

    def __init__(self, crs=None, edges=None, *, storage_type=pa.binary(), **kwargs):
        self.crs = crs
        self.edges = edges
        super().__init__(storage_type, "geoarrow.wkb")

    def __arrow_ext_serialize__(self):
        obj = {"crs": self.crs, "edges": self.edges}
        return json.dumps({k: v for k, v in obj.items() if v}).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        obj: dict = json.loads(serialized)
        return WkbType(**obj, storage_type=storage_type)


pa.register_extension_type(WkbType())


def write_crs(type, geometry, name, col_name="geometry", metadata=None):
    schema = pa.schema({"wkt": pa.utf8(), col_name: type})

    with parquet.ParquetWriter(
        HERE / name,
        schema,
        # Not sure if there's a way to write metadata without
        # storing the Arrow schema
        store_schema=metadata is not None,
        compression="none",
    ) as writer:
        batch = pa.record_batch(
            {
                "wkt": [geometry.wkt],
                col_name: type.wrap_array(pa.array([geometry.wkb])),
            }
        )
        writer.write_batch(batch)

        if metadata is not None:
            writer.add_key_value_metadata(metadata)


def write_crs_files():
    # Create the Shapely geometry
    geometry = shapely.from_wkt(WYOMING_HIRES)

    # A general purpose coordinate system for the United States
    crs_not_lonlat = pyproj.CRS("EPSG:5070")
    transformer = pyproj.Transformer.from_crs(
        "OGC:CRS84", crs_not_lonlat, always_xy=True
    )
    geometry_not_lonlat = shapely.transform(
        geometry, transformer.transform, interleaved=False
    )

    # Write with the default CRS (i.e., lon/lat)
    write_crs(WkbType(), geometry, "crs-default.parquet")

    # Write a Geography column with the default CRS
    write_crs(
        WkbType(edges="spherical"),
        geometry,
        "crs-geography.parquet",
        col_name="geography",
    )

    # Write a file with the projjson format in the specification
    # and the appropriate metadata key
    write_crs(
        WkbType(crs="projjson:projjson_epsg_5070"),
        geometry_not_lonlat,
        "crs-projjson.parquet",
        metadata={"projjson_epsg_5070": crs_not_lonlat.to_json()},
    )

    # Write a file with the srid format in the specification
    write_crs(WkbType(crs="srid:5070"), geometry_not_lonlat, "crs-srid.parquet")

    # Write a file with an arbitrary value (theoretically allowed by the format
    # and consumers may choose to error or attempt to interpret the value)
    write_crs(
        WkbType(crs=crs_not_lonlat.to_json_dict()),
        geometry_not_lonlat,
        "crs-arbitrary-value.parquet",
    )


def check_crs_schema(name, expected_col_type):
    file = parquet.ParquetFile(HERE / name)

    col = file.schema.column(1)
    col_dict = json.loads(col.logical_type.to_json())
    col_type = col_dict["Type"]
    if col_type != expected_col_type:
        raise ValueError(
            f"Expected '{expected_col_type}' logical type but got '{col_type}'"
        )


def check_crs_crs(name, expected_crs):
    expected_crs = pyproj.CRS(expected_crs)

    file = parquet.ParquetFile(HERE / name, arrow_extensions_enabled=True)
    ext_type = file.schema_arrow.field(1).type
    actual_crs = pyproj.CRS(ext_type.crs)
    if actual_crs != expected_crs:
        raise ValueError(f"Expected '{expected_crs}' crs but got '{actual_crs}'")


def check_crs(name, expected_col_type, expected_crs):
    check_crs_schema(name, expected_col_type)
    check_crs_crs(name, expected_crs)


if __name__ == "__main__":
write_crs_files()

check_crs("crs-default.parquet", "Geometry", "OGC:CRS84")
check_crs("crs-geography.parquet", "Geography", "OGC:CRS84")
check_crs("crs-projjson.parquet", "Geometry", "EPSG:5070")
check_crs("crs-srid.parquet", "Geometry", "EPSG:5070")
check_crs("crs-arbitrary-value.parquet", "Geometry", "EPSG:5070")
Binary file added data/geospatial/crs-geography.parquet
Binary file not shown.
Binary file added data/geospatial/crs-projjson.parquet
Binary file not shown.
Binary file added data/geospatial/crs-srid.parquet
Binary file not shown.