From e25acb96b547c6b794bf30c2f7a31bce50616519 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 11 May 2024 16:41:38 +0800 Subject: [PATCH 1/3] WIP: Add geometry type --- specification/ORCv2.md | 87 ++++++++++++++++++++++++ src/main/proto/orc/proto/orc_proto.proto | 59 ++++++++++++++++ 2 files changed, 146 insertions(+) diff --git a/specification/ORCv2.md b/specification/ORCv2.md index 73daf6e..b3bdeb5 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -261,6 +261,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } // the kind of this type required Kind kind = 1; @@ -273,6 +274,61 @@ message Type { // the precision and scale for decimal optional uint32 precision = 5; optional uint32 scale = 6; + repeated StringPair attributes = 7; + // the attributes associated with the geometry type + optional GeometryType geometry = 8; +} +``` + +#### Geometry Type + +Geometry type requires additional information as described in the GeometryType +message below. These attributes limit the scope of geospatial features that +we can support for now. For example, only 2D geometries and OGC:CRS84 are +supported. + +``` +message GeometryType { + // A geometry can be any of the following geospatial subtype list, + // which is taken from the OGC (Open Geospatial Consortium) + // SFA (Simple Feature Access) Part 1- Common Architecture. + // If subtype is set, all values in the column must be of the same subtype; + // otherwise, the column may contain value of any subtype. + enum SubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRYCOLLECTION = 6; + } + optional SubType subtype = 1; + + // The dimension of the geometry. + // For now only 2D geometry is supported and the value must be 2 if set. + optional int32 dimension = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. + // For now only OGC:CRS84 is supported. + optional string crs = 3; + + // Interpretation for edges, i.e. whether the edge between points + // represent a straight cartesian line or the shortest line on the sphere + enum Edges { + PLANAR = 0; + // SPHERICAL = 1; + } + optional Edges edges = 4; + + enum GeospatialEncoding { + // The geometry is stored as a Well-Known Binary (WKB) binary data. + // This is a well-known and popular binary representation regulated by + // the Open Geospatial Consortium (OGC). + WKB = 0; + } + optional GeospatialEncoding encoding = 5; } ``` @@ -303,6 +359,7 @@ message ColumnStatistics { optional bool hasNull = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } ``` @@ -397,6 +454,21 @@ message BinaryStatistics { } ``` +Geometry columns store coordinates of the bounding box built from all values. +For example, all 2D geometries are regarded as a collection of coordinate +(x, y). POINT has one coordinate, LINESTRING has two coordinates, and POLYGON +might have three or more coordinates. A bounding box is the combination of +x_min, x_max, y_min, and y_max of all coordinates from all geometry values. + +``` +message GeometryStatistics { + optional double min_x = 1; + optional double max_x = 2; + optional double min_y = 3; + optional double max_y = 4; +} +``` + ### User Metadata The user can add arbitrary key/value pairs to an ORC file as it is @@ -1235,6 +1307,21 @@ Encoding | Stream Kind | Optional | Contents DIRECT | PRESENT | Yes | Boolean RLE | DIRECT | No | Byte RLE +## Geometry Columns + +Geometry data is encoded with a PRESENT stream, a DATA stream that records +the WKB-encoded geometry data as binary, and a LENGTH stream that records +the number of bytes per a value. + +Encoding | Stream Kind | Optional | Contents +:------------ | :-------------- | :------- | :------- +DIRECT | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v1 +DIRECT_V2 | PRESENT | Yes | Boolean RLE + | DATA | No | Binary contents + | LENGTH | No | Unsigned Integer RLE v2 + # Indexes ## Row Group Index diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index 16c5523..c4cc4ac 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,6 +84,19 @@ message CollectionStatistics { optional uint64 total_children = 3; } +// Statistics for geometry, which are coordinates of bounding box built from +// geometry objects in the same column. For example, all 2D geometries are +// regarded as a collection of coordinate (x, y). POINT has one coordinate, +// LINESTRING has two coordinates, and POLYGON might have three or more +// coordinates. A bounding box is the combination of x_min, x_max, y_min, and +// y_max of all coordinates from all geometry values. +message GeometryStatistics { + optional double min_x = 1; + optional double max_x = 2; + optional double min_y = 3; + optional double max_y = 4; +} + message ColumnStatistics { optional uint64 number_of_values = 1; optional IntegerStatistics int_statistics = 2; @@ -97,6 +110,7 @@ message ColumnStatistics { optional bool has_null = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; + optional GeometryStatistics geometry_statistics = 13; } message RowIndexEntry { @@ -195,6 +209,49 @@ message StringPair { optional string value = 2; } +message GeometryType { + // A geometry can be any of the following geospatial subtype list, + // which is taken from the OGC (Open Geospatial Consortium) + // SFA (Simple Feature Access) Part 1- Common Architecture. + // If subtype is set, all values in the column must be of the same subtype; + // otherwise, the column may contain value of any subtype. + enum SubType { + POINT = 0; + LINESTRING = 1; + POLYGON = 2; + MULTIPOINT = 3; + MULTILINESTRING = 4; + MULTIPOLYGON = 5; + GEOMETRYCOLLECTION = 6; + } + optional SubType subtype = 1; + + // The dimension of the geometry. + // For now only 2D geometry is supported and the value must be 2 if set. + optional int32 dimension = 2; + + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. + // For now only OGC:CRS84 is supported. + optional string crs = 3; + + // Interpretation for edges, i.e. whether the edge between points + // represent a straight cartesian line or the shortest line on the sphere + enum Edges { + PLANAR = 0; + // SPHERICAL = 1; + } + optional Edges edges = 4; + + enum GeospatialEncoding { + // The geometry is stored as a Well-Known Binary (WKB) binary data. + // This is a well-known and popular binary representation regulated by + // the Open Geospatial Consortium (OGC). + WKB = 0; + } + optional GeospatialEncoding encoding = 5; +} + message Type { enum Kind { BOOLEAN = 0; @@ -216,6 +273,7 @@ message Type { VARCHAR = 16; CHAR = 17; TIMESTAMP_INSTANT = 18; + GEOMETRY = 19; } optional Kind kind = 1; repeated uint32 subtypes = 2 [packed=true]; @@ -224,6 +282,7 @@ message Type { optional uint32 precision = 5; optional uint32 scale = 6; repeated StringPair attributes = 7; + optional GeometryType geometry = 8; } message StripeInformation { From 05a09ac91367c21b3edc5252789ab269a32449e1 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 22 Aug 2024 13:59:16 +0800 Subject: [PATCH 2/3] reflect latest change from parquet proposal --- specification/ORCv2.md | 176 +++++++++++++++++------ src/main/proto/orc/proto/orc_proto.proto | 170 ++++++++++++++++------ 2 files changed, 256 insertions(+), 90 deletions(-) diff --git a/specification/ORCv2.md b/specification/ORCv2.md index b3bdeb5..dd2d874 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -284,51 +284,81 @@ message Type { Geometry type requires additional information as described in the GeometryType message below. These attributes limit the scope of geospatial features that -we can support for now. For example, only 2D geometries and OGC:CRS84 are -supported. +we can support for now. ``` message GeometryType { - // A geometry can be any of the following geospatial subtype list, - // which is taken from the OGC (Open Geospatial Consortium) - // SFA (Simple Feature Access) Part 1- Common Architecture. - // If subtype is set, all values in the column must be of the same subtype; - // otherwise, the column may contain value of any subtype. - enum SubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRYCOLLECTION = 6; + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; } - optional SubType subtype = 1; - - // The dimension of the geometry. - // For now only 2D geometry is supported and the value must be 2 if set. - optional int32 dimension = 2; + required GeospatialEncoding encoding = 1; - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. - // For now only OGC:CRS84 is supported. - optional string crs = 3; - - // Interpretation for edges, i.e. whether the edge between points - // represent a straight cartesian line or the shortest line on the sphere + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. enum Edges { PLANAR = 0; - // SPHERICAL = 1; + SPHERICAL = 1; } - optional Edges edges = 4; + required Edges edges = 2; - enum GeospatialEncoding { - // The geometry is stored as a Well-Known Binary (WKB) binary data. - // This is a well-known and popular binary representation regulated by - // the Open Geospatial Consortium (OGC). - WKB = 0; - } - optional GeospatialEncoding encoding = 5; + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together. + // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; } ``` @@ -454,18 +484,72 @@ message BinaryStatistics { } ``` -Geometry columns store coordinates of the bounding box built from all values. -For example, all 2D geometries are regarded as a collection of coordinate -(x, y). POINT has one coordinate, LINESTRING has two coordinates, and POLYGON -might have three or more coordinates. A bounding box is the combination of -x_min, x_max, y_min, and y_max of all coordinates from all geometry values. +Geometry columns store optional bounding boxes, coverings and list of +geometry type codes from all values. + +``` +// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. +// Filter pushdown on geometries are only safe for planar spatial predicate +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents. This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} -``` message GeometryStatistics { - optional double min_x = 1; - optional double max_x = 2; - optional double min_y = 3; - optional double max_y = 4; + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + // except that values in the list are WKB (ISO variant) integer codes [2]. Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid). + // + // Please refer to links below for more detail: + // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + repeated int32 geometry_types = 3; } ``` diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index c4cc4ac..d57b5d8 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,17 +84,68 @@ message CollectionStatistics { optional uint64 total_children = 3; } -// Statistics for geometry, which are coordinates of bounding box built from -// geometry objects in the same column. For example, all 2D geometries are -// regarded as a collection of coordinate (x, y). POINT has one coordinate, -// LINESTRING has two coordinates, and POLYGON might have three or more -// coordinates. A bounding box is the combination of x_min, x_max, y_min, and -// y_max of all coordinates from all geometry values. +// Bounding box of geometries in the representation of min/max value pair of +// coordinates from each axis. Values of Z and M are omitted for 2D geometries. +// Filter pushdown on geometries are only safe for planar spatial predicate +// but it is recommended that the writer always generates bounding boxes +// regardless of whether the geometries are planar or spherical. +message BoundingBox { + required double xmin = 1; + required double xmax = 2; + required double ymin = 3; + required double ymax = 4; + optional double zmin = 5; + optional double zmax = 6; + optional double mmin = 7; + optional double mmax = 8; +} + +// A custom binary-encoded polygon or multi-polygon to represent a covering of +// geometries. For example, it may be a bounding box or an envelope when a +// bounding box cannot be built (e.g., a geometry has spherical edges, or if +// an edge of geographic coordinates crosses the antimeridian). In addition, +// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. +message Covering { + // A type of covering. Currently accepted values: "WKB". + optional string kind = 1; + // A payload specific to the kind. Below are the supported values: + // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely + // covers the contents. This will be interpreted according to the same CRS + // and edges defined by the logical type. + optional bytes value = 2; +} + message GeometryStatistics { - optional double min_x = 1; - optional double max_x = 2; - optional double min_y = 3; - optional double max_y = 4; + // The bounding box of geometries in the column. + optional BoundingBox bbox = 1; + // List of coverings of geometries in the column. + repeated Covering coverings = 2; + // The geometry types of all geometries, or an empty array if they are not + // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] + // except that values in the list are WKB (ISO variant) integer codes [2]. Table + // below shows the most common geometry types and their codes: + // + // | Type | XY | XYZ | XYM | XYZM | + // | :----------------- | :--- | :--- | :--- | :--: | + // | Point | 0001 | 1001 | 2001 | 3001 | + // | LineString | 0002 | 1002 | 2002 | 3002 | + // | Polygon | 0003 | 1003 | 2003 | 3003 | + // | MultiPoint | 0004 | 1004 | 2004 | 3004 | + // | MultiLineString | 0005 | 1005 | 2005 | 3005 | + // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + // + // In addition, the following rules are used: + // - A list of multiple values indicates that multiple geometry types are + // present (e.g. `[0003, 0006]`). + // - An empty array explicitly signals that the geometry types are not known. + // - The geometry types in the list must be unique (e.g. `[0001, 0001]` + // is not valid). + // + // Please refer to links below for more detail: + // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 + repeated int32 geometry_types = 3; } message ColumnStatistics { @@ -210,46 +261,77 @@ message StringPair { } message GeometryType { - // A geometry can be any of the following geospatial subtype list, - // which is taken from the OGC (Open Geospatial Consortium) - // SFA (Simple Feature Access) Part 1- Common Architecture. - // If subtype is set, all values in the column must be of the same subtype; - // otherwise, the column may contain value of any subtype. - enum SubType { - POINT = 0; - LINESTRING = 1; - POLYGON = 2; - MULTIPOINT = 3; - MULTILINESTRING = 4; - MULTIPOLYGON = 5; - GEOMETRYCOLLECTION = 6; + enum GeometryEncoding { + // Well-known binary (WKB) representations of geometries. + // + // To be clear, we follow the same rule of WKB and coordinate axis order + // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM + // and the standard geometry types (Point, LineString, Polygon, MultiPoint, + // MultiLineString, MultiPolygon, and GeometryCollection). + // + // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 + // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 + WKB = 0; } - optional SubType subtype = 1; + required GeometryEncoding encoding = 1; - // The dimension of the geometry. - // For now only 2D geometry is supported and the value must be 2 if set. - optional int32 dimension = 2; - - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. - // For now only OGC:CRS84 is supported. - optional string crs = 3; - - // Interpretation for edges, i.e. whether the edge between points - // represent a straight cartesian line or the shortest line on the sphere + // Interpretation for edges of non-point geometry objects, i.e. whether the + // edge between points represent a straight cartesian line or the shortest + // line on the sphere. enum Edges { PLANAR = 0; - // SPHERICAL = 1; + SPHERICAL = 1; } - optional Edges edges = 4; + required Edges edges = 2; - enum GeospatialEncoding { - // The geometry is stored as a Well-Known Binary (WKB) binary data. - // This is a well-known and popular binary representation regulated by - // the Open Geospatial Consortium (OGC). - WKB = 0; - } - optional GeospatialEncoding encoding = 5; + // Coordinate Reference System, i.e. mapping of how coordinates refer to + // precise locations on earth. Writers are not required to set this field. + // Once crs is set, crs_encoding field below MUST be set together. + // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: + // { + // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", + // "type": "GeographicCRS", + // "name": "WGS 84 longitude-latitude", + // "datum": { + // "type": "GeodeticReferenceFrame", + // "name": "World Geodetic System 1984", + // "ellipsoid": { + // "name": "WGS 84", + // "semi_major_axis": 6378137, + // "inverse_flattening": 298.257223563 + // } + // }, + // "coordinate_system": { + // "subtype": "ellipsoidal", + // "axis": [ + // { + // "name": "Geodetic longitude", + // "abbreviation": "Lon", + // "direction": "east", + // "unit": "degree" + // }, + // { + // "name": "Geodetic latitude", + // "abbreviation": "Lat", + // "direction": "north", + // "unit": "degree" + // } + // ] + // }, + // "id": { + // "authority": "OGC", + // "code": "CRS84" + // } + // } + // + optional string crs = 3; + // Encoding used in the above crs field. It MUST be set if crs field is set. + // Currently the only allowed value is "PROJJSON". + optional string crs_encoding = 4; + + // Additional informative metadata about the geometry type. + // Recommended to write a JSON-encoded UTF-8 string. + optional string metadata = 5; } message Type { From 753bffc1dd218051ec7d97ef51879824e3044d70 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 3 Apr 2025 14:00:05 +0800 Subject: [PATCH 3/3] reflect the latest parquet spec --- specification/ORCv2.md | 288 ++++++++++++----------- src/main/proto/orc/proto/orc_proto.proto | 155 ++---------- 2 files changed, 171 insertions(+), 272 deletions(-) diff --git a/specification/ORCv2.md b/specification/ORCv2.md index dd2d874..9aaeb91 100644 --- a/specification/ORCv2.md +++ b/specification/ORCv2.md @@ -262,6 +262,7 @@ message Type { CHAR = 17; TIMESTAMP_INSTANT = 18; GEOMETRY = 19; + GEOGRAPHY = 20; } // the kind of this type required Kind kind = 1; @@ -277,90 +278,80 @@ message Type { repeated StringPair attributes = 7; // the attributes associated with the geometry type optional GeometryType geometry = 8; + // Coordinate Reference System (CRS) for Geometry and Geography types + optional string crs = 8; + // Edge interpolation algorithm for Geography type + enum EdgeInterpolationAlgorithm { + SPHERICAL = 0; + VINCENTY = 1; + THOMAS = 2; + ANDOYER = 3; + KARNEY = 4; + } + optional EdgeInterpolationAlgorithm algorithm = 9; } ``` -#### Geometry Type - -Geometry type requires additional information as described in the GeometryType -message below. These attributes limit the scope of geospatial features that -we can support for now. - -``` -message GeometryType { - enum GeometryEncoding { - // Well-known binary (WKB) representations of geometries. - // - // To be clear, we follow the same rule of WKB and coordinate axis order - // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM - // and the standard geometry types (Point, LineString, Polygon, MultiPoint, - // MultiLineString, MultiPolygon, and GeometryCollection). - // - // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 - // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 - WKB = 0; - } - required GeospatialEncoding encoding = 1; - - // Interpretation for edges of non-point geometry objects, i.e. whether the - // edge between points represent a straight cartesian line or the shortest - // line on the sphere. - enum Edges { - PLANAR = 0; - SPHERICAL = 1; - } - required Edges edges = 2; - - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. Writers are not required to set this field. - // Once crs is set, crs_encoding field below MUST be set together. - // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: - // { - // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", - // "type": "GeographicCRS", - // "name": "WGS 84 longitude-latitude", - // "datum": { - // "type": "GeodeticReferenceFrame", - // "name": "World Geodetic System 1984", - // "ellipsoid": { - // "name": "WGS 84", - // "semi_major_axis": 6378137, - // "inverse_flattening": 298.257223563 - // } - // }, - // "coordinate_system": { - // "subtype": "ellipsoidal", - // "axis": [ - // { - // "name": "Geodetic longitude", - // "abbreviation": "Lon", - // "direction": "east", - // "unit": "degree" - // }, - // { - // "name": "Geodetic latitude", - // "abbreviation": "Lat", - // "direction": "north", - // "unit": "degree" - // } - // ] - // }, - // "id": { - // "authority": "OGC", - // "code": "CRS84" - // } - // } - // - optional string crs = 3; - // Encoding used in the above crs field. It MUST be set if crs field is set. - // Currently the only allowed value is "PROJJSON". - optional string crs_encoding = 4; - - // Additional informative metadata about the geometry type. - // Recommended to write a JSON-encoded UTF-8 string. - optional string metadata = 5; -} -``` +#### Geometry & Geography Types + +##### Background + +The Geometry and Geography class hierarchy and its Well-Known Text (WKT) and +Well-Known Binary (WKB) serializations (ISO variant supporting XY, XYZ, XYM, +XYZM) are defined by [OpenGIS Implementation Specification for Geographic +information - Simple feature access - Part 1: Common architecture][sfa-part1], +from [OGC(Open Geospatial Consortium)][ogc]. + +The version of the OGC standard first used here is 1.2.1, but future versions +may also be used if the WKB representation remains wire-compatible. + +[sfa-part1]: https://portal.ogc.org/files/?artifact_id=25355 +[ogc]: https://www.ogc.org/standard/sfa/ + +###### Coordinate Reference System + +Coordinate Reference System (CRS) is a mapping of how coordinates refer to +locations on Earth. + +The default CRS `OGC:CRS84` means that the geospatial features must be stored +in the order of longitude/latitude based on the WGS84 datum. + +Custom CRS can be specified by a string value. It is recommended to use an +identifier-based approach like [Spatial reference identifier][srid]. + +For geographic CRS, longitudes are bound by [-180, 180] and latitudes are bound +by [-90, 90]. + +[srid]: https://en.wikipedia.org/wiki/Spatial_reference_system#Identifier + +###### Edge Interpolation Algorithm + +An algorithm for interpolating edges, and is one of the following values: + +* `spherical`: edges are interpolated as geodesics on a sphere. +* `vincenty`: [https://en.wikipedia.org/wiki/Vincenty%27s_formulae](https://en.wikipedia.org/wiki/Vincenty%27s_formulae) +* `thomas`: Thomas, Paul D. Spheroidal geodesics, reference systems, & local geometry. US Naval Oceanographic Office, 1970. +* `andoyer`: Thomas, Paul D. Mathematical models for navigation systems. US Naval Oceanographic Office, 1965. +* `karney`: [Karney, Charles FF. "Algorithms for geodesics." Journal of Geodesy 87 (2013): 43-55](https://link.springer.com/content/pdf/10.1007/s00190-012-0578-z.pdf), and [GeographicLib](https://geographiclib.sourceforge.io/) + +###### CRS Customization + +CRS is represented as a string value. Writer and reader implementations are +responsible for serializing and deserializing the CRS, respectively. + +As a convention to maximize the interoperability, custom CRS values can be +specified by a string of the format `type:identifier`, where `type` is one of +the following values: + +* `srid`: [Spatial reference identifier](https://en.wikipedia.org/wiki/Spatial_reference_system#Identifier), `identifier` is the SRID itself. +* `projjson`: [PROJJSON](https://proj.org/en/stable/specifications/projjson.html), `identifier` is the name of a table property or a file property where the projjson string is stored. + +###### Coordinate Axis Order + +The axis order of the coordinates in WKB and bounding box stored here +follows the de facto standard for axis order in WKB and is therefore always +(x, y) where x is easting or longitude and y is northing or latitude. This +ordering explicitly overrides the axis order as specified in the CRS. ### Column Statistics @@ -389,7 +380,7 @@ message ColumnStatistics { optional bool hasNull = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; - optional GeometryStatistics geometry_statistics = 13; + optional GeospatialStatistics geospatial_statistics = 13; } ``` @@ -484,72 +475,85 @@ message BinaryStatistics { } ``` -Geometry columns store optional bounding boxes, coverings and list of -geometry type codes from all values. +Geometry and Geography columns store optional bounding boxes and list of +geospatial type codes from all values. -``` -// Bounding box of geometries in the representation of min/max value pair of -// coordinates from each axis. Values of Z and M are omitted for 2D geometries. -// Filter pushdown on geometries are only safe for planar spatial predicate -// but it is recommended that the writer always generates bounding boxes -// regardless of whether the geometries are planar or spherical. +**Bounding Box** + +A geospatial instance has at least two coordinate dimensions: X and Y for 2D +coordinates of each point. Please note that X is longitude/easting and Y is +latitude/northing. A geospatial instance can optionally have Z and/or M values +associated with each point. + +The Z values introduce the third dimension coordinate. Usually they are used to +indicate the height, or elevation. + +M values are an opportunity for a geospatial instance to express a fourth +dimension as a coordinate value. These values can be used as a linear reference +value (e.g., highway milepost value), a timestamp, or some other value as defined +by the CRS. + +Bounding box is defined as the thrift struct below in the representation of +min/max value pair of coordinates from each axis. Note that X and Y Values are +always present. Z and M are omitted for 2D geospatial instances. + +For the X values only, xmin may be greater than xmax. In this case, an object +in this bounding box may match if it contains an X such that `x >= xmin` OR +`x <= xmax`. This wraparound occurs only when the corresponding bounding box +crosses the antimeridian line. In geographic terminology, the concepts of `xmin`, +`xmax`, `ymin`, and `ymax` are also known as `westernmost`, `easternmost`, +`southernmost` and `northernmost`, respectively. + +For Geography type, X and Y values are restricted to the canonical ranges of +[-180, 180] for X and [-90, 90] for Y. + +**Geospatial Types** + +A list of geospatial types from all instances in the Geometry or Geography +column, or an empty list if they are not known. + +This is borrowed from [geometry_types of GeoParquet][geometry-types] except that +values in the list are [WKB (ISO-variant) integer codes][wkb-integer-code]. +Table below shows the most common geospatial types and their codes: + +| Type | XY | XYZ | XYM | XYZM | +| :----------------- | :--- | :--- | :--- | :--: | +| Point | 0001 | 1001 | 2001 | 3001 | +| LineString | 0002 | 1002 | 2002 | 3002 | +| Polygon | 0003 | 1003 | 2003 | 3003 | +| MultiPoint | 0004 | 1004 | 2004 | 3004 | +| MultiLineString | 0005 | 1005 | 2005 | 3005 | +| MultiPolygon | 0006 | 1006 | 2006 | 3006 | +| GeometryCollection | 0007 | 1007 | 2007 | 3007 | + +In addition, the following rules are applied: +- A list of multiple values indicates that multiple geospatial types are present (e.g. `[0003, 0006]`). +- An empty array explicitly signals that the geospatial types are not known. +- The geospatial types in the list must be unique (e.g. `[0001, 0001]` is not valid). + +[geometry-types]: https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 +[wkb-integer-code]: https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary + +```protobuf +// Bounding box for Geometry or Geography type in the representation of min/max +// value pair of coordinates from each axis. message BoundingBox { - required double xmin = 1; - required double xmax = 2; - required double ymin = 3; - required double ymax = 4; + optional double xmin = 1; + optional double xmax = 2; + optional double ymin = 3; + optional double ymax = 4; optional double zmin = 5; optional double zmax = 6; optional double mmin = 7; optional double mmax = 8; } -// A custom binary-encoded polygon or multi-polygon to represent a covering of -// geometries. For example, it may be a bounding box or an envelope when a -// bounding box cannot be built (e.g., a geometry has spherical edges, or if -// an edge of geographic coordinates crosses the antimeridian). In addition, -// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. -message Covering { - // A type of covering. Currently accepted values: "WKB". - optional string kind = 1; - // A payload specific to the kind. Below are the supported values: - // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely - // covers the contents. This will be interpreted according to the same CRS - // and edges defined by the logical type. - optional bytes value = 2; -} - -message GeometryStatistics { - // The bounding box of geometries in the column. +// Statistics specific to Geometry or Geography type +message GeospatialStatistics { + // A bounding box of geospatial instances optional BoundingBox bbox = 1; - // List of coverings of geometries in the column. - repeated Covering coverings = 2; - // The geometry types of all geometries, or an empty array if they are not - // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] - // except that values in the list are WKB (ISO variant) integer codes [2]. Table - // below shows the most common geometry types and their codes: - // - // | Type | XY | XYZ | XYM | XYZM | - // | :----------------- | :--- | :--- | :--- | :--: | - // | Point | 0001 | 1001 | 2001 | 3001 | - // | LineString | 0002 | 1002 | 2002 | 3002 | - // | Polygon | 0003 | 1003 | 2003 | 3003 | - // | MultiPoint | 0004 | 1004 | 2004 | 3004 | - // | MultiLineString | 0005 | 1005 | 2005 | 3005 | - // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | - // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | - // - // In addition, the following rules are used: - // - A list of multiple values indicates that multiple geometry types are - // present (e.g. `[0003, 0006]`). - // - An empty array explicitly signals that the geometry types are not known. - // - The geometry types in the list must be unique (e.g. `[0001, 0001]` - // is not valid). - // - // Please refer to links below for more detail: - // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 - repeated int32 geometry_types = 3; + // Geospatial type codes of all instances, or an empty list if not known + repeated int32 geospatial_types = 2; } ``` @@ -1391,10 +1395,10 @@ Encoding | Stream Kind | Optional | Contents DIRECT | PRESENT | Yes | Boolean RLE | DIRECT | No | Byte RLE -## Geometry Columns +## Geometry & Geography Columns -Geometry data is encoded with a PRESENT stream, a DATA stream that records -the WKB-encoded geometry data as binary, and a LENGTH stream that records +Geometry and Geography data are encoded with a PRESENT stream, a DATA stream that records +the WKB-encoded geometry/geography data as binary, and a LENGTH stream that records the number of bytes per a value. Encoding | Stream Kind | Optional | Contents diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto index d57b5d8..1c38fc7 100644 --- a/src/main/proto/orc/proto/orc_proto.proto +++ b/src/main/proto/orc/proto/orc_proto.proto @@ -84,68 +84,25 @@ message CollectionStatistics { optional uint64 total_children = 3; } -// Bounding box of geometries in the representation of min/max value pair of -// coordinates from each axis. Values of Z and M are omitted for 2D geometries. -// Filter pushdown on geometries are only safe for planar spatial predicate -// but it is recommended that the writer always generates bounding boxes -// regardless of whether the geometries are planar or spherical. +// Bounding box for Geometry or Geography type in the representation of min/max +// value pair of coordinates from each axis. message BoundingBox { - required double xmin = 1; - required double xmax = 2; - required double ymin = 3; - required double ymax = 4; + optional double xmin = 1; + optional double xmax = 2; + optional double ymin = 3; + optional double ymax = 4; optional double zmin = 5; optional double zmax = 6; optional double mmin = 7; optional double mmax = 8; } -// A custom binary-encoded polygon or multi-polygon to represent a covering of -// geometries. For example, it may be a bounding box or an envelope when a -// bounding box cannot be built (e.g., a geometry has spherical edges, or if -// an edge of geographic coordinates crosses the antimeridian). In addition, -// it can also be used to provide vendor-agnostic coverings like S2 or H3 grids. -message Covering { - // A type of covering. Currently accepted values: "WKB". - optional string kind = 1; - // A payload specific to the kind. Below are the supported values: - // - WKB: well-known binary of a POLYGON or MULTI-POLYGON that completely - // covers the contents. This will be interpreted according to the same CRS - // and edges defined by the logical type. - optional bytes value = 2; -} - -message GeometryStatistics { - // The bounding box of geometries in the column. +// Statistics specific to Geometry or Geography type +message GeospatialStatistics { + // A bounding box of geospatial instances optional BoundingBox bbox = 1; - // List of coverings of geometries in the column. - repeated Covering coverings = 2; - // The geometry types of all geometries, or an empty array if they are not - // known. This is borrowed from `geometry_types` column metadata of GeoParquet [1] - // except that values in the list are WKB (ISO variant) integer codes [2]. Table - // below shows the most common geometry types and their codes: - // - // | Type | XY | XYZ | XYM | XYZM | - // | :----------------- | :--- | :--- | :--- | :--: | - // | Point | 0001 | 1001 | 2001 | 3001 | - // | LineString | 0002 | 1002 | 2002 | 3002 | - // | Polygon | 0003 | 1003 | 2003 | 3003 | - // | MultiPoint | 0004 | 1004 | 2004 | 3004 | - // | MultiLineString | 0005 | 1005 | 2005 | 3005 | - // | MultiPolygon | 0006 | 1006 | 2006 | 3006 | - // | GeometryCollection | 0007 | 1007 | 2007 | 3007 | - // - // In addition, the following rules are used: - // - A list of multiple values indicates that multiple geometry types are - // present (e.g. `[0003, 0006]`). - // - An empty array explicitly signals that the geometry types are not known. - // - The geometry types in the list must be unique (e.g. `[0001, 0001]` - // is not valid). - // - // Please refer to links below for more detail: - // [1] https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Well-known_binary - // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L159 - repeated int32 geometry_types = 3; + // Geospatial type codes of all instances, or an empty list if not known + repeated int32 geospatial_types = 2; } message ColumnStatistics { @@ -161,7 +118,7 @@ message ColumnStatistics { optional bool has_null = 10; optional uint64 bytes_on_disk = 11; optional CollectionStatistics collection_statistics = 12; - optional GeometryStatistics geometry_statistics = 13; + optional GeospatialStatistics geospatial_statistics = 13; } message RowIndexEntry { @@ -260,80 +217,6 @@ message StringPair { optional string value = 2; } -message GeometryType { - enum GeometryEncoding { - // Well-known binary (WKB) representations of geometries. - // - // To be clear, we follow the same rule of WKB and coordinate axis order - // from GeoParquet [1][2]. It is the ISO WKB supporting XY, XYZ, XYM, XYZM - // and the standard geometry types (Point, LineString, Polygon, MultiPoint, - // MultiLineString, MultiPolygon, and GeometryCollection). - // - // [1] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L92 - // [2] https://github.com/opengeospatial/geoparquet/blob/v1.1.0/format-specs/geoparquet.md?plain=1#L155 - WKB = 0; - } - required GeometryEncoding encoding = 1; - - // Interpretation for edges of non-point geometry objects, i.e. whether the - // edge between points represent a straight cartesian line or the shortest - // line on the sphere. - enum Edges { - PLANAR = 0; - SPHERICAL = 1; - } - required Edges edges = 2; - - // Coordinate Reference System, i.e. mapping of how coordinates refer to - // precise locations on earth. Writers are not required to set this field. - // Once crs is set, crs_encoding field below MUST be set together. - // For example, "OGC:CRS84" can be set in the form of PROJJSON as below: - // { - // "$schema": "https://proj.org/schemas/v0.5/projjson.schema.json", - // "type": "GeographicCRS", - // "name": "WGS 84 longitude-latitude", - // "datum": { - // "type": "GeodeticReferenceFrame", - // "name": "World Geodetic System 1984", - // "ellipsoid": { - // "name": "WGS 84", - // "semi_major_axis": 6378137, - // "inverse_flattening": 298.257223563 - // } - // }, - // "coordinate_system": { - // "subtype": "ellipsoidal", - // "axis": [ - // { - // "name": "Geodetic longitude", - // "abbreviation": "Lon", - // "direction": "east", - // "unit": "degree" - // }, - // { - // "name": "Geodetic latitude", - // "abbreviation": "Lat", - // "direction": "north", - // "unit": "degree" - // } - // ] - // }, - // "id": { - // "authority": "OGC", - // "code": "CRS84" - // } - // } - // - optional string crs = 3; - // Encoding used in the above crs field. It MUST be set if crs field is set. - // Currently the only allowed value is "PROJJSON". - optional string crs_encoding = 4; - - // Additional informative metadata about the geometry type. - // Recommended to write a JSON-encoded UTF-8 string. - optional string metadata = 5; -} - message Type { enum Kind { BOOLEAN = 0; @@ -356,6 +239,7 @@ message Type { CHAR = 17; TIMESTAMP_INSTANT = 18; GEOMETRY = 19; + GEOGRAPHY = 20; } optional Kind kind = 1; repeated uint32 subtypes = 2 [packed=true]; @@ -364,7 +248,18 @@ message Type { optional uint32 precision = 5; optional uint32 scale = 6; repeated StringPair attributes = 7; - optional GeometryType geometry = 8; + + // Coordinate Reference System (CRS) for Geometry and Geography types + optional string crs = 8; + // Edge interpolation algorithm for Geography type + enum EdgeInterpolationAlgorithm { + SPHERICAL = 0; + VINCENTY = 1; + THOMAS = 2; + ANDOYER = 3; + KARNEY = 4; + } + optional EdgeInterpolationAlgorithm algorithm = 9; } message StripeInformation {