diff --git a/java/core/pom.xml b/java/core/pom.xml index cdfbd5db9a..487a524398 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -80,6 +80,11 @@ com.aayushatharva.brotli4j brotli4j + + org.locationtech.jts + jts-core + ${jts.version} + diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java new file mode 100644 index 0000000000..db66084c13 --- /dev/null +++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; + +public interface GeospatialColumnStatistics extends ColumnStatistics { + BoundingBox getBoundingBox(); + GeospatialTypes getGeospatialTypes(); +} diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 7dde0bc0fd..ded04b8abc 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -17,6 +17,7 @@ */ package org.apache.orc; +import org.apache.orc.TypeDescription.EdgeInterpolationAlgorithm; import org.apache.orc.impl.ParserUtils; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.SchemaEvolution; @@ -171,6 +172,23 @@ private static void appendOrcTypes(List result, TypeDescription t type.setPrecision(typeDescr.getPrecision()); type.setScale(typeDescr.getScale()); break; + case Geography: + type.setKind(OrcProto.Type.Kind.GEOGRAPHY); + type.setAlgorithm(switch (typeDescr.getEdgeInterpolationAlgorithm()) { + case SPHERICAL -> OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + typeDescr.getEdgeInterpolationAlgorithm()); + }); + type.setCrs(typeDescr.getCrs()); + break; + case Geometry: + type.setKind(OrcProto.Type.Kind.GEOMETRY); + type.setCrs(typeDescr.getCrs()); + break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); type.addSubtypes(children.get(0).getId()); @@ -325,6 +343,29 @@ TypeDescription convertTypeFromProtobuf(List types, result.withPrecision(type.getPrecision()); } break; + case GEOMETRY: + result = TypeDescription.createGeometry(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + break; + case GEOGRAPHY: + result = TypeDescription.createGeography(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + result.withEdgeInterpolationAlgorithm( + switch (type.getAlgorithm()) { + case SPHERICAL -> EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + type.getAlgorithm()); + } + ); + break; case LIST: if (type.getSubtypesCount() != 1) { throw new FileFormatException("LIST type should contain exactly " + diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 8ea9fca1b2..c5ef48b047 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -44,12 +44,29 @@ public class TypeDescription public static final long MAX_DECIMAL64 = 999_999_999_999_999_999L; public static final long MIN_DECIMAL64 = -MAX_DECIMAL64; private static final int DEFAULT_LENGTH = 256; + private static final String DEFAULT_CRS = "OGC:CRS84"; static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$"); // type attributes public static final String ENCRYPT_ATTRIBUTE = "encrypt"; public static final String MASK_ATTRIBUTE = "mask"; + public enum EdgeInterpolationAlgorithm { + SPHERICAL("spherical"), + VINCENTY("vincenty"), + THOMAS("thomas"), + ANDOYER("andoyer"), + KARNEY("karney"); + + EdgeInterpolationAlgorithm(String name) { + this.name = name; + } + final String name; + } + + private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM + = EdgeInterpolationAlgorithm.SPHERICAL; + @Override public int compareTo(TypeDescription other) { if (this == other) { @@ -116,7 +133,9 @@ public enum Category { MAP("map", false), STRUCT("struct", false), UNION("uniontype", false), - TIMESTAMP_INSTANT("timestamp with local time zone", true); + TIMESTAMP_INSTANT("timestamp with local time zone", true), + Geometry("geometry", true), + Geography("geography", true); Category(String name, boolean isPrimitive) { this.name = name; @@ -187,6 +206,14 @@ public static TypeDescription createDecimal() { return new TypeDescription(Category.DECIMAL); } + public static TypeDescription createGeometry() { + return new TypeDescription(Category.Geometry); + } + + public static TypeDescription createGeography() { + return new TypeDescription(Category.Geography); + } + /** * Parse TypeDescription from the Hive type names. This is the inverse * of TypeDescription.toString() @@ -239,6 +266,26 @@ public TypeDescription withScale(int scale) { return this; } + public TypeDescription withCRS(String crs) { + if (category != Category.Geometry && + category != Category.Geography) { + throw new IllegalArgumentException("crs is only allowed on Geometry/Geography" + + " and not " + category.name); + } + this.crs = crs; + return this; + } + + public TypeDescription withEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) { + if (category != Category.Geography) { + throw new IllegalArgumentException("edgeInterpolationAlgorithm is only allowed on Geography" + + " and not " + category.name); + } + this.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; + return this; + } + /** * Set an attribute on this type. * @param key the attribute name @@ -366,6 +413,8 @@ public TypeDescription clone() { result.maxLength = maxLength; result.precision = precision; result.scale = scale; + result.crs = crs; + result.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; if (fieldNames != null) { result.fieldNames.addAll(fieldNames); } @@ -557,6 +606,14 @@ public int getScale() { return scale; } + public String getCrs() { + return crs; + } + + public EdgeInterpolationAlgorithm getEdgeInterpolationAlgorithm() { + return edgeInterpolationAlgorithm; + } + /** * For struct types, get the list of field names. * @return the list of field names. @@ -664,6 +721,9 @@ public TypeDescription(Category category) { private int maxLength = DEFAULT_LENGTH; private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; + private String crs = DEFAULT_CRS; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm + = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { @@ -691,6 +751,18 @@ public void printToBuffer(StringBuilder buffer) { buffer.append(maxLength); buffer.append(')'); break; + case Geometry: + buffer.append('('); + buffer.append(crs); + buffer.append(')'); + break; + case Geography: + buffer.append('('); + buffer.append(crs); + buffer.append(','); + buffer.append(edgeInterpolationAlgorithm.name()); + buffer.append(')'); + break; case LIST: case MAP: case UNION: @@ -751,6 +823,16 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer, buffer.append(", \"length\": "); buffer.append(maxLength); break; + case Geometry: + buffer.append(", \"crs\": "); + buffer.append(crs); + break; + case Geography: + buffer.append(", \"crs\": "); + buffer.append(crs); + buffer.append(", \"edge_interpolation_algorithm\": "); + buffer.append(edgeInterpolationAlgorithm.name()); + break; case LIST: case MAP: case UNION: diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java new file mode 100644 index 0000000000..093e2c96c8 --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -0,0 +1,361 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +/** + * Bounding box for Geometry or Geography type in the representation of min/max + * value pairs of coordinates from each axis. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + */ +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + private boolean valid = true; + + public BoundingBox() { + } + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, + double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + + // Update the validity + valid = isXYValid(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof BoundingBox other)) { + return false; + } + if (obj == this) { + return true; + } + + // Valid flag must be checked since invalid bounding boxes may have equal coordinates with the initial one + return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax && + zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax && + valid == other.valid; + } + + @Override + public int hashCode() { + return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ + Double.hashCode(yMin) ^ Double.hashCode(yMax) ^ + Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ + Double.hashCode(mMin) ^ Double.hashCode(mMax) ^ + Boolean.hashCode(valid); + } + + // Don't change `valid` here and let the caller maintain it + private void resetBBox() { + xMin = Double.POSITIVE_INFINITY; + xMax = Double.NEGATIVE_INFINITY; + yMin = Double.POSITIVE_INFINITY; + yMax = Double.NEGATIVE_INFINITY; + zMin = Double.POSITIVE_INFINITY; + zMax = Double.NEGATIVE_INFINITY; + mMin = Double.POSITIVE_INFINITY; + mMax = Double.NEGATIVE_INFINITY; + } + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + /** + * Checks if the bounding box is valid. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + * + * @return true if the bounding box is valid, false otherwise. + */ + public boolean isValid() { + return valid; + } + + /** + * Checks if the X and Y dimensions of the bounding box are valid. + * The X and Y dimensions are considered valid if none of the bounds contain NaN. + * + * @return true if the X and Y dimensions are valid, false otherwise. + */ + public boolean isXYValid() { + return isXValid() && isYValid(); + } + + /** + * Checks if the X dimension of the bounding box is valid. + * The X dimension is considered valid if neither bound contains NaN. + * + * @return true if the X dimension is valid, false otherwise. + */ + public boolean isXValid() { + return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + } + + /** + * Checks if the Y dimension of the bounding box is valid. + * The Y dimension is considered valid if neither bound contains NaN. + * + * @return true if the Y dimension is valid, false otherwise. + */ + public boolean isYValid() { + return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + } + + /** + * Checks if the Z dimension of the bounding box is valid. + * The Z dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the Z dimension is valid, false otherwise. + */ + public boolean isZValid() { + return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + } + + /** + * Checks if the M dimension of the bounding box is valid. + * The M dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the M dimension is valid, false otherwise. + */ + public boolean isMValid() { + return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + } + + /** + * Checks if the bounding box is empty in the X / Y dimension. + * + * @return true if the bounding box is empty, false otherwise. + */ + public boolean isXYEmpty() { + return isXEmpty() || isYEmpty(); + } + + /** + * Checks if the bounding box is empty in the X dimension. + * + * @return true if the X dimension is empty, false otherwise. + */ + public boolean isXEmpty() { + return Double.isInfinite(xMin) && Double.isInfinite(xMax); + } + + /** + * Checks if the bounding box is empty in the Y dimension. + * + * @return true if the Y dimension is empty, false otherwise. + */ + public boolean isYEmpty() { + return Double.isInfinite(yMin) && Double.isInfinite(yMax); + } + + /** + * Checks if the bounding box is empty in the Z dimension. + * + * @return true if the Z dimension is empty, false otherwise. + */ + public boolean isZEmpty() { + return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Checks if the bounding box is empty in the M dimension. + * + * @return true if the M dimension is empty, false otherwise. + */ + public boolean isMEmpty() { + return Double.isInfinite(mMin) && Double.isInfinite(mMax); + } + + /** + * Expands this bounding box to include the bounds of another box. + * After merging, this bounding box will contain both its original extent + * and the extent of the other bounding box. + * + * @param other the other BoundingBox whose bounds will be merged into this one + */ + public void merge(BoundingBox other) { + if (!valid) { + return; + } + + // If other is null or invalid, mark this as invalid + if (other == null || !other.valid) { + valid = false; + resetBBox(); + return; + } + + this.xMin = Math.min(this.xMin, other.xMin); + this.xMax = Math.max(this.xMax, other.xMax); + this.yMin = Math.min(this.yMin, other.yMin); + this.yMax = Math.max(this.yMax, other.yMax); + this.zMin = Math.min(this.zMin, other.zMin); + this.zMax = Math.max(this.zMax, other.zMax); + this.mMin = Math.min(this.mMin, other.mMin); + this.mMax = Math.max(this.mMax, other.mMax); + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Extends this bounding box to include the spatial extent of the provided geometry. + * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted + * to encompass both the current bounds and the geometry's bounds. + * + * @param geometry The geometry whose coordinates will be used to update this bounding box. + * If null or empty, the method returns without making any changes. + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + // Updates the X and Y bounds of this bounding box with the given coordinates. + // Updates are conditional: + // - X bounds are only updated if both minX and maxX are not NaN + // - Y bounds are only updated if both minY and maxY are not NaN + // This allows partial updates while preserving valid dimensions. + Envelope envelope = geometry.getEnvelopeInternal(); + if (!Double.isNaN(envelope.getMinX()) && !Double.isNaN(envelope.getMaxX())) { + xMin = Math.min(xMin, envelope.getMinX()); + xMax = Math.max(xMax, envelope.getMaxX()); + } + if (!Double.isNaN(envelope.getMinY()) && !Double.isNaN(envelope.getMaxY())) { + yMin = Math.min(yMin, envelope.getMinY()); + yMax = Math.max(yMax, envelope.getMaxY()); + } + + for (Coordinate coord : geometry.getCoordinates()) { + if (!Double.isNaN(coord.getZ())) { + zMin = Math.min(zMin, coord.getZ()); + zMax = Math.max(zMax, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + mMin = Math.min(mMin, coord.getM()); + mMax = Math.max(mMax, coord.getM()); + } + } + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Resets the bounding box to its initial state. + */ + public void reset() { + resetBBox(); + valid = true; + } + + /** + * Creates a copy of the current bounding box. + * + * @return a new BoundingBox instance with the same values as this one. + */ + public BoundingBox copy() { + return new BoundingBox( + this.xMin, this.xMax, + this.yMin, this.yMax, + this.zMin, this.zMax, + this.mMin, this.mMax); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("BoundingBox{xMin=") + .append(xMin) + .append(", xMax=") + .append(xMax) + .append(", yMin=") + .append(yMin) + .append(", yMax=") + .append(yMax) + .append(", zMin=") + .append(zMin) + .append(", zMax=") + .append(zMax) + .append(", mMin=") + .append(mMin) + .append(", mMax=") + .append(mMax); + + // Only include the valid flag when it's false + if (!valid) { + sb.append(", valid=false"); + } + + sb.append('}'); + return sb.toString(); + } +} diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java new file mode 100644 index 0000000000..c1067adad4 --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * A list of geospatial types from all instances in the Geometry or Geography column, + * or an empty list if they are not known. + * + * The GeospatialTypes instance becomes invalid in the following cases: + * - When an unknown or unsupported geometry type is encountered during update + * - When merging with another invalid GeospatialTypes instance + * - When explicitly aborted using abort() + * + * When invalid, the types list is cleared and remains empty. All subsequent + * updates and merges are ignored until reset() is called. + */ +public class GeospatialTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set types = new HashSet<>(); + private boolean valid = true; + + public GeospatialTypes(Set types) { + this.types = types; + this.valid = true; + } + + public GeospatialTypes(Set types, boolean valid) { + this.types = types; + this.valid = valid; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeospatialTypes other)) { + return false; + } + if (obj == this) { + return true; + } + return valid == other.valid && types.equals(other.types); + } + + @Override + public int hashCode() { + return types.hashCode() ^ Boolean.hashCode(valid); + } + + public GeospatialTypes() {} + + public Set getTypes() { + return types; + } + + /** + * Updates the types list with the given geometry's type. + * If the geometry type is unknown, the instance becomes invalid. + * + * @param geometry the geometry to process + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeospatialTypes other) { + if (!valid) { + return; + } + + if (other == null || !other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public boolean isValid() { + return valid; + } + + public GeospatialTypes copy() { + return new GeospatialTypes(new HashSet<>(types), valid); + } + + /** + * Extracts the base geometry type code from a full type code. + * For example: 1001 (XYZ Point) -> 1 (Point) + * + * @param typeId the full geometry type code + * @return the base type code (1-7) + */ + private int getBaseTypeCode(int typeId) { + return typeId % 1000; + } + + /** + * Extracts the dimension prefix from a full type code. + * For example: 1001 (XYZ Point) -> 1000 (XYZ) + * + * @param typeId the full geometry type code + * @return the dimension prefix (0, 1000, 2000, or 3000) + */ + private int getDimensionPrefix(int typeId) { + return (typeId / 1000) * 1000; + } + + @Override + public String toString() { + return "GeospatialTypes{" + "types=" + + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}'; + } + + private int getGeometryTypeId(Geometry geometry) { + return switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT -> 1; + case Geometry.TYPENAME_LINESTRING -> 2; + case Geometry.TYPENAME_POLYGON -> 3; + case Geometry.TYPENAME_MULTIPOINT -> 4; + case Geometry.TYPENAME_MULTILINESTRING -> 5; + case Geometry.TYPENAME_MULTIPOLYGON -> 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION -> 7; + default -> UNKNOWN_TYPE_ID; + }; + } + + /** + * Geospatial type codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + * + * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types + */ + private int getGeometryTypeCode(Geometry geometry) { + int typeId = getGeometryTypeId(geometry); + if (typeId == UNKNOWN_TYPE_ID) { + return UNKNOWN_TYPE_ID; + } + Coordinate[] coordinates = geometry.getCoordinates(); + boolean hasZ = false; + boolean hasM = false; + if (coordinates.length > 0) { + Coordinate firstCoord = coordinates[0]; + hasZ = !Double.isNaN(firstCoord.getZ()); + hasM = !Double.isNaN(firstCoord.getM()); + } + if (hasZ) { + typeId += 1000; + } + if (hasM) { + typeId += 2000; + } + return typeId; + } + + private String typeIdToString(int typeId) { + String typeString; + + typeString = switch (typeId % 1000) { + case 1 -> Geometry.TYPENAME_POINT; + case 2 -> Geometry.TYPENAME_LINESTRING; + case 3 -> Geometry.TYPENAME_POLYGON; + case 4 -> Geometry.TYPENAME_MULTIPOINT; + case 5 -> Geometry.TYPENAME_MULTILINESTRING; + case 6 -> Geometry.TYPENAME_MULTIPOLYGON; + case 7 -> Geometry.TYPENAME_GEOMETRYCOLLECTION; + default -> { + yield "Unknown"; + } + }; + if (typeId >= 3000) { + typeString += " (XYZM)"; + } else if (typeId >= 2000) { + typeString += " (XYM)"; + } else if (typeId >= 1000) { + typeString += " (XYZ)"; + } else { + typeString += " (XY)"; + } + return typeString; + } +} diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index d6147050ec..46b87bfdef 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -30,11 +30,17 @@ import org.apache.orc.DateColumnStatistics; import org.apache.orc.DecimalColumnStatistics; import org.apache.orc.DoubleColumnStatistics; +import org.apache.orc.GeospatialColumnStatistics; import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.OrcProto; import org.apache.orc.StringColumnStatistics; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; import org.threeten.extra.chrono.HybridChronology; import java.sql.Date; @@ -42,6 +48,11 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; import java.util.TimeZone; @@ -1858,6 +1869,167 @@ public Timestamp getMaximum() { private boolean hasNull = false; private long bytesOnDisk = 0; + private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl + implements GeospatialColumnStatistics { + private final BoundingBox boundingBox; + private final GeospatialTypes geospatialTypes; + private final WKBReader reader = new WKBReader(); + + GeospatialStatisticsImpl() { + this.boundingBox = new BoundingBox(); + this.geospatialTypes = new GeospatialTypes(); + } + + GeospatialStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + BoundingBox boundingBoxOut = null; + GeospatialTypes geospatialTypesOut = null; + + OrcProto.GeospatialStatistics geoStatistics = stats.getGeospatialStatistics(); + if (geoStatistics.hasBbox()) { + OrcProto.BoundingBox bbox = geoStatistics.getBbox(); + boundingBoxOut = new BoundingBox( + bbox.hasXmin() ? bbox.getXmin() : Double.NaN, + bbox.hasXmax() ? bbox.getXmax() : Double.NaN, + bbox.hasYmin() ? bbox.getYmin() : Double.NaN, + bbox.hasYmax() ? bbox.getYmax() : Double.NaN, + bbox.hasZmin() ? bbox.getZmin() : Double.NaN, + bbox.hasZmax() ? bbox.getZmax() : Double.NaN, + bbox.hasMmin() ? bbox.getMmin() : Double.NaN, + bbox.hasMmax() ? bbox.getMmax() : Double.NaN); + } + + if (!geoStatistics.getGeospatialTypesList().isEmpty()) { + Set types = new HashSet<>(geoStatistics.getGeospatialTypesList()); + geospatialTypesOut = new GeospatialTypes(types); + } + this.boundingBox = boundingBoxOut; + this.geospatialTypes = geospatialTypesOut; + } + + @Override + public void updateGeometry(BytesWritable value) { + if (value == null) { + return; + } + + try { + Geometry geom = reader.read(value.getBytes()); + boundingBox.update(geom); + geospatialTypes.update(geom); + } catch (ParseException e) { + throw new IllegalArgumentException("Invalid geospatial data - failed to parse WKB format", e); + } + } + + @Override + public void updateGeometry(byte[] bytes, int offset, int length) { + if (bytes == null) { + return; + } + BytesWritable value = new BytesWritable(); + value.set(bytes, offset, length); + updateGeometry(value); + } + + @Override + public void reset() { + super.reset(); + boundingBox.reset();; + geospatialTypes.reset(); + } + + @Override + public void merge(ColumnStatisticsImpl other) { + if (other instanceof GeospatialStatisticsImpl geoStats) { + boundingBox.merge(geoStats.boundingBox); + geospatialTypes.merge(geoStats.geospatialTypes); + } else { + throw new IllegalArgumentException("Incompatible merging of geospatial column statistics"); + } + super.merge(other); + } + + @Override + public OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder(); + + OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder(); + if (boundingBox.isValid() && !boundingBox.isXYEmpty()) { + bboxBuilder.setXmin(boundingBox.getXMin()); + bboxBuilder.setXmax(boundingBox.getXMax()); + bboxBuilder.setYmin(boundingBox.getYMin()); + bboxBuilder.setYmax(boundingBox.getYMax()); + if (boundingBox.isZValid() && !boundingBox.isZEmpty()) { + bboxBuilder.setZmin(boundingBox.getZMin()); + bboxBuilder.setZmax(boundingBox.getZMax()); + } + if (boundingBox.isMValid() && !boundingBox.isMEmpty()) { + bboxBuilder.setMmin(boundingBox.getMMin()); + bboxBuilder.setMmax(boundingBox.getMMax()); + } + geoStats.setBbox(bboxBuilder); + } + if (geospatialTypes.isValid() && !geospatialTypes.getTypes().isEmpty()) { + List sortedTypes = new ArrayList<>(geospatialTypes.getTypes()); + Collections.sort(sortedTypes); + geoStats.addAllGeospatialTypes(sortedTypes); + } + builder.setGeospatialStatistics(geoStats); + return builder; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (boundingBox.isValid()) { + buf.append(" bbox: "); + buf.append(boundingBox.toString()); + } + if (geospatialTypes.isValid()) { + buf.append(" types: "); + buf.append(geospatialTypes.toString()); + } + return buf.toString(); + } + + @Override + public BoundingBox getBoundingBox() { + return boundingBox; + } + + @Override + public GeospatialTypes getGeospatialTypes() { + return geospatialTypes; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof GeospatialStatisticsImpl that)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + return boundingBox.equals(that.boundingBox) && + geospatialTypes.equals(that.geospatialTypes); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + boundingBox.hashCode(); + result = prime * result + geospatialTypes.hashCode(); + return result; + } + } + ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { if (stats.hasNumberOfValues()) { count = stats.getNumberOfValues(); @@ -1955,6 +2127,14 @@ public void updateTimestamp(long value, int nanos) { throw new UnsupportedOperationException("Can't update timestamp"); } + public void updateGeometry(BytesWritable value) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + + public void updateGeometry(byte[] bytes, int offset, int length) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + public boolean isStatsExists() { return (count > 0 || hasNull == true); } @@ -2046,6 +2226,9 @@ public static ColumnStatisticsImpl create(TypeDescription schema, return new TimestampInstantStatisticsImpl(); case BINARY: return new BinaryStatisticsImpl(); + case Geography: + case Geometry: + return new GeospatialStatisticsImpl(); default: return new ColumnStatisticsImpl(); } @@ -2089,6 +2272,8 @@ public static ColumnStatisticsImpl deserialize(TypeDescription schema, writerUsedProlepticGregorian, convertToProlepticGregorian); } else if(stats.hasBinaryStatistics()) { return new BinaryStatisticsImpl(stats); + } else if (stats.hasGeospatialStatistics()) { + return new GeospatialStatisticsImpl(stats); } else { return new ColumnStatisticsImpl(stats); } diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 09b4b2ae61..eacff4b063 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -461,6 +461,8 @@ void buildConversion(TypeDescription fileType, case TIMESTAMP_INSTANT: case BINARY: case DATE: + case Geometry: + case Geography: // these are always a match break; case CHAR: diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index 418b9c9561..785f568ff4 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1154,6 +1154,12 @@ public void skipRows(long items, ReadPhase readPhase) throws IOException { } } + public static class GeospatialTreeReader extends BinaryTreeReader { + GeospatialTreeReader(int columnId, Context context) throws IOException { + super(columnId, context); + } + } + public static class TimestampTreeReader extends TreeReader { protected IntegerReader data = null; protected IntegerReader nanos = null; @@ -3028,6 +3034,9 @@ public static TypeReader createTreeReader(TypeDescription readerType, } return new DecimalTreeReader(fileType.getId(), fileType.getPrecision(), fileType.getScale(), context); + case Geography: + case Geometry: + return new GeospatialTreeReader(fileType.getId(), context); case STRUCT: return new StructTreeReader(fileType.getId(), readerType, context); case LIST: diff --git a/java/core/src/java/org/apache/orc/impl/TypeUtils.java b/java/core/src/java/org/apache/orc/impl/TypeUtils.java index a5daa89572..40d22e2c43 100644 --- a/java/core/src/java/org/apache/orc/impl/TypeUtils.java +++ b/java/core/src/java/org/apache/orc/impl/TypeUtils.java @@ -69,6 +69,8 @@ public static ColumnVector createColumn(TypeDescription schema, case BINARY: case CHAR: case VARCHAR: + case Geometry: + case Geography: return new BytesColumnVector(maxSize); case STRUCT: { List children = schema.getChildren(); diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java new file mode 100644 index 0000000000..e9a0aa70bf --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl.writer; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.io.BytesWritable; +import org.apache.orc.OrcProto; +import org.apache.orc.TypeDescription; +import org.apache.orc.impl.CryptoUtils; +import org.apache.orc.impl.IntegerWriter; +import org.apache.orc.impl.PositionRecorder; +import org.apache.orc.impl.PositionedOutputStream; +import org.apache.orc.impl.StreamName; + +import java.io.IOException; +import java.util.function.Consumer; + +public class GeospatialTreeWriter extends TreeWriterBase { + private final PositionedOutputStream stream; + private final IntegerWriter length; + private boolean isDirectV2 = true; + private long rawDataSize = 0; + private boolean isGeometry = false; + + public GeospatialTreeWriter(TypeDescription schema, + WriterEncryptionVariant encryption, + WriterContext context) throws IOException { + super(schema, encryption, context); + this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry; + this.stream = context.createStream( + new StreamName(id, OrcProto.Stream.Kind.DATA, encryption)); + this.isDirectV2 = isNewWriteFormat(context); + this.length = createIntegerWriter(context.createStream( + new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)), + false, isDirectV2, context); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); + if (isDirectV2) { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); + } + return result; + } + + @Override + public void writeBatch(ColumnVector vector, int offset, + int length) throws IOException { + super.writeBatch(vector, offset, length); + BytesColumnVector vec = (BytesColumnVector) vector; + if (vector.isRepeating) { + if (vector.noNulls || !vector.isNull[0]) { + for (int i = 0; i < length; ++i) { + stream.write(vec.vector[0], vec.start[0], + vec.length[0]); + this.length.write(vec.length[0]); + } + rawDataSize += (long) length * vec.length[0]; + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + } + } else { + for (int i = 0; i < length; ++i) { + if (vec.noNulls || !vec.isNull[i + offset]) { + stream.write(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + this.length.write(vec.length[offset + i]); + rawDataSize += vec.length[offset + i]; + BytesWritable bw = new BytesWritable(); + bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + } + } + } + } + + @Override + public void writeStripe(int requiredIndexEntries) throws IOException { + super.writeStripe(requiredIndexEntries); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + stream.getPosition(recorder); + length.getPosition(recorder); + } + + @Override + public long estimateMemory() { + return super.estimateMemory() + stream.getBufferSize() + + length.estimateMemory(); + } + + @Override + public long getRawDataSize() { + return rawDataSize; + } + + @Override + public void flushStreams() throws IOException { + super.flushStreams(); + stream.flush(); + length.flush(); + } + + @Override + public void prepareStripe(int stripeId) { + super.prepareStripe(stripeId); + Consumer updater = CryptoUtils.modifyIvForStripe(stripeId); + stream.changeIv(updater); + length.changeIv(updater); + } +} diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java index 71eb3a5648..de63f9efb6 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java @@ -185,6 +185,9 @@ static TreeWriter createSubtree(TypeDescription schema, return new ListTreeWriter(schema, encryption, streamFactory); case UNION: return new UnionTreeWriter(schema, encryption, streamFactory); + case Geometry: + case Geography: + return new GeospatialTreeWriter(schema, encryption, streamFactory); default: throw new IllegalArgumentException("Bad category: " + schema.getCategory()); diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 5fc429199e..dea3359d92 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -28,10 +28,14 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.*; +import org.locationtech.jts.io.WKBWriter; import java.io.File; import java.math.BigDecimal; @@ -742,6 +746,201 @@ public void testMergeIncompatible() { assertEquals(0, ((DoubleColumnStatistics) doubleStats).getNumberOfValues()); } + @Test + public void testUpdateGeometry() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + for (byte[] point : points) { + stats.updateGeometry(new BytesWritable(point)); + } + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateGeometryWithDifferentTypes() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + Point point = geometryFactory.createPoint(new Coordinate(1, 1)); + Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; + LineString line = geometryFactory.createLineString(lineCoords); + Coordinate[] polygonCoords = new Coordinate[]{ + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) + }; + LinearRing shell = geometryFactory.createLinearRing(polygonCoords); + Polygon polygon = geometryFactory.createPolygon(shell); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + // Generate WKB and update stats + byte[] pointWkb = wkbWriter.write(point); + stats.updateGeometry(new BytesWritable(pointWkb)); + + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(1.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString()); + + + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + byte[] lineWkb = wkbWriter.write(line); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(2, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", + geospatialTypes.toString()); + + byte[] polygonWkb = wkbWriter.write(polygon); + stats.updateGeometry(new BytesWritable(polygonWkb)); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(3.0, bbox.getXMax(), 0.0); + assertEquals(0.0, bbox.getYMin(), 0.0); + assertEquals(3.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(3, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}", + geospatialTypes.toString()); + + } + + @Test + public void testUpdateGeometryWithZCoordinates() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(3); + + Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2)); + Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0)); + + stats.updateGeometry(new BytesWritable(wkbWriter.write(point1))); + stats.updateGeometry(new BytesWritable(wkbWriter.write(point2))); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(0.0, bbox.getZMin(), 0.0); + assertEquals(2.0, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1001)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void TestGeospatialMerge() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats0 = ColumnStatisticsImpl.create(desc); + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + stats0.updateGeometry(new BytesWritable(points[0])); + stats1.updateGeometry(new BytesWritable(points[1])); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats0; + stats0.merge(stats1); + + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertTrue(bbox.isXYValid()); + assertFalse(bbox.isXYEmpty()); + assertTrue(bbox.isZValid()); + assertTrue(bbox.isMValid()); + assertTrue(bbox.isZEmpty()); + assertTrue(bbox.isMEmpty()); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java new file mode 100644 index 0000000000..f0e148fb0d --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.impl.ColumnStatisticsImpl; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBReader; +import org.locationtech.jts.io.WKBWriter; + +import java.io.File; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.*; + +public class TestOrcGeospatial implements TestConf { + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + FileSystem fs; + Path testFilePath; + + @BeforeEach + public void openFileSystem(TestInfo testInfo) throws Exception { + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcGeospatial." + + testInfo.getTestMethod().get().getName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testGeometryWriterWithNulls() throws Exception { + // Create a geometry schema and ORC file writer + TypeDescription schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + // Add data + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 100; i++) { + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } + } + writer.addRowBatch(batch); + writer.close(); + + // Verify reader schema + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals("geometry(OGC:CRS84)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + + // Verify statistics + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertTrue(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isXYValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isZValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isMValid()); + assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=NaN, zMax=NaN, mMin=NaN, mMax=NaN}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", ((GeospatialColumnStatistics) stats[0]).getGeospatialTypes().toString()); + + // Verify data + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } + idx += 1; + } + } + rows.close(); + } + + @Test + public void testGeographyWriterWithNulls() throws Exception { + // Create geography schema and ORC file writer + TypeDescription schema = TypeDescription.createGeography(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + // Add data + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + for (int i = 0; i < 100; i++) { + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } + } + writer.addRowBatch(batch); + writer.close(); + + // Verify reader schema + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals("geography(OGC:CRS84,SPHERICAL)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); + + // Verify statistics, make sure there are no bounding box and geospatial types + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertNull(((GeospatialColumnStatistics) stats[0]).getBoundingBox()); + assertNull(((GeospatialColumnStatistics) stats[0]).getGeospatialTypes()); + + // Verify Data + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } + idx += 1; + } + } + rows.close(); + } +} diff --git a/java/pom.xml b/java/pom.xml index 50df3df920..f969ea95f1 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -67,6 +67,7 @@ 3.4.1 17 ${project.basedir}/../target/javadoc + 1.20.0 5.12.2 3.7.1 3.8.1