From 6d6c02017728d3316b943d2e567b814e07ddfefb Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 2 Jun 2025 23:08:21 +0800 Subject: [PATCH 01/16] Geometry and geography --- java/core/pom.xml | 5 + .../orc/GeospatialColumnStatistics.java | 28 ++ .../src/java/org/apache/orc/OrcUtils.java | 62 ++++ .../java/org/apache/orc/TypeDescription.java | 72 +++- .../apache/orc/geospatial/BoundingBox.java | 345 ++++++++++++++++++ .../orc/geospatial/GeospatialTypes.java | 223 +++++++++++ .../apache/orc/impl/ColumnStatisticsImpl.java | 156 +++++++- .../org/apache/orc/impl/SchemaEvolution.java | 2 + .../apache/orc/impl/TreeReaderFactory.java | 8 + .../java/org/apache/orc/impl/TypeUtils.java | 2 + .../orc/impl/writer/GeospatialTreeWriter.java | 157 ++++++++ .../apache/orc/impl/writer/TreeWriter.java | 3 + .../org/apache/orc/TestOrcGeospatial.java | 135 +++++++ .../orc/impl/TestColumnStatisticsImpl.java | 159 ++++++++ java/pom.xml | 1 + 15 files changed, 1345 insertions(+), 13 deletions(-) create mode 100644 java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java create mode 100644 java/core/src/java/org/apache/orc/geospatial/BoundingBox.java create mode 100644 java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java create mode 100644 java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java create mode 100644 java/core/src/test/org/apache/orc/TestOrcGeospatial.java diff --git a/java/core/pom.xml b/java/core/pom.xml index cdfbd5db9a..487a524398 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -80,6 +80,11 @@ com.aayushatharva.brotli4j brotli4j + + org.locationtech.jts + jts-core + ${jts.version} + diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java new file mode 100644 index 0000000000..4cac17eace --- /dev/null +++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; + +public interface GeospatialColumnStatistics extends ColumnStatistics { + + BoundingBox getBoundingBox(); + GeospatialTypes getGeospatialTypes(); +} \ No newline at end of file diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 7dde0bc0fd..9e172acb19 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -171,6 +171,31 @@ private static void appendOrcTypes(List result, TypeDescription t type.setPrecision(typeDescr.getPrecision()); type.setScale(typeDescr.getScale()); break; + case Geometry: + case Geography: + type.setKind(OrcProto.Type.Kind.GEOMETRY); + type.setCrs(typeDescr.getCRS()); + switch (typeDescr.getEdgeInterpolationAlgorithm()) { + case SPHERICAL: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL); + break; + case VINCENTY: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY); + break; + case THOMAS: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS); + break; + case ANDOYER: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER); + break; + case KARNEY: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY); + break; + default: + throw new IllegalArgumentException("Unknown interpolation algorithm: " + + typeDescr.getEdgeInterpolationAlgorithm()); + } + break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); type.addSubtypes(children.get(0).getId()); @@ -325,6 +350,43 @@ TypeDescription convertTypeFromProtobuf(List types, result.withPrecision(type.getPrecision()); } break; + case GEOMETRY: + result = TypeDescription.createGeometry(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + break; + case GEOGRAPHY: + result = TypeDescription.createGeography(); + if (type.hasCrs()) { + result.withCRS(type.getCrs()); + } + switch (type.getAlgorithm()) { + case SPHERICAL: + result.withEdgeInterpolationAlgorithm( + TypeDescription.EdgeInterpolationAlgorithm.SPHERICAL); + break; + case VINCENTY: + result.withEdgeInterpolationAlgorithm( + TypeDescription.EdgeInterpolationAlgorithm.VINCENTY); + break; + case THOMAS: + result.withEdgeInterpolationAlgorithm( + TypeDescription.EdgeInterpolationAlgorithm.THOMAS); + break; + case ANDOYER: + result.withEdgeInterpolationAlgorithm( + TypeDescription.EdgeInterpolationAlgorithm.ANDOYER); + break; + case KARNEY: + result.withEdgeInterpolationAlgorithm( + TypeDescription.EdgeInterpolationAlgorithm.KARNEY); + break; + default: + throw new IllegalArgumentException("Unknown interpolation algorithm: " + + type.getAlgorithm()); + } + break; case LIST: if (type.getSubtypesCount() != 1) { throw new FileFormatException("LIST type should contain exactly " + diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 8ea9fca1b2..54b6c7cd4c 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -50,6 +50,19 @@ public class TypeDescription public static final String ENCRYPT_ATTRIBUTE = "encrypt"; public static final String MASK_ATTRIBUTE = "mask"; + public enum EdgeInterpolationAlgorithm { + SPHERICAL("spherical"), + VINCENTY("vincenty"), + THOMAS("thomas"), + ANDOYER("andoyer"), + KARNEY("karney"); + + EdgeInterpolationAlgorithm(String name) { + this.name = name; + } + final String name; + } + @Override public int compareTo(TypeDescription other) { if (this == other) { @@ -116,7 +129,9 @@ public enum Category { MAP("map", false), STRUCT("struct", false), UNION("uniontype", false), - TIMESTAMP_INSTANT("timestamp with local time zone", true); + TIMESTAMP_INSTANT("timestamp with local time zone", true), + Geometry("geometry", true), + Geography("geography", true); Category(String name, boolean isPrimitive) { this.name = name; @@ -187,6 +202,10 @@ public static TypeDescription createDecimal() { return new TypeDescription(Category.DECIMAL); } + public static TypeDescription createGeometry() { return new TypeDescription(Category.Geometry); } + + public static TypeDescription createGeography() { return new TypeDescription(Category.Geography); } + /** * Parse TypeDescription from the Hive type names. This is the inverse * of TypeDescription.toString() @@ -239,6 +258,25 @@ public TypeDescription withScale(int scale) { return this; } + public TypeDescription withCRS(String crs) { + if (category != Category.Geometry && + category != Category.Geography) { + throw new IllegalArgumentException("crs is only allowed on Geometry/Geography" + + " and not " + category.name); + } + this.CRS = crs; + return this; + } + + public TypeDescription withEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) { + if (category != Category.Geography) { + throw new IllegalArgumentException("edgeInterpolationAlgorithm is only allowed on Geography" + + " and not " + category.name); + } + this.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; + return this; + } + /** * Set an attribute on this type. * @param key the attribute name @@ -366,6 +404,8 @@ public TypeDescription clone() { result.maxLength = maxLength; result.precision = precision; result.scale = scale; + result.CRS = CRS; + result.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; if (fieldNames != null) { result.fieldNames.addAll(fieldNames); } @@ -557,6 +597,14 @@ public int getScale() { return scale; } + public String getCRS() { + return CRS; + } + + public EdgeInterpolationAlgorithm getEdgeInterpolationAlgorithm() { + return edgeInterpolationAlgorithm; + } + /** * For struct types, get the list of field names. * @return the list of field names. @@ -664,6 +712,8 @@ public TypeDescription(Category category) { private int maxLength = DEFAULT_LENGTH; private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; + private String CRS = "OGC:CRS84"; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = EdgeInterpolationAlgorithm.SPHERICAL; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { @@ -691,6 +741,17 @@ public void printToBuffer(StringBuilder buffer) { buffer.append(maxLength); buffer.append(')'); break; + case Geometry: + buffer.append('('); + buffer.append(CRS); + buffer.append(')'); + break; + case Geography: + buffer.append('('); + buffer.append(CRS); + buffer.append(','); + buffer.append(edgeInterpolationAlgorithm.name()); + buffer.append(')'); case LIST: case MAP: case UNION: @@ -751,6 +812,15 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer, buffer.append(", \"length\": "); buffer.append(maxLength); break; + case Geometry: + buffer.append(", \"crs\": "); + buffer.append(CRS); + break; + case Geography: + buffer.append(", \"crs\": "); + buffer.append(CRS); + buffer.append(", \"edge_interpolation_algorithm\": "); + buffer.append(edgeInterpolationAlgorithm.name()); case LIST: case MAP: case UNION: diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java new file mode 100644 index 0000000000..7ebe97f6aa --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.orc.geospatial; + +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Envelope; +import org.locationtech.jts.geom.Geometry; + +public class BoundingBox { + + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + private boolean valid = true; + + public BoundingBox() {} + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + + // Update the validity + valid = isXYValid(); + } + + private void resetBBox() { + xMin = Double.POSITIVE_INFINITY; + xMax = Double.NEGATIVE_INFINITY; + yMin = Double.POSITIVE_INFINITY; + yMax = Double.NEGATIVE_INFINITY; + zMin = Double.POSITIVE_INFINITY; + zMax = Double.NEGATIVE_INFINITY; + mMin = Double.POSITIVE_INFINITY; + mMax = Double.NEGATIVE_INFINITY; + } + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + /** + * Checks if the bounding box is valid. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + * + * @return true if the bounding box is valid, false otherwise. + */ + public boolean isValid() { + return valid; + } + + /** + * Checks if the X and Y dimensions of the bounding box are valid. + * The X and Y dimensions are considered valid if none of the bounds contain NaN. + * + * @return true if the X and Y dimensions are valid, false otherwise. + */ + public boolean isXYValid() { + return isXValid() && isYValid(); + } + + /** + * Checks if the X dimension of the bounding box is valid. + * The X dimension is considered valid if neither bound contains NaN. + * + * @return true if the X dimension is valid, false otherwise. + */ + public boolean isXValid() { + return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + } + + /** + * Checks if the Y dimension of the bounding box is valid. + * The Y dimension is considered valid if neither bound contains NaN. + * + * @return true if the Y dimension is valid, false otherwise. + */ + public boolean isYValid() { + return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + } + + /** + * Checks if the Z dimension of the bounding box is valid. + * The Z dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the Z dimension is valid, false otherwise. + */ + public boolean isZValid() { + return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + } + + /** + * Checks if the M dimension of the bounding box is valid. + * The M dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the M dimension is valid, false otherwise. + */ + public boolean isMValid() { + return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + } + + /** + * Checks if the bounding box is empty in the X / Y dimension. + * + * @return true if the bounding box is empty, false otherwise. + */ + public boolean isXYEmpty() { + return isXEmpty() || isYEmpty(); + } + + /** + * Checks if the bounding box is empty in the X dimension. + * + * @return true if the X dimension is empty, false otherwise. + */ + public boolean isXEmpty() { + return Double.isInfinite(xMin) && Double.isInfinite(xMax); + } + + /** + * Checks if the bounding box is empty in the Y dimension. + * + * @return true if the Y dimension is empty, false otherwise. + */ + public boolean isYEmpty() { + return Double.isInfinite(yMin) && Double.isInfinite(yMax); + } + + /** + * Checks if the bounding box is empty in the Z dimension. + * + * @return true if the Z dimension is empty, false otherwise. + */ + public boolean isZEmpty() { + return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Checks if the bounding box is empty in the M dimension. + * + * @return true if the M dimension is empty, false otherwise. + */ + public boolean isMEmpty() { + return Double.isInfinite(mMin) && Double.isInfinite(mMax); + } + + /** + * Expands this bounding box to include the bounds of another box. + * After merging, this bounding box will contain both its original extent + * and the extent of the other bounding box. + * + * @param other the other BoundingBox whose bounds will be merged into this one + */ + public void merge(BoundingBox other) { + if (!valid) { + return; + } + + // If other is null or invalid, mark this as invalid + if (other == null || !other.valid) { + valid = false; + resetBBox(); + return; + } + + this.xMin = Math.min(this.xMin, other.xMin); + this.xMax = Math.max(this.xMax, other.xMax); + this.yMin = Math.min(this.yMin, other.yMin); + this.yMax = Math.max(this.yMax, other.yMax); + this.zMin = Math.min(this.zMin, other.zMin); + this.zMax = Math.max(this.zMax, other.zMax); + this.mMin = Math.min(this.mMin, other.mMin); + this.mMax = Math.max(this.mMax, other.mMax); + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Extends this bounding box to include the spatial extent of the provided geometry. + * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted + * to encompass both the current bounds and the geometry's bounds. + * + * @param geometry The geometry whose coordinates will be used to update this bounding box. + * If null or empty, the method returns without making any changes. + */ + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + Envelope envelope = geometry.getEnvelopeInternal(); + updateBounds(envelope.getMinX(), envelope.getMaxX(), envelope.getMinY(), envelope.getMaxY()); + + for (Coordinate coord : geometry.getCoordinates()) { + if (!Double.isNaN(coord.getZ())) { + zMin = Math.min(zMin, coord.getZ()); + zMax = Math.max(zMax, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + mMin = Math.min(mMin, coord.getM()); + mMax = Math.max(mMax, coord.getM()); + } + } + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Updates the X and Y bounds of this bounding box with the given coordinates. + * Updates are conditional: + * - X bounds are only updated if both minX and maxX are not NaN + * - Y bounds are only updated if both minY and maxY are not NaN + * This allows partial updates while preserving valid dimensions. + */ + private void updateBounds(double minX, double maxX, double minY, double maxY) { + if (!Double.isNaN(minX) && !Double.isNaN(maxX)) { + xMin = Math.min(xMin, minX); + xMax = Math.max(xMax, maxX); + } + + if (!Double.isNaN(minY) && !Double.isNaN(maxY)) { + yMin = Math.min(yMin, minY); + yMax = Math.max(yMax, maxY); + } + } + + /** + * Aborts the bounding box by resetting it to its initial state. + */ + public void abort() { + valid = false; + resetBBox(); + } + + /** + * Resets the bounding box to its initial state. + */ + public void reset() { + resetBBox(); + valid = true; + } + + /** + * Creates a copy of the current bounding box. + * + * @return a new BoundingBox instance with the same values as this one. + */ + public BoundingBox copy() { + return new BoundingBox( + this.xMin, this.xMax, + this.yMin, this.yMax, + this.zMin, this.zMax, + this.mMin, this.mMax); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("BoundingBox{xMin=") + .append(xMin) + .append(", xMax=") + .append(xMax) + .append(", yMin=") + .append(yMin) + .append(", yMax=") + .append(yMax) + .append(", zMin=") + .append(zMin) + .append(", zMax=") + .append(zMax) + .append(", mMin=") + .append(mMin) + .append(", mMax=") + .append(mMax); + + // Only include the valid flag when it's false + if (!valid) { + sb.append(", valid=false"); + } + + sb.append('}'); + return sb.toString(); + } +} \ No newline at end of file diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java new file mode 100644 index 0000000000..938e6e5548 --- /dev/null +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.orc.geospatial; + +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + +public class GeospatialTypes { + + private static final int UNKNOWN_TYPE_ID = -1; + private Set types = new HashSet<>(); + private boolean valid = true; + + public GeospatialTypes(Set types) { + this.types = types; + this.valid = true; + } + + public GeospatialTypes(Set types, boolean valid) { + this.types = types; + this.valid = valid; + } + + public GeospatialTypes() {} + + public Set getTypes() { + return types; + } + + public void update(Geometry geometry) { + if (!valid) { + return; + } + + if (geometry == null || geometry.isEmpty()) { + return; + } + + int code = getGeometryTypeCode(geometry); + if (code != UNKNOWN_TYPE_ID) { + types.add(code); + } else { + valid = false; + types.clear(); + } + } + + public void merge(GeospatialTypes other) { + if (!valid) { + return; + } + + if (other == null || !other.valid) { + valid = false; + types.clear(); + return; + } + types.addAll(other.types); + } + + public void reset() { + types.clear(); + valid = true; + } + + public void abort() { + valid = false; + types.clear(); + } + + public boolean isValid() { + return valid; + } + + public GeospatialTypes copy() { + return new GeospatialTypes(new HashSet<>(types), valid); + } + + /** + * Extracts the base geometry type code from a full type code. + * For example: 1001 (XYZ Point) -> 1 (Point) + * + * @param typeId the full geometry type code + * @return the base type code (1-7) + */ + private int getBaseTypeCode(int typeId) { + return typeId % 1000; + } + + /** + * Extracts the dimension prefix from a full type code. + * For example: 1001 (XYZ Point) -> 1000 (XYZ) + * + * @param typeId the full geometry type code + * @return the dimension prefix (0, 1000, 2000, or 3000) + */ + private int getDimensionPrefix(int typeId) { + return (typeId / 1000) * 1000; + } + + @Override + public String toString() { + return "GeospatialTypes{" + "types=" + + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}'; + } + + private int getGeometryTypeId(Geometry geometry) { + switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT: + return 1; + case Geometry.TYPENAME_LINESTRING: + return 2; + case Geometry.TYPENAME_POLYGON: + return 3; + case Geometry.TYPENAME_MULTIPOINT: + return 4; + case Geometry.TYPENAME_MULTILINESTRING: + return 5; + case Geometry.TYPENAME_MULTIPOLYGON: + return 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION: + return 7; + default: + return UNKNOWN_TYPE_ID; + } + } + + /** + * Geospatial type codes: + * + * | Type | XY | XYZ | XYM | XYZM | + * | :----------------- | :--- | :--- | :--- | :--: | + * | Point | 0001 | 1001 | 2001 | 3001 | + * | LineString | 0002 | 1002 | 2002 | 3002 | + * | Polygon | 0003 | 1003 | 2003 | 3003 | + * | MultiPoint | 0004 | 1004 | 2004 | 3004 | + * | MultiLineString | 0005 | 1005 | 2005 | 3005 | + * | MultiPolygon | 0006 | 1006 | 2006 | 3006 | + * | GeometryCollection | 0007 | 1007 | 2007 | 3007 | + * + * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types + */ + private int getGeometryTypeCode(Geometry geometry) { + int typeId = getGeometryTypeId(geometry); + if (typeId == UNKNOWN_TYPE_ID) { + return UNKNOWN_TYPE_ID; + } + Coordinate[] coordinates = geometry.getCoordinates(); + boolean hasZ = false; + boolean hasM = false; + if (coordinates.length > 0) { + Coordinate firstCoord = coordinates[0]; + hasZ = !Double.isNaN(firstCoord.getZ()); + hasM = !Double.isNaN(firstCoord.getM()); + } + if (hasZ) { + typeId += 1000; + } + if (hasM) { + typeId += 2000; + } + return typeId; + } + + private String typeIdToString(int typeId) { + String typeString; + switch (typeId % 1000) { + case 1: + typeString = Geometry.TYPENAME_POINT; + break; + case 2: + typeString = Geometry.TYPENAME_LINESTRING; + break; + case 3: + typeString = Geometry.TYPENAME_POLYGON; + break; + case 4: + typeString = Geometry.TYPENAME_MULTIPOINT; + break; + case 5: + typeString = Geometry.TYPENAME_MULTILINESTRING; + break; + case 6: + typeString = Geometry.TYPENAME_MULTIPOLYGON; + break; + case 7: + typeString = Geometry.TYPENAME_GEOMETRYCOLLECTION; + break; + default: + return "Unknown"; + } + if (typeId >= 3000) { + typeString += " (XYZM)"; + } else if (typeId >= 2000) { + typeString += " (XYM)"; + } else if (typeId >= 1000) { + typeString += " (XYZ)"; + } else { + typeString += " (XY)"; + } + return typeString; + } +} \ No newline at end of file diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index d6147050ec..b1470926fc 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -23,18 +23,12 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparator; -import org.apache.orc.BinaryColumnStatistics; -import org.apache.orc.BooleanColumnStatistics; -import org.apache.orc.CollectionColumnStatistics; -import org.apache.orc.ColumnStatistics; -import org.apache.orc.DateColumnStatistics; -import org.apache.orc.DecimalColumnStatistics; -import org.apache.orc.DoubleColumnStatistics; -import org.apache.orc.IntegerColumnStatistics; -import org.apache.orc.OrcProto; -import org.apache.orc.StringColumnStatistics; -import org.apache.orc.TimestampColumnStatistics; -import org.apache.orc.TypeDescription; +import org.apache.orc.*; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.io.ParseException; +import org.locationtech.jts.io.WKBReader; import org.threeten.extra.chrono.HybridChronology; import java.sql.Date; @@ -42,6 +36,8 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; +import java.util.HashSet; +import java.util.Set; import java.util.TimeZone; @@ -1858,6 +1854,131 @@ public Timestamp getMaximum() { private boolean hasNull = false; private long bytesOnDisk = 0; + private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl + implements GeospatialColumnStatistics { + private BoundingBox boundingBox; + private final GeospatialTypes geospatialTypes; + private final WKBReader reader = new WKBReader(); + + GeospatialStatisticsImpl() { + this.boundingBox = new BoundingBox(); + this.geospatialTypes = new GeospatialTypes(); + } + + GeospatialStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + this.boundingBox = new BoundingBox(); + + OrcProto.GeospatialStatistics geoStatistics = stats.getGeospatialStatistics(); + if (geoStatistics.hasBbox()) { + OrcProto.BoundingBox bbox = geoStatistics.getBbox(); + this.boundingBox = new BoundingBox( + bbox.hasXmin() ? bbox.getXmin() : Double.POSITIVE_INFINITY, + bbox.hasXmax() ? bbox.getXmax() : Double.NEGATIVE_INFINITY, + bbox.hasYmin() ? bbox.getYmin() : Double.POSITIVE_INFINITY, + bbox.hasYmax() ? bbox.getYmax() : Double.NEGATIVE_INFINITY, + bbox.hasZmin() ? bbox.getZmin() : Double.POSITIVE_INFINITY, + bbox.hasZmax() ? bbox.getZmax() : Double.NEGATIVE_INFINITY, + bbox.hasMmin() ? bbox.getMmin() : Double.POSITIVE_INFINITY, + bbox.hasMmax() ? bbox.getMmax() : Double.NEGATIVE_INFINITY); + } + + Set types = new HashSet<>(geoStatistics.getGeospatialTypesList()); + this.geospatialTypes = new GeospatialTypes(types); + } + + @Override + public void updateGeometry(BytesWritable value) { + if (value == null) { + return; + } + + try { + Geometry geom = reader.read(value.getBytes()); + boundingBox.update(geom); + geospatialTypes.update(geom); + } catch (ParseException e) { + throw new IllegalArgumentException("Invalid geospatial data - failed to parse WKB format", e); + } + } + + @Override + public void updateGeometry(byte[] bytes, int offset, int length) { + if (bytes == null) { + return; + } + BytesWritable value = new BytesWritable(); + value.set(bytes, offset, length); + updateGeometry(value); + } + + @Override + public void reset() { + super.reset(); + boundingBox.reset();; + geospatialTypes.reset(); + } + + @Override + public void merge(ColumnStatisticsImpl other) { + if (other instanceof GeospatialStatisticsImpl geoStats) { + boundingBox.merge(geoStats.boundingBox); + geospatialTypes.merge(geoStats.geospatialTypes); + } else { + throw new IllegalArgumentException("Incompatible merging of geospatial column statistics"); + } + super.merge(other); + } + + @Override + public OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder(); + + OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder(); + if (boundingBox.isValid()) { + bboxBuilder.setXmin(boundingBox.getXMin()); + bboxBuilder.setXmax(boundingBox.getXMax()); + bboxBuilder.setYmin(boundingBox.getYMin()); + bboxBuilder.setYmax(boundingBox.getYMax()); + bboxBuilder.setZmin(boundingBox.getZMin()); + bboxBuilder.setZmax(boundingBox.getZMax()); + bboxBuilder.setMmin(boundingBox.getMMin()); + bboxBuilder.setMmax(boundingBox.getMMax()); + } + if (geospatialTypes.isValid()) { + geoStats.addAllGeospatialTypes(geospatialTypes.getTypes()); + } + geoStats.setBbox(bboxBuilder); + builder.setGeospatialStatistics(geoStats); + return builder; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (boundingBox.isValid()) { + buf.append(" bbox: "); + buf.append(boundingBox.toString()); + } + if (geospatialTypes.isValid()) { + buf.append(" types: "); + buf.append(geospatialTypes.toString()); + } + return buf.toString(); + } + + @Override + public BoundingBox getBoundingBox() { + return boundingBox; + } + + @Override + public GeospatialTypes getGeospatialTypes() { + return geospatialTypes; + } + } + ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { if (stats.hasNumberOfValues()) { count = stats.getNumberOfValues(); @@ -1955,6 +2076,14 @@ public void updateTimestamp(long value, int nanos) { throw new UnsupportedOperationException("Can't update timestamp"); } + public void updateGeometry(BytesWritable value) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + + public void updateGeometry(byte[] bytes, int offset, int length) { + throw new UnsupportedOperationException("Can't update Geometry"); + } + public boolean isStatsExists() { return (count > 0 || hasNull == true); } @@ -2046,6 +2175,9 @@ public static ColumnStatisticsImpl create(TypeDescription schema, return new TimestampInstantStatisticsImpl(); case BINARY: return new BinaryStatisticsImpl(); + case Geography: + case Geometry: + return new GeospatialStatisticsImpl(); default: return new ColumnStatisticsImpl(); } diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java index 09b4b2ae61..eacff4b063 100644 --- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java +++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java @@ -461,6 +461,8 @@ void buildConversion(TypeDescription fileType, case TIMESTAMP_INSTANT: case BINARY: case DATE: + case Geometry: + case Geography: // these are always a match break; case CHAR: diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index 418b9c9561..d12f97c5c4 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -1154,6 +1154,12 @@ public void skipRows(long items, ReadPhase readPhase) throws IOException { } } + public static class GeospatialTreeReader extends BinaryTreeReader { + GeospatialTreeReader(int columnId, Context context) throws IOException { + super(columnId, context); + } + } + public static class TimestampTreeReader extends TreeReader { protected IntegerReader data = null; protected IntegerReader nanos = null; @@ -3028,6 +3034,8 @@ public static TypeReader createTreeReader(TypeDescription readerType, } return new DecimalTreeReader(fileType.getId(), fileType.getPrecision(), fileType.getScale(), context); + case Geometry: + return new GeospatialTreeReader(fileType.getId(), context); case STRUCT: return new StructTreeReader(fileType.getId(), readerType, context); case LIST: diff --git a/java/core/src/java/org/apache/orc/impl/TypeUtils.java b/java/core/src/java/org/apache/orc/impl/TypeUtils.java index a5daa89572..40d22e2c43 100644 --- a/java/core/src/java/org/apache/orc/impl/TypeUtils.java +++ b/java/core/src/java/org/apache/orc/impl/TypeUtils.java @@ -69,6 +69,8 @@ public static ColumnVector createColumn(TypeDescription schema, case BINARY: case CHAR: case VARCHAR: + case Geometry: + case Geography: return new BytesColumnVector(maxSize); case STRUCT: { List children = schema.getChildren(); diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java new file mode 100644 index 0000000000..308a8e4629 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl.writer; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.io.BytesWritable; +import org.apache.orc.OrcProto; +import org.apache.orc.TypeDescription; +import org.apache.orc.impl.CryptoUtils; +import org.apache.orc.impl.IntegerWriter; +import org.apache.orc.impl.PositionRecorder; +import org.apache.orc.impl.PositionedOutputStream; +import org.apache.orc.impl.StreamName; + +import java.io.IOException; +import java.util.function.Consumer; + +public class GeospatialTreeWriter extends TreeWriterBase { + private final PositionedOutputStream stream; + private final IntegerWriter length; + private boolean isDirectV2 = true; + private long rawDataSize = 0; + private boolean isGeometry = false; + + public GeospatialTreeWriter(TypeDescription schema, + WriterEncryptionVariant encryption, + WriterContext context) throws IOException { + super(schema, encryption, context); + this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry; + this.stream = context.createStream( + new StreamName(id, OrcProto.Stream.Kind.DATA, encryption)); + this.isDirectV2 = isNewWriteFormat(context); + this.length = createIntegerWriter(context.createStream( + new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)), + false, isDirectV2, context); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); + if (isDirectV2) { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); + } + return result; + } + + @Override + public void writeBatch(ColumnVector vector, int offset, + int length) throws IOException { + super.writeBatch(vector, offset, length); + BytesColumnVector vec = (BytesColumnVector) vector; + if (vector.isRepeating) { + if (vector.noNulls || !vector.isNull[0]) { + for (int i = 0; i < length; ++i) { + stream.write(vec.vector[0], vec.start[0], + vec.length[0]); + this.length.write(vec.length[0]); + } + rawDataSize += (long)length * vec.length[0]; + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + } + } else { + for (int i = 0; i < length; ++i) { + if (vec.noNulls || !vec.isNull[i + offset]) { + stream.write(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + this.length.write(vec.length[offset + i]); + rawDataSize += vec.length[offset + i]; + BytesWritable bw = new BytesWritable(); + bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + bloomFilterUtf8.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } + } + } + } + } + + @Override + public void writeStripe(int requiredIndexEntries) throws IOException { + super.writeStripe(requiredIndexEntries); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); + } + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + stream.getPosition(recorder); + length.getPosition(recorder); + } + + @Override + public long estimateMemory() { + return super.estimateMemory() + stream.getBufferSize() + + length.estimateMemory(); + } + + @Override + public long getRawDataSize() { + return rawDataSize; + } + + @Override + public void flushStreams() throws IOException { + super.flushStreams(); + stream.flush(); + length.flush(); + } + + @Override + public void prepareStripe(int stripeId) { + super.prepareStripe(stripeId); + Consumer updater = CryptoUtils.modifyIvForStripe(stripeId); + stream.changeIv(updater); + length.changeIv(updater); + } +} diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java index 71eb3a5648..de63f9efb6 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java @@ -185,6 +185,9 @@ static TreeWriter createSubtree(TypeDescription schema, return new ListTreeWriter(schema, encryption, streamFactory); case UNION: return new UnionTreeWriter(schema, encryption, streamFactory); + case Geometry: + case Geography: + return new GeospatialTreeWriter(schema, encryption, streamFactory); default: throw new IllegalArgumentException("Bad category: " + schema.getCategory()); diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java new file mode 100644 index 0000000000..57ee55eaf1 --- /dev/null +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; +import org.locationtech.jts.geom.GeometryFactory; +import org.locationtech.jts.io.WKBReader; +import org.locationtech.jts.io.WKBWriter; + +import java.io.File; +import java.util.Arrays; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * + */ +public class TestOrcGeospatial { + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + Configuration conf; + FileSystem fs; + Path testFilePath; + + public TestOrcGeospatial() { + } + + @BeforeEach + public void openFileSystem(TestInfo testInfo) throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcGeospatial." + + testInfo.getTestMethod().get().getName() + ".orc"); + fs.delete(testFilePath, false); + } + + @Test + public void testGeometryWriter() throws Exception { + TypeDescription schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + long sum = 0; + for (int i = 0; i < 100; i++) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + sum += bytes.length; + geos.setVal(batch.size++, bytes); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + idx += 1; + } + } + rows.close(); + } + + @Test + public void testGeographyWriter() throws Exception { + TypeDescription schema = TypeDescription.createGeography(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); + + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + long sum = 0; + for (int i = 0; i < 100; i++) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + sum += bytes.length; + geos.setVal(batch.size++, bytes); + } + writer.addRowBatch(batch); + writer.close(); + + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for(int r=0; r < batch.size; ++r) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + idx += 1; + } + } + rows.close(); + } +} diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index f16d042fdb..7797045887 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -27,7 +27,13 @@ import org.apache.orc.TestConf; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; +import org.apache.hadoop.io.BytesWritable; +import org.apache.orc.*; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; import org.junit.jupiter.api.Test; +import org.locationtech.jts.geom.*; +import org.locationtech.jts.io.WKBWriter; import java.io.IOException; import java.util.TimeZone; @@ -217,4 +223,157 @@ public void testCollectionColumnStats() { assertEquals(100, collectionStatistics.getTotalChildren()); } + + @Test + public void testUpdateGeometry() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + for (byte[] point : points) { + stats.updateGeometry(new BytesWritable(point)); + } + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateGeometryWithDifferentTypes() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + Point point = geometryFactory.createPoint(new Coordinate(1, 1)); + + Coordinate[] lineCoords = new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}; + LineString line = geometryFactory.createLineString(lineCoords); + + Coordinate[] polygonCoords = new Coordinate[] { + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) + }; + LinearRing shell = geometryFactory.createLinearRing(polygonCoords); + Polygon polygon = geometryFactory.createPolygon(shell); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + // Generate WKB and update stats + byte[] pointWkb = wkbWriter.write(point); + stats.updateGeometry(new BytesWritable(pointWkb)); + + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(1.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString()); + + + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + byte[] lineWkb = wkbWriter.write(line); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(2, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", + geospatialTypes.toString()); + + byte[] polygonWkb = wkbWriter.write(polygon); + stats.updateGeometry(new BytesWritable(polygonWkb)); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(3.0, bbox.getXMax(), 0.0); + assertEquals(0.0, bbox.getYMin(), 0.0); + assertEquals(3.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(3, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}", + geospatialTypes.toString()); + + } + + @Test + public void testUpdateGeometryWithZCoordinates() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(3); + + Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2)); + Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0)); + + stats.updateGeometry(new BytesWritable(wkbWriter.write(point1))); + stats.updateGeometry(new BytesWritable(wkbWriter.write(point2))); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(0.0, bbox.getZMin(), 0.0); + assertEquals(2.0, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1001)); + assertEquals(1, geospatialTypes.getTypes().size()); + } } diff --git a/java/pom.xml b/java/pom.xml index 50df3df920..c510e68141 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -85,6 +85,7 @@ 3.5.3 ${project.build.directory}/testing-tmp 1.5.7-3 + 1.19.0 From 05135cbfae5843f6b201ef5e539ebaedce92dbb7 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 2 Jun 2025 23:24:40 +0800 Subject: [PATCH 02/16] Fix some bugs --- .../src/java/org/apache/orc/OrcUtils.java | 48 ++++++++++--------- .../java/org/apache/orc/TypeDescription.java | 1 + .../apache/orc/impl/TreeReaderFactory.java | 1 + .../org/apache/orc/TestOrcGeospatial.java | 2 + 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 9e172acb19..3462f23a9b 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -171,30 +171,34 @@ private static void appendOrcTypes(List result, TypeDescription t type.setPrecision(typeDescr.getPrecision()); type.setScale(typeDescr.getScale()); break; - case Geometry: case Geography: - type.setKind(OrcProto.Type.Kind.GEOMETRY); - type.setCrs(typeDescr.getCRS()); - switch (typeDescr.getEdgeInterpolationAlgorithm()) { - case SPHERICAL: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL); - break; - case VINCENTY: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY); - break; - case THOMAS: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS); - break; - case ANDOYER: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER); - break; - case KARNEY: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY); - break; - default: - throw new IllegalArgumentException("Unknown interpolation algorithm: " + - typeDescr.getEdgeInterpolationAlgorithm()); + case Geometry: + if (typeDescr.getCategory() == TypeDescription.Category.Geography) { + type.setKind(OrcProto.Type.Kind.GEOGRAPHY); + switch (typeDescr.getEdgeInterpolationAlgorithm()) { + case SPHERICAL: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL); + break; + case VINCENTY: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY); + break; + case THOMAS: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS); + break; + case ANDOYER: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER); + break; + case KARNEY: + type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY); + break; + default: + throw new IllegalArgumentException("Unknown interpolation algorithm: " + + typeDescr.getEdgeInterpolationAlgorithm()); + } + } else { + type.setKind(OrcProto.Type.Kind.GEOMETRY); } + type.setCrs(typeDescr.getCRS()); break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 54b6c7cd4c..827944637b 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -752,6 +752,7 @@ public void printToBuffer(StringBuilder buffer) { buffer.append(','); buffer.append(edgeInterpolationAlgorithm.name()); buffer.append(')'); + break; case LIST: case MAP: case UNION: diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java index d12f97c5c4..785f568ff4 100644 --- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java +++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java @@ -3034,6 +3034,7 @@ public static TypeReader createTreeReader(TypeDescription readerType, } return new DecimalTreeReader(fileType.getId(), fileType.getPrecision(), fileType.getScale(), context); + case Geography: case Geometry: return new GeospatialTreeReader(fileType.getId(), context); case STRUCT: diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 57ee55eaf1..3d58008aaa 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -81,6 +81,7 @@ public void testGeometryWriter() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(reader.getSchema().toString(), "geometry(OGC:CRS84)"); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); geos = (BytesColumnVector) batch.cols[0]; @@ -118,6 +119,7 @@ public void testGeographyWriter() throws Exception { writer.close(); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(reader.getSchema().toString(), "geography(OGC:CRS84,SPHERICAL)"); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); geos = (BytesColumnVector) batch.cols[0]; From d5cd48541571a71a4d5ff678146b2481774d88ac Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 2 Jun 2025 23:41:44 +0800 Subject: [PATCH 03/16] Fix spotless --- .../src/java/org/apache/orc/geospatial/GeospatialTypes.java | 5 +++-- java/pom.xml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java index 938e6e5548..984782e1e4 100644 --- a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -19,11 +19,12 @@ package org.apache.orc.geospatial; +import org.locationtech.jts.geom.Coordinate; +import org.locationtech.jts.geom.Geometry; + import java.util.HashSet; import java.util.Set; import java.util.stream.Collectors; -import org.locationtech.jts.geom.Coordinate; -import org.locationtech.jts.geom.Geometry; public class GeospatialTypes { diff --git a/java/pom.xml b/java/pom.xml index c510e68141..1166132cf7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -67,6 +67,7 @@ 3.4.1 17 ${project.basedir}/../target/javadoc + 1.19.0 5.12.2 3.7.1 3.8.1 @@ -85,7 +86,6 @@ 3.5.3 ${project.build.directory}/testing-tmp 1.5.7-3 - 1.19.0 From 03385105938282cff223ac7ba16519ed0fc71843 Mon Sep 17 00:00:00 2001 From: ffacs Date: Tue, 3 Jun 2025 17:07:04 +0800 Subject: [PATCH 04/16] Fix spotbugs --- .../java/org/apache/orc/TypeDescription.java | 1 + .../apache/orc/geospatial/BoundingBox.java | 20 +++++++++++++++ .../orc/geospatial/GeospatialTypes.java | 16 ++++++++++++ .../apache/orc/impl/ColumnStatisticsImpl.java | 25 +++++++++++++++++++ 4 files changed, 62 insertions(+) diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 827944637b..25a70b4328 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -822,6 +822,7 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer, buffer.append(CRS); buffer.append(", \"edge_interpolation_algorithm\": "); buffer.append(edgeInterpolationAlgorithm.name()); + break; case LIST: case MAP: case UNION: diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java index 7ebe97f6aa..edc74b12e1 100644 --- a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -52,6 +52,26 @@ public BoundingBox( valid = isXYValid(); } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof BoundingBox other)) { + return false; + } + if (obj == this) { + return true; + } + return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax + && zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax + && valid == other.valid; + } + + @Override + public int hashCode() { + return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ Double.hashCode(yMin) ^ Double.hashCode(yMax) + ^ Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ Double.hashCode(mMin) ^ Double.hashCode(mMax) + ^ Boolean.hashCode(valid); + } + private void resetBBox() { xMin = Double.POSITIVE_INFINITY; xMax = Double.NEGATIVE_INFINITY; diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java index 984782e1e4..2a82d133bb 100644 --- a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -42,6 +42,22 @@ public GeospatialTypes(Set types, boolean valid) { this.valid = valid; } + @Override + public boolean equals(Object obj) { + if (!(obj instanceof GeospatialTypes other)) { + return false; + } + if (obj == this) { + return true; + } + return valid == other.valid && types.equals(other.types); + } + + @Override + public int hashCode() { + return types.hashCode() ^ Boolean.hashCode(valid); + } + public GeospatialTypes() {} public Set getTypes() { diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index b1470926fc..86d282cfff 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -1977,6 +1977,31 @@ public BoundingBox getBoundingBox() { public GeospatialTypes getGeospatialTypes() { return geospatialTypes; } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof GeospatialStatisticsImpl that)) { + return false; + } + if (!super.equals(o)) { + return false; + } + + return boundingBox.equals(that.boundingBox) && + geospatialTypes.equals(that.geospatialTypes); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + boundingBox.hashCode(); + result = prime * result + geospatialTypes.hashCode(); + return result; + } } ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { From 4b500935557bff7437bcf0e71d68ce9fdb067cbf Mon Sep 17 00:00:00 2001 From: ffacs Date: Tue, 3 Jun 2025 17:45:59 +0800 Subject: [PATCH 05/16] Fix checkstyle --- .../orc/GeospatialColumnStatistics.java | 2 +- .../java/org/apache/orc/TypeDescription.java | 18 +- .../apache/orc/geospatial/BoundingBox.java | 646 +++++++++--------- .../orc/geospatial/GeospatialTypes.java | 2 +- .../apache/orc/impl/ColumnStatisticsImpl.java | 14 +- .../orc/impl/writer/GeospatialTreeWriter.java | 212 +++--- .../org/apache/orc/TestOrcGeospatial.java | 166 ++--- .../orc/impl/TestColumnStatisticsImpl.java | 22 +- 8 files changed, 552 insertions(+), 530 deletions(-) diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java index 4cac17eace..538eba2b0e 100644 --- a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java +++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java @@ -25,4 +25,4 @@ public interface GeospatialColumnStatistics extends ColumnStatistics { BoundingBox getBoundingBox(); GeospatialTypes getGeospatialTypes(); -} \ No newline at end of file +} diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 25a70b4328..342cd998e8 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -202,9 +202,13 @@ public static TypeDescription createDecimal() { return new TypeDescription(Category.DECIMAL); } - public static TypeDescription createGeometry() { return new TypeDescription(Category.Geometry); } - - public static TypeDescription createGeography() { return new TypeDescription(Category.Geography); } + public static TypeDescription createGeometry() { + return new TypeDescription(Category.Geometry); + } + + public static TypeDescription createGeography() { + return new TypeDescription(Category.Geography); + } /** * Parse TypeDescription from the Hive type names. This is the inverse @@ -268,10 +272,11 @@ public TypeDescription withCRS(String crs) { return this; } - public TypeDescription withEdgeInterpolationAlgorithm(EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) { + public TypeDescription withEdgeInterpolationAlgorithm( + EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) { if (category != Category.Geography) { throw new IllegalArgumentException("edgeInterpolationAlgorithm is only allowed on Geography" + - " and not " + category.name); + " and not " + category.name); } this.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; return this; @@ -713,7 +718,8 @@ public TypeDescription(Category category) { private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; private String CRS = "OGC:CRS84"; - private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = EdgeInterpolationAlgorithm.SPHERICAL; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = + EdgeInterpolationAlgorithm.SPHERICAL; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java index edc74b12e1..61c280636b 100644 --- a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -25,341 +25,345 @@ public class BoundingBox { - private double xMin = Double.POSITIVE_INFINITY; - private double xMax = Double.NEGATIVE_INFINITY; - private double yMin = Double.POSITIVE_INFINITY; - private double yMax = Double.NEGATIVE_INFINITY; - private double zMin = Double.POSITIVE_INFINITY; - private double zMax = Double.NEGATIVE_INFINITY; - private double mMin = Double.POSITIVE_INFINITY; - private double mMax = Double.NEGATIVE_INFINITY; - private boolean valid = true; - - public BoundingBox() {} - - public BoundingBox( - double xMin, double xMax, double yMin, double yMax, double zMin, double zMax, double mMin, double mMax) { - this.xMin = xMin; - this.xMax = xMax; - this.yMin = yMin; - this.yMax = yMax; - this.zMin = zMin; - this.zMax = zMax; - this.mMin = mMin; - this.mMax = mMax; - - // Update the validity - valid = isXYValid(); + private double xMin = Double.POSITIVE_INFINITY; + private double xMax = Double.NEGATIVE_INFINITY; + private double yMin = Double.POSITIVE_INFINITY; + private double yMax = Double.NEGATIVE_INFINITY; + private double zMin = Double.POSITIVE_INFINITY; + private double zMax = Double.NEGATIVE_INFINITY; + private double mMin = Double.POSITIVE_INFINITY; + private double mMax = Double.NEGATIVE_INFINITY; + private boolean valid = true; + + public BoundingBox() { + } + + public BoundingBox( + double xMin, double xMax, double yMin, double yMax, + double zMin, double zMax, double mMin, double mMax) { + this.xMin = xMin; + this.xMax = xMax; + this.yMin = yMin; + this.yMax = yMax; + this.zMin = zMin; + this.zMax = zMax; + this.mMin = mMin; + this.mMax = mMax; + + // Update the validity + valid = isXYValid(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof BoundingBox other)) { + return false; } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof BoundingBox other)) { - return false; - } - if (obj == this) { - return true; - } - return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax - && zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax - && valid == other.valid; - } - - @Override - public int hashCode() { - return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ Double.hashCode(yMin) ^ Double.hashCode(yMax) - ^ Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ Double.hashCode(mMin) ^ Double.hashCode(mMax) - ^ Boolean.hashCode(valid); - } - - private void resetBBox() { - xMin = Double.POSITIVE_INFINITY; - xMax = Double.NEGATIVE_INFINITY; - yMin = Double.POSITIVE_INFINITY; - yMax = Double.NEGATIVE_INFINITY; - zMin = Double.POSITIVE_INFINITY; - zMax = Double.NEGATIVE_INFINITY; - mMin = Double.POSITIVE_INFINITY; - mMax = Double.NEGATIVE_INFINITY; - } - - public double getXMin() { - return xMin; - } - - public double getXMax() { - return xMax; - } - - public double getYMin() { - return yMin; - } - - public double getYMax() { - return yMax; - } - - public double getZMin() { - return zMin; - } - - public double getZMax() { - return zMax; - } - - public double getMMin() { - return mMin; - } - - public double getMMax() { - return mMax; + if (obj == this) { + return true; } - - /** - * Checks if the bounding box is valid. - * A bounding box is considered valid if none of the X / Y dimensions contain NaN. - * - * @return true if the bounding box is valid, false otherwise. - */ - public boolean isValid() { - return valid; - } - - /** - * Checks if the X and Y dimensions of the bounding box are valid. - * The X and Y dimensions are considered valid if none of the bounds contain NaN. - * - * @return true if the X and Y dimensions are valid, false otherwise. - */ - public boolean isXYValid() { - return isXValid() && isYValid(); + return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax && + zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax && + valid == other.valid; + } + + @Override + public int hashCode() { + return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ + Double.hashCode(yMin) ^ Double.hashCode(yMax) ^ + Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ + Double.hashCode(mMin) ^ Double.hashCode(mMax) ^ + Boolean.hashCode(valid); + } + + private void resetBBox() { + xMin = Double.POSITIVE_INFINITY; + xMax = Double.NEGATIVE_INFINITY; + yMin = Double.POSITIVE_INFINITY; + yMax = Double.NEGATIVE_INFINITY; + zMin = Double.POSITIVE_INFINITY; + zMax = Double.NEGATIVE_INFINITY; + mMin = Double.POSITIVE_INFINITY; + mMax = Double.NEGATIVE_INFINITY; + } + + public double getXMin() { + return xMin; + } + + public double getXMax() { + return xMax; + } + + public double getYMin() { + return yMin; + } + + public double getYMax() { + return yMax; + } + + public double getZMin() { + return zMin; + } + + public double getZMax() { + return zMax; + } + + public double getMMin() { + return mMin; + } + + public double getMMax() { + return mMax; + } + + /** + * Checks if the bounding box is valid. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + * + * @return true if the bounding box is valid, false otherwise. + */ + public boolean isValid() { + return valid; + } + + /** + * Checks if the X and Y dimensions of the bounding box are valid. + * The X and Y dimensions are considered valid if none of the bounds contain NaN. + * + * @return true if the X and Y dimensions are valid, false otherwise. + */ + public boolean isXYValid() { + return isXValid() && isYValid(); + } + + /** + * Checks if the X dimension of the bounding box is valid. + * The X dimension is considered valid if neither bound contains NaN. + * + * @return true if the X dimension is valid, false otherwise. + */ + public boolean isXValid() { + return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + } + + /** + * Checks if the Y dimension of the bounding box is valid. + * The Y dimension is considered valid if neither bound contains NaN. + * + * @return true if the Y dimension is valid, false otherwise. + */ + public boolean isYValid() { + return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + } + + /** + * Checks if the Z dimension of the bounding box is valid. + * The Z dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the Z dimension is valid, false otherwise. + */ + public boolean isZValid() { + return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + } + + /** + * Checks if the M dimension of the bounding box is valid. + * The M dimension is considered valid if none of the bounds contain NaN. + * + * @return true if the M dimension is valid, false otherwise. + */ + public boolean isMValid() { + return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + } + + /** + * Checks if the bounding box is empty in the X / Y dimension. + * + * @return true if the bounding box is empty, false otherwise. + */ + public boolean isXYEmpty() { + return isXEmpty() || isYEmpty(); + } + + /** + * Checks if the bounding box is empty in the X dimension. + * + * @return true if the X dimension is empty, false otherwise. + */ + public boolean isXEmpty() { + return Double.isInfinite(xMin) && Double.isInfinite(xMax); + } + + /** + * Checks if the bounding box is empty in the Y dimension. + * + * @return true if the Y dimension is empty, false otherwise. + */ + public boolean isYEmpty() { + return Double.isInfinite(yMin) && Double.isInfinite(yMax); + } + + /** + * Checks if the bounding box is empty in the Z dimension. + * + * @return true if the Z dimension is empty, false otherwise. + */ + public boolean isZEmpty() { + return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Checks if the bounding box is empty in the M dimension. + * + * @return true if the M dimension is empty, false otherwise. + */ + public boolean isMEmpty() { + return Double.isInfinite(mMin) && Double.isInfinite(mMax); + } + + /** + * Expands this bounding box to include the bounds of another box. + * After merging, this bounding box will contain both its original extent + * and the extent of the other bounding box. + * + * @param other the other BoundingBox whose bounds will be merged into this one + */ + public void merge(BoundingBox other) { + if (!valid) { + return; } - /** - * Checks if the X dimension of the bounding box is valid. - * The X dimension is considered valid if neither bound contains NaN. - * - * @return true if the X dimension is valid, false otherwise. - */ - public boolean isXValid() { - return !(Double.isNaN(xMin) || Double.isNaN(xMax)); + // If other is null or invalid, mark this as invalid + if (other == null || !other.valid) { + valid = false; + resetBBox(); + return; } - /** - * Checks if the Y dimension of the bounding box is valid. - * The Y dimension is considered valid if neither bound contains NaN. - * - * @return true if the Y dimension is valid, false otherwise. - */ - public boolean isYValid() { - return !(Double.isNaN(yMin) || Double.isNaN(yMax)); + this.xMin = Math.min(this.xMin, other.xMin); + this.xMax = Math.max(this.xMax, other.xMax); + this.yMin = Math.min(this.yMin, other.yMin); + this.yMax = Math.max(this.yMax, other.yMax); + this.zMin = Math.min(this.zMin, other.zMin); + this.zMax = Math.max(this.zMax, other.zMax); + this.mMin = Math.min(this.mMin, other.mMin); + this.mMax = Math.max(this.mMax, other.mMax); + + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Extends this bounding box to include the spatial extent of the provided geometry. + * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted + * to encompass both the current bounds and the geometry's bounds. + * + * @param geometry The geometry whose coordinates will be used to update this bounding box. + * If null or empty, the method returns without making any changes. + */ + public void update(Geometry geometry) { + if (!valid) { + return; } - /** - * Checks if the Z dimension of the bounding box is valid. - * The Z dimension is considered valid if none of the bounds contain NaN. - * - * @return true if the Z dimension is valid, false otherwise. - */ - public boolean isZValid() { - return !(Double.isNaN(zMin) || Double.isNaN(zMax)); + if (geometry == null || geometry.isEmpty()) { + return; } - /** - * Checks if the M dimension of the bounding box is valid. - * The M dimension is considered valid if none of the bounds contain NaN. - * - * @return true if the M dimension is valid, false otherwise. - */ - public boolean isMValid() { - return !(Double.isNaN(mMin) || Double.isNaN(mMax)); + Envelope envelope = geometry.getEnvelopeInternal(); + updateBounds(envelope.getMinX(), envelope.getMaxX(), envelope.getMinY(), envelope.getMaxY()); + + for (Coordinate coord : geometry.getCoordinates()) { + if (!Double.isNaN(coord.getZ())) { + zMin = Math.min(zMin, coord.getZ()); + zMax = Math.max(zMax, coord.getZ()); + } + if (!Double.isNaN(coord.getM())) { + mMin = Math.min(mMin, coord.getM()); + mMax = Math.max(mMax, coord.getM()); + } } - /** - * Checks if the bounding box is empty in the X / Y dimension. - * - * @return true if the bounding box is empty, false otherwise. - */ - public boolean isXYEmpty() { - return isXEmpty() || isYEmpty(); + // Update the validity of this bounding box based on the other bounding box + valid = isXYValid(); + } + + /** + * Updates the X and Y bounds of this bounding box with the given coordinates. + * Updates are conditional: + * - X bounds are only updated if both minX and maxX are not NaN + * - Y bounds are only updated if both minY and maxY are not NaN + * This allows partial updates while preserving valid dimensions. + */ + private void updateBounds(double minX, double maxX, double minY, double maxY) { + if (!Double.isNaN(minX) && !Double.isNaN(maxX)) { + xMin = Math.min(xMin, minX); + xMax = Math.max(xMax, maxX); } - /** - * Checks if the bounding box is empty in the X dimension. - * - * @return true if the X dimension is empty, false otherwise. - */ - public boolean isXEmpty() { - return Double.isInfinite(xMin) && Double.isInfinite(xMax); + if (!Double.isNaN(minY) && !Double.isNaN(maxY)) { + yMin = Math.min(yMin, minY); + yMax = Math.max(yMax, maxY); } - - /** - * Checks if the bounding box is empty in the Y dimension. - * - * @return true if the Y dimension is empty, false otherwise. - */ - public boolean isYEmpty() { - return Double.isInfinite(yMin) && Double.isInfinite(yMax); - } - - /** - * Checks if the bounding box is empty in the Z dimension. - * - * @return true if the Z dimension is empty, false otherwise. - */ - public boolean isZEmpty() { - return Double.isInfinite(zMin) && Double.isInfinite(zMax); + } + + /** + * Aborts the bounding box by resetting it to its initial state. + */ + public void abort() { + valid = false; + resetBBox(); + } + + /** + * Resets the bounding box to its initial state. + */ + public void reset() { + resetBBox(); + valid = true; + } + + /** + * Creates a copy of the current bounding box. + * + * @return a new BoundingBox instance with the same values as this one. + */ + public BoundingBox copy() { + return new BoundingBox( + this.xMin, this.xMax, + this.yMin, this.yMax, + this.zMin, this.zMax, + this.mMin, this.mMax); + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("BoundingBox{xMin=") + .append(xMin) + .append(", xMax=") + .append(xMax) + .append(", yMin=") + .append(yMin) + .append(", yMax=") + .append(yMax) + .append(", zMin=") + .append(zMin) + .append(", zMax=") + .append(zMax) + .append(", mMin=") + .append(mMin) + .append(", mMax=") + .append(mMax); + + // Only include the valid flag when it's false + if (!valid) { + sb.append(", valid=false"); } - /** - * Checks if the bounding box is empty in the M dimension. - * - * @return true if the M dimension is empty, false otherwise. - */ - public boolean isMEmpty() { - return Double.isInfinite(mMin) && Double.isInfinite(mMax); - } - - /** - * Expands this bounding box to include the bounds of another box. - * After merging, this bounding box will contain both its original extent - * and the extent of the other bounding box. - * - * @param other the other BoundingBox whose bounds will be merged into this one - */ - public void merge(BoundingBox other) { - if (!valid) { - return; - } - - // If other is null or invalid, mark this as invalid - if (other == null || !other.valid) { - valid = false; - resetBBox(); - return; - } - - this.xMin = Math.min(this.xMin, other.xMin); - this.xMax = Math.max(this.xMax, other.xMax); - this.yMin = Math.min(this.yMin, other.yMin); - this.yMax = Math.max(this.yMax, other.yMax); - this.zMin = Math.min(this.zMin, other.zMin); - this.zMax = Math.max(this.zMax, other.zMax); - this.mMin = Math.min(this.mMin, other.mMin); - this.mMax = Math.max(this.mMax, other.mMax); - - // Update the validity of this bounding box based on the other bounding box - valid = isXYValid(); - } - - /** - * Extends this bounding box to include the spatial extent of the provided geometry. - * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted - * to encompass both the current bounds and the geometry's bounds. - * - * @param geometry The geometry whose coordinates will be used to update this bounding box. - * If null or empty, the method returns without making any changes. - */ - public void update(Geometry geometry) { - if (!valid) { - return; - } - - if (geometry == null || geometry.isEmpty()) { - return; - } - - Envelope envelope = geometry.getEnvelopeInternal(); - updateBounds(envelope.getMinX(), envelope.getMaxX(), envelope.getMinY(), envelope.getMaxY()); - - for (Coordinate coord : geometry.getCoordinates()) { - if (!Double.isNaN(coord.getZ())) { - zMin = Math.min(zMin, coord.getZ()); - zMax = Math.max(zMax, coord.getZ()); - } - if (!Double.isNaN(coord.getM())) { - mMin = Math.min(mMin, coord.getM()); - mMax = Math.max(mMax, coord.getM()); - } - } - - // Update the validity of this bounding box based on the other bounding box - valid = isXYValid(); - } - - /** - * Updates the X and Y bounds of this bounding box with the given coordinates. - * Updates are conditional: - * - X bounds are only updated if both minX and maxX are not NaN - * - Y bounds are only updated if both minY and maxY are not NaN - * This allows partial updates while preserving valid dimensions. - */ - private void updateBounds(double minX, double maxX, double minY, double maxY) { - if (!Double.isNaN(minX) && !Double.isNaN(maxX)) { - xMin = Math.min(xMin, minX); - xMax = Math.max(xMax, maxX); - } - - if (!Double.isNaN(minY) && !Double.isNaN(maxY)) { - yMin = Math.min(yMin, minY); - yMax = Math.max(yMax, maxY); - } - } - - /** - * Aborts the bounding box by resetting it to its initial state. - */ - public void abort() { - valid = false; - resetBBox(); - } - - /** - * Resets the bounding box to its initial state. - */ - public void reset() { - resetBBox(); - valid = true; - } - - /** - * Creates a copy of the current bounding box. - * - * @return a new BoundingBox instance with the same values as this one. - */ - public BoundingBox copy() { - return new BoundingBox( - this.xMin, this.xMax, - this.yMin, this.yMax, - this.zMin, this.zMax, - this.mMin, this.mMax); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder("BoundingBox{xMin=") - .append(xMin) - .append(", xMax=") - .append(xMax) - .append(", yMin=") - .append(yMin) - .append(", yMax=") - .append(yMax) - .append(", zMin=") - .append(zMin) - .append(", zMax=") - .append(zMax) - .append(", mMin=") - .append(mMin) - .append(", mMax=") - .append(mMax); - - // Only include the valid flag when it's false - if (!valid) { - sb.append(", valid=false"); - } - - sb.append('}'); - return sb.toString(); - } -} \ No newline at end of file + sb.append('}'); + return sb.toString(); + } +} diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java index 2a82d133bb..b15c379edb 100644 --- a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -237,4 +237,4 @@ private String typeIdToString(int typeId) { } return typeString; } -} \ No newline at end of file +} diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 86d282cfff..dc2e53a557 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -23,7 +23,19 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparator; -import org.apache.orc.*; +import org.apache.orc.BinaryColumnStatistics; +import org.apache.orc.BooleanColumnStatistics; +import org.apache.orc.CollectionColumnStatistics; +import org.apache.orc.ColumnStatistics; +import org.apache.orc.DateColumnStatistics; +import org.apache.orc.DecimalColumnStatistics; +import org.apache.orc.DoubleColumnStatistics; +import org.apache.orc.IntegerColumnStatistics; +import org.apache.orc.OrcProto; +import org.apache.orc.StringColumnStatistics; +import org.apache.orc.TimestampColumnStatistics; +import org.apache.orc.GeospatialColumnStatistics; +import org.apache.orc.TypeDescription; import org.apache.orc.geospatial.BoundingBox; import org.apache.orc.geospatial.GeospatialTypes; import org.locationtech.jts.geom.Geometry; diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java index 308a8e4629..e9a0aa70bf 100644 --- a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java +++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java @@ -33,125 +33,125 @@ import java.util.function.Consumer; public class GeospatialTreeWriter extends TreeWriterBase { - private final PositionedOutputStream stream; - private final IntegerWriter length; - private boolean isDirectV2 = true; - private long rawDataSize = 0; - private boolean isGeometry = false; + private final PositionedOutputStream stream; + private final IntegerWriter length; + private boolean isDirectV2 = true; + private long rawDataSize = 0; + private boolean isGeometry = false; - public GeospatialTreeWriter(TypeDescription schema, - WriterEncryptionVariant encryption, - WriterContext context) throws IOException { - super(schema, encryption, context); - this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry; - this.stream = context.createStream( - new StreamName(id, OrcProto.Stream.Kind.DATA, encryption)); - this.isDirectV2 = isNewWriteFormat(context); - this.length = createIntegerWriter(context.createStream( - new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)), - false, isDirectV2, context); - if (rowIndexPosition != null) { - recordPosition(rowIndexPosition); - } + public GeospatialTreeWriter(TypeDescription schema, + WriterEncryptionVariant encryption, + WriterContext context) throws IOException { + super(schema, encryption, context); + this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry; + this.stream = context.createStream( + new StreamName(id, OrcProto.Stream.Kind.DATA, encryption)); + this.isDirectV2 = isNewWriteFormat(context); + this.length = createIntegerWriter(context.createStream( + new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)), + false, isDirectV2, context); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); } + } - @Override - OrcProto.ColumnEncoding.Builder getEncoding() { - OrcProto.ColumnEncoding.Builder result = super.getEncoding(); - if (isDirectV2) { - result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); - } else { - result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); - } - return result; + @Override + OrcProto.ColumnEncoding.Builder getEncoding() { + OrcProto.ColumnEncoding.Builder result = super.getEncoding(); + if (isDirectV2) { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2); + } else { + result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT); } + return result; + } - @Override - public void writeBatch(ColumnVector vector, int offset, - int length) throws IOException { - super.writeBatch(vector, offset, length); - BytesColumnVector vec = (BytesColumnVector) vector; - if (vector.isRepeating) { - if (vector.noNulls || !vector.isNull[0]) { - for (int i = 0; i < length; ++i) { - stream.write(vec.vector[0], vec.start[0], - vec.length[0]); - this.length.write(vec.length[0]); - } - rawDataSize += (long)length * vec.length[0]; - if (isGeometry) { - indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]); - } - if (createBloomFilter) { - if (bloomFilter != null) { - bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); - } - bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); - } - } - } else { - for (int i = 0; i < length; ++i) { - if (vec.noNulls || !vec.isNull[i + offset]) { - stream.write(vec.vector[offset + i], - vec.start[offset + i], vec.length[offset + i]); - this.length.write(vec.length[offset + i]); - rawDataSize += vec.length[offset + i]; - BytesWritable bw = new BytesWritable(); - bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); - if (isGeometry) { - indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]); - } - if (createBloomFilter) { - if (bloomFilter != null) { - bloomFilter.addBytes(vec.vector[offset + i], - vec.start[offset + i], vec.length[offset + i]); - } - bloomFilterUtf8.addBytes(vec.vector[offset + i], - vec.start[offset + i], vec.length[offset + i]); - } - } + @Override + public void writeBatch(ColumnVector vector, int offset, + int length) throws IOException { + super.writeBatch(vector, offset, length); + BytesColumnVector vec = (BytesColumnVector) vector; + if (vector.isRepeating) { + if (vector.noNulls || !vector.isNull[0]) { + for (int i = 0; i < length; ++i) { + stream.write(vec.vector[0], vec.start[0], + vec.length[0]); + this.length.write(vec.length[0]); + } + rawDataSize += (long) length * vec.length[0]; + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]); + } + } + } else { + for (int i = 0; i < length; ++i) { + if (vec.noNulls || !vec.isNull[i + offset]) { + stream.write(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + this.length.write(vec.length[offset + i]); + rawDataSize += vec.length[offset + i]; + BytesWritable bw = new BytesWritable(); + bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]); + if (isGeometry) { + indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]); + } + if (createBloomFilter) { + if (bloomFilter != null) { + bloomFilter.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); } + bloomFilterUtf8.addBytes(vec.vector[offset + i], + vec.start[offset + i], vec.length[offset + i]); + } } + } } + } - @Override - public void writeStripe(int requiredIndexEntries) throws IOException { - super.writeStripe(requiredIndexEntries); - if (rowIndexPosition != null) { - recordPosition(rowIndexPosition); - } + @Override + public void writeStripe(int requiredIndexEntries) throws IOException { + super.writeStripe(requiredIndexEntries); + if (rowIndexPosition != null) { + recordPosition(rowIndexPosition); } + } - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - stream.getPosition(recorder); - length.getPosition(recorder); - } + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + stream.getPosition(recorder); + length.getPosition(recorder); + } - @Override - public long estimateMemory() { - return super.estimateMemory() + stream.getBufferSize() + - length.estimateMemory(); - } + @Override + public long estimateMemory() { + return super.estimateMemory() + stream.getBufferSize() + + length.estimateMemory(); + } - @Override - public long getRawDataSize() { - return rawDataSize; - } + @Override + public long getRawDataSize() { + return rawDataSize; + } - @Override - public void flushStreams() throws IOException { - super.flushStreams(); - stream.flush(); - length.flush(); - } + @Override + public void flushStreams() throws IOException { + super.flushStreams(); + stream.flush(); + length.flush(); + } - @Override - public void prepareStripe(int stripeId) { - super.prepareStripe(stripeId); - Consumer updater = CryptoUtils.modifyIvForStripe(stripeId); - stream.changeIv(updater); - length.changeIv(updater); - } + @Override + public void prepareStripe(int stripeId) { + super.prepareStripe(stripeId); + Consumer updater = CryptoUtils.modifyIvForStripe(stripeId); + stream.changeIv(updater); + length.changeIv(updater); + } } diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 3d58008aaa..77c4efafca 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -41,97 +41,97 @@ * */ public class TestOrcGeospatial { - Path workDir = new Path(System.getProperty("test.tmp.dir", - "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; - FileSystem fs; - Path testFilePath; + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + Configuration conf; + FileSystem fs; + Path testFilePath; - public TestOrcGeospatial() { - } + public TestOrcGeospatial() { + } - @BeforeEach - public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); - fs = FileSystem.getLocal(conf); - testFilePath = new Path(workDir, "TestOrcGeospatial." + - testInfo.getTestMethod().get().getName() + ".orc"); - fs.delete(testFilePath, false); - } + @BeforeEach + public void openFileSystem(TestInfo testInfo) throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + testFilePath = new Path(workDir, "TestOrcGeospatial." + + testInfo.getTestMethod().get().getName() + ".orc"); + fs.delete(testFilePath, false); + } - @Test - public void testGeometryWriter() throws Exception { - TypeDescription schema = TypeDescription.createGeometry(); - Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) - .bufferSize(10000)); - GeometryFactory geometryFactory = new GeometryFactory(); - WKBWriter wkbWriter = new WKBWriter(); - WKBReader wkbReader = new WKBReader(); + @Test + public void testGeometryWriter() throws Exception { + TypeDescription schema = TypeDescription.createGeometry(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); - VectorizedRowBatch batch = schema.createRowBatch(); - BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; - long sum = 0; - for (int i = 0; i < 100; i++) { - byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); - sum += bytes.length; - geos.setVal(batch.size++, bytes); - } - writer.addRowBatch(batch); - writer.close(); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + long sum = 0; + for (int i = 0; i < 100; i++) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + sum += bytes.length; + geos.setVal(batch.size++, bytes); + } + writer.addRowBatch(batch); + writer.close(); - Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(reader.getSchema().toString(), "geometry(OGC:CRS84)"); - RecordReader rows = reader.rows(); - batch = reader.getSchema().createRowBatch(); - geos = (BytesColumnVector) batch.cols[0]; - int idx = 0; - while (rows.nextBatch(batch)) { - for(int r=0; r < batch.size; ++r) { - Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); - assertEquals("Point", geom.getGeometryType()); - assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); - idx += 1; - } - } - rows.close(); + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(reader.getSchema().toString(), "geometry(OGC:CRS84)"); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + idx += 1; + } } + rows.close(); + } - @Test - public void testGeographyWriter() throws Exception { - TypeDescription schema = TypeDescription.createGeography(); - Writer writer = OrcFile.createWriter(testFilePath, - OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) - .bufferSize(10000)); - GeometryFactory geometryFactory = new GeometryFactory(); - WKBWriter wkbWriter = new WKBWriter(); - WKBReader wkbReader = new WKBReader(); + @Test + public void testGeographyWriter() throws Exception { + TypeDescription schema = TypeDescription.createGeography(); + Writer writer = OrcFile.createWriter(testFilePath, + OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) + .bufferSize(10000)); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + WKBReader wkbReader = new WKBReader(); - VectorizedRowBatch batch = schema.createRowBatch(); - BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; - long sum = 0; - for (int i = 0; i < 100; i++) { - byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); - sum += bytes.length; - geos.setVal(batch.size++, bytes); - } - writer.addRowBatch(batch); - writer.close(); + VectorizedRowBatch batch = schema.createRowBatch(); + BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; + long sum = 0; + for (int i = 0; i < 100; i++) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + sum += bytes.length; + geos.setVal(batch.size++, bytes); + } + writer.addRowBatch(batch); + writer.close(); - Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(reader.getSchema().toString(), "geography(OGC:CRS84,SPHERICAL)"); - RecordReader rows = reader.rows(); - batch = reader.getSchema().createRowBatch(); - geos = (BytesColumnVector) batch.cols[0]; - int idx = 0; - while (rows.nextBatch(batch)) { - for(int r=0; r < batch.size; ++r) { - Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); - assertEquals("Point", geom.getGeometryType()); - assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); - idx += 1; - } - } - rows.close(); + Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); + assertEquals(reader.getSchema().toString(), "geography(OGC:CRS84,SPHERICAL)"); + RecordReader rows = reader.rows(); + batch = reader.getSchema().createRowBatch(); + geos = (BytesColumnVector) batch.cols[0]; + int idx = 0; + while (rows.nextBatch(batch)) { + for (int r = 0; r < batch.size; ++r) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + idx += 1; + } } + rows.close(); + } } diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index 7797045887..faa370c23e 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -86,7 +86,7 @@ public void testOldTimestamps() throws IOException { Path file = new Path(exampleDir, "TestOrcFile.testTimestamp.orc"); Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); TimestampColumnStatistics stats = - (TimestampColumnStatistics) reader.getStatistics()[0]; + (TimestampColumnStatistics) reader.getStatistics()[0]; assertEquals("1995-01-01 00:00:00.688", stats.getMinimum().toString()); // ORC-611: add TS stats nanosecond support for older files by using (max TS + 0.999 ms) assertEquals("2037-01-01 00:00:00.000999999", stats.getMaximum().toString()); @@ -140,9 +140,9 @@ public void testTimestamps() { public void testDecimal64Overflow() { TypeDescription schema = TypeDescription.fromString("decimal(18,6)"); OrcProto.ColumnStatistics.Builder pb = - OrcProto.ColumnStatistics.newBuilder(); + OrcProto.ColumnStatistics.newBuilder(); OrcProto.DecimalStatistics.Builder decimalBuilder = - OrcProto.DecimalStatistics.newBuilder(); + OrcProto.DecimalStatistics.newBuilder(); decimalBuilder.setMaximum("1000.0"); decimalBuilder.setMinimum("1.010"); decimalBuilder.setSum("123456789.123456"); @@ -152,7 +152,7 @@ public void testDecimal64Overflow() { // the base case doesn't overflow DecimalColumnStatistics stats1 = (DecimalColumnStatistics) - ColumnStatisticsImpl.deserialize(schema, pb.build()); + ColumnStatisticsImpl.deserialize(schema, pb.build()); ColumnStatisticsImpl updateStats1 = (ColumnStatisticsImpl) stats1; assertEquals("1.01", stats1.getMinimum().toString()); assertEquals("1000", stats1.getMaximum().toString()); @@ -163,7 +163,7 @@ public void testDecimal64Overflow() { decimalBuilder.setSum("1234567890123.45"); pb.setDecimalStatistics(decimalBuilder); DecimalColumnStatistics stats2 = (DecimalColumnStatistics) - ColumnStatisticsImpl.deserialize(schema, pb.build()); + ColumnStatisticsImpl.deserialize(schema, pb.build()); assertNull(stats2.getSum()); // merge them together @@ -269,13 +269,13 @@ public void testUpdateGeometryWithDifferentTypes() { Point point = geometryFactory.createPoint(new Coordinate(1, 1)); - Coordinate[] lineCoords = new Coordinate[] {new Coordinate(1, 1), new Coordinate(2, 2)}; + Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; LineString line = geometryFactory.createLineString(lineCoords); - Coordinate[] polygonCoords = new Coordinate[] { - new Coordinate(0, 0), new Coordinate(3, 0), - new Coordinate(1, 3), new Coordinate(0, 1), - new Coordinate(0, 0) + Coordinate[] polygonCoords = new Coordinate[]{ + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) }; LinearRing shell = geometryFactory.createLinearRing(polygonCoords); Polygon polygon = geometryFactory.createPolygon(shell); @@ -321,7 +321,7 @@ public void testUpdateGeometryWithDifferentTypes() { assertTrue(geospatialTypes.getTypes().contains(2)); assertEquals(2, geospatialTypes.getTypes().size()); assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", - geospatialTypes.toString()); + geospatialTypes.toString()); byte[] polygonWkb = wkbWriter.write(polygon); stats.updateGeometry(new BytesWritable(polygonWkb)); From daba91505f7ea5cafc2807d107263913c024be4b Mon Sep 17 00:00:00 2001 From: ffacs Date: Tue, 3 Jun 2025 19:52:40 +0800 Subject: [PATCH 06/16] Fix spotless again --- .../core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index dc2e53a557..e98882cbd6 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -30,11 +30,11 @@ import org.apache.orc.DateColumnStatistics; import org.apache.orc.DecimalColumnStatistics; import org.apache.orc.DoubleColumnStatistics; +import org.apache.orc.GeospatialColumnStatistics; import org.apache.orc.IntegerColumnStatistics; import org.apache.orc.OrcProto; import org.apache.orc.StringColumnStatistics; import org.apache.orc.TimestampColumnStatistics; -import org.apache.orc.GeospatialColumnStatistics; import org.apache.orc.TypeDescription; import org.apache.orc.geospatial.BoundingBox; import org.apache.orc.geospatial.GeospatialTypes; From e4099f2556c01aff55c3cca3df4ad2f22e416d22 Mon Sep 17 00:00:00 2001 From: ffacs Date: Fri, 6 Jun 2025 01:03:51 +0800 Subject: [PATCH 07/16] Fix some bugs --- .../java/org/apache/orc/TypeDescription.java | 8 +- .../apache/orc/impl/ColumnStatisticsImpl.java | 23 ++- .../org/apache/orc/TestColumnStatistics.java | 193 ++++++++++++++++++ .../org/apache/orc/TestOrcGeospatial.java | 82 ++++++-- .../orc/impl/TestColumnStatisticsImpl.java | 153 -------------- java/pom.xml | 2 +- 6 files changed, 277 insertions(+), 184 deletions(-) diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 342cd998e8..0c7d4a07dc 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -44,6 +44,7 @@ public class TypeDescription public static final long MAX_DECIMAL64 = 999_999_999_999_999_999L; public static final long MIN_DECIMAL64 = -MAX_DECIMAL64; private static final int DEFAULT_LENGTH = 256; + private static final String DEFAULT_CRS = "OGC:CRS84"; static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$"); // type attributes @@ -62,6 +63,8 @@ public enum EdgeInterpolationAlgorithm { } final String name; } + private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM + = EdgeInterpolationAlgorithm.SPHERICAL; @Override public int compareTo(TypeDescription other) { @@ -717,9 +720,8 @@ public TypeDescription(Category category) { private int maxLength = DEFAULT_LENGTH; private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; - private String CRS = "OGC:CRS84"; - private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = - EdgeInterpolationAlgorithm.SPHERICAL; + private String CRS = DEFAULT_CRS; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index e98882cbd6..b9936234bc 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -48,9 +48,7 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; -import java.util.HashSet; -import java.util.Set; -import java.util.TimeZone; +import java.util.*; public class ColumnStatisticsImpl implements ColumnStatistics { @@ -1948,7 +1946,8 @@ public OrcProto.ColumnStatistics.Builder serialize() { OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder(); OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder(); - if (boundingBox.isValid()) { + boolean hasStats = false; + if (boundingBox.isValid() && !boundingBox.isXYEmpty()) { bboxBuilder.setXmin(boundingBox.getXMin()); bboxBuilder.setXmax(boundingBox.getXMax()); bboxBuilder.setYmin(boundingBox.getYMin()); @@ -1957,12 +1956,18 @@ public OrcProto.ColumnStatistics.Builder serialize() { bboxBuilder.setZmax(boundingBox.getZMax()); bboxBuilder.setMmin(boundingBox.getMMin()); bboxBuilder.setMmax(boundingBox.getMMax()); + hasStats = true; + geoStats.setBbox(bboxBuilder); } - if (geospatialTypes.isValid()) { - geoStats.addAllGeospatialTypes(geospatialTypes.getTypes()); + if (geospatialTypes.isValid() && !geospatialTypes.getTypes().isEmpty()) { + List sortedTypes = new ArrayList<>(geospatialTypes.getTypes()); + Collections.sort(sortedTypes); + geoStats.addAllGeospatialTypes(sortedTypes); + hasStats = true; + } + if (hasStats) { + builder.setGeospatialStatistics(geoStats); } - geoStats.setBbox(bboxBuilder); - builder.setGeospatialStatistics(geoStats); return builder; } @@ -2258,6 +2263,8 @@ public static ColumnStatisticsImpl deserialize(TypeDescription schema, writerUsedProlepticGregorian, convertToProlepticGregorian); } else if(stats.hasBinaryStatistics()) { return new BinaryStatisticsImpl(stats); + } else if (stats.hasGeospatialStatistics()) { + return new GeospatialStatisticsImpl(stats); } else { return new ColumnStatisticsImpl(stats); } diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 5fc429199e..13ef5bf698 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -28,10 +28,14 @@ import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import org.apache.orc.geospatial.BoundingBox; +import org.apache.orc.geospatial.GeospatialTypes; import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; +import org.locationtech.jts.geom.*; +import org.locationtech.jts.io.WKBWriter; import java.io.File; import java.math.BigDecimal; @@ -742,6 +746,195 @@ public void testMergeIncompatible() { assertEquals(0, ((DoubleColumnStatistics) doubleStats).getNumberOfValues()); } + @Test + public void testUpdateGeometry() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + for (byte[] point : points) { + stats.updateGeometry(new BytesWritable(point)); + } + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void testUpdateGeometryWithDifferentTypes() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + Point point = geometryFactory.createPoint(new Coordinate(1, 1)); + Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; + LineString line = geometryFactory.createLineString(lineCoords); + Coordinate[] polygonCoords = new Coordinate[]{ + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) + }; + LinearRing shell = geometryFactory.createLinearRing(polygonCoords); + Polygon polygon = geometryFactory.createPolygon(shell); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + // Generate WKB and update stats + byte[] pointWkb = wkbWriter.write(point); + stats.updateGeometry(new BytesWritable(pointWkb)); + + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(1.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString()); + + + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + byte[] lineWkb = wkbWriter.write(line); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(2, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", + geospatialTypes.toString()); + + byte[] polygonWkb = wkbWriter.write(polygon); + stats.updateGeometry(new BytesWritable(polygonWkb)); + stats.updateGeometry(new BytesWritable(lineWkb)); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(3.0, bbox.getXMax(), 0.0); + assertEquals(0.0, bbox.getYMin(), 0.0); + assertEquals(3.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + + assertTrue(geospatialTypes.getTypes().contains(1)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertTrue(geospatialTypes.getTypes().contains(2)); + assertEquals(3, geospatialTypes.getTypes().size()); + assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}", + geospatialTypes.toString()); + + } + + @Test + public void testUpdateGeometryWithZCoordinates() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(3); + + Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2)); + Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0)); + + stats.updateGeometry(new BytesWritable(wkbWriter.write(point1))); + stats.updateGeometry(new BytesWritable(wkbWriter.write(point2))); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(0.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(1.0, bbox.getYMax(), 0.0); + assertEquals(0.0, bbox.getZMin(), 0.0); + assertEquals(2.0, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1001)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + + @Test + public void TestGeospatialMerge() { + TypeDescription desc = TypeDescription.createGeometry(); + ColumnStatisticsImpl stats0 = ColumnStatisticsImpl.create(desc); + ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(desc); + GeometryFactory geometryFactory = new GeometryFactory(); + WKBWriter wkbWriter = new WKBWriter(); + + byte[][] points = { + wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), + wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), + }; + + stats0.updateGeometry(new BytesWritable(points[0])); + stats1.updateGeometry(new BytesWritable(points[1])); + + GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats0; + stats0.merge(stats1); + + BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertEquals(1.0, bbox.getXMin(), 0.0); + assertEquals(2.0, bbox.getXMax(), 0.0); + assertEquals(1.0, bbox.getYMin(), 0.0); + assertEquals(2.0, bbox.getYMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); + assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); + assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); + assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", + bbox.toString()); + assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", + geometryStatistics.toString()); + + GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); + assertTrue(geospatialTypes.getTypes().contains(1)); + assertEquals(1, geospatialTypes.getTypes().size()); + } + Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 77c4efafca..05b2118380 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -23,6 +23,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.orc.impl.ColumnStatisticsImpl; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.TestInfo; @@ -35,7 +36,7 @@ import java.io.File; import java.util.Arrays; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; /** * @@ -60,7 +61,8 @@ public void openFileSystem(TestInfo testInfo) throws Exception { } @Test - public void testGeometryWriter() throws Exception { + public void testGeometryWriterWithNulls() throws Exception { + // Create a geometry schema and ORC file writer TypeDescription schema = TypeDescription.createGeometry(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) @@ -69,28 +71,49 @@ public void testGeometryWriter() throws Exception { WKBWriter wkbWriter = new WKBWriter(); WKBReader wkbReader = new WKBReader(); + // Add data VectorizedRowBatch batch = schema.createRowBatch(); BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; - long sum = 0; for (int i = 0; i < 100; i++) { - byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); - sum += bytes.length; - geos.setVal(batch.size++, bytes); + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } } writer.addRowBatch(batch); writer.close(); + // Verify reader schema Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(reader.getSchema().toString(), "geometry(OGC:CRS84)"); + assertEquals("geometry(OGC:CRS84)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); geos = (BytesColumnVector) batch.cols[0]; + + // Verify statistics + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString()); + assertEquals("GeospatialTypes{types=[Point (XY)]}", ((GeospatialColumnStatistics) stats[0]).getGeospatialTypes().toString()); + + // Verify data int idx = 0; while (rows.nextBatch(batch)) { for (int r = 0; r < batch.size; ++r) { - Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); - assertEquals("Point", geom.getGeometryType()); - assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } idx += 1; } } @@ -98,7 +121,8 @@ public void testGeometryWriter() throws Exception { } @Test - public void testGeographyWriter() throws Exception { + public void testGeographyWriterWithNulls() throws Exception { + // Create geography schema and ORC file writer TypeDescription schema = TypeDescription.createGeography(); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000) @@ -107,28 +131,48 @@ public void testGeographyWriter() throws Exception { WKBWriter wkbWriter = new WKBWriter(); WKBReader wkbReader = new WKBReader(); + // Add data VectorizedRowBatch batch = schema.createRowBatch(); BytesColumnVector geos = (BytesColumnVector) batch.cols[0]; - long sum = 0; for (int i = 0; i < 100; i++) { - byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); - sum += bytes.length; - geos.setVal(batch.size++, bytes); + if (i % 2 == 0) { + byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i))); + geos.setVal(batch.size++, bytes); + } else { + geos.noNulls = false; + geos.isNull[batch.size++] = true; + } } writer.addRowBatch(batch); writer.close(); + // Verify reader schema Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); - assertEquals(reader.getSchema().toString(), "geography(OGC:CRS84,SPHERICAL)"); + assertEquals("geography(OGC:CRS84,SPHERICAL)", reader.getSchema().toString()); + assertEquals(100, reader.getNumberOfRows()); + + // Verify statistics, make sure there is no geospatial stats + ColumnStatistics[] stats = reader.getStatistics(); + assertEquals(1, stats.length); + assertEquals(50, stats[0].getNumberOfValues()); + assertTrue(stats[0].hasNull()); + assertInstanceOf(ColumnStatisticsImpl.class, stats[0]); + assertFalse(stats[0] instanceof GeospatialColumnStatistics); + + // Verify Data RecordReader rows = reader.rows(); batch = reader.getSchema().createRowBatch(); geos = (BytesColumnVector) batch.cols[0]; int idx = 0; while (rows.nextBatch(batch)) { for (int r = 0; r < batch.size; ++r) { - Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); - assertEquals("Point", geom.getGeometryType()); - assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + if (idx % 2 == 0) { + Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r])); + assertEquals("Point", geom.getGeometryType()); + assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx))); + } else { + assertTrue(geos.isNull[r]); + } idx += 1; } } diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index faa370c23e..cc692d3ac5 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -223,157 +223,4 @@ public void testCollectionColumnStats() { assertEquals(100, collectionStatistics.getTotalChildren()); } - - @Test - public void testUpdateGeometry() { - TypeDescription desc = TypeDescription.createGeometry(); - ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); - GeometryFactory geometryFactory = new GeometryFactory(); - WKBWriter wkbWriter = new WKBWriter(); - - byte[][] points = { - wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))), - wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))), - }; - - for (byte[] point : points) { - stats.updateGeometry(new BytesWritable(point)); - } - - GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; - BoundingBox bbox = geometryStatistics.getBoundingBox(); - assertEquals(1.0, bbox.getXMin(), 0.0); - assertEquals(2.0, bbox.getXMax(), 0.0); - assertEquals(1.0, bbox.getYMin(), 0.0); - assertEquals(2.0, bbox.getYMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); - assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", - bbox.toString()); - assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", - geometryStatistics.toString()); - - GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); - assertTrue(geospatialTypes.getTypes().contains(1)); - assertEquals(1, geospatialTypes.getTypes().size()); - } - - @Test - public void testUpdateGeometryWithDifferentTypes() { - TypeDescription desc = TypeDescription.createGeometry(); - ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); - GeometryFactory geometryFactory = new GeometryFactory(); - WKBWriter wkbWriter = new WKBWriter(); - - Point point = geometryFactory.createPoint(new Coordinate(1, 1)); - - Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; - LineString line = geometryFactory.createLineString(lineCoords); - - Coordinate[] polygonCoords = new Coordinate[]{ - new Coordinate(0, 0), new Coordinate(3, 0), - new Coordinate(1, 3), new Coordinate(0, 1), - new Coordinate(0, 0) - }; - LinearRing shell = geometryFactory.createLinearRing(polygonCoords); - Polygon polygon = geometryFactory.createPolygon(shell); - - GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; - BoundingBox bbox = geometryStatistics.getBoundingBox(); - GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); - // Generate WKB and update stats - byte[] pointWkb = wkbWriter.write(point); - stats.updateGeometry(new BytesWritable(pointWkb)); - - assertEquals(1.0, bbox.getXMin(), 0.0); - assertEquals(1.0, bbox.getXMax(), 0.0); - assertEquals(1.0, bbox.getYMin(), 0.0); - assertEquals(1.0, bbox.getYMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); - assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", - bbox.toString()); - - assertTrue(geospatialTypes.getTypes().contains(1)); - assertEquals(1, geospatialTypes.getTypes().size()); - assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString()); - - - assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}", - geometryStatistics.toString()); - - byte[] lineWkb = wkbWriter.write(line); - stats.updateGeometry(new BytesWritable(lineWkb)); - assertEquals(1.0, bbox.getXMin(), 0.0); - assertEquals(2.0, bbox.getXMax(), 0.0); - assertEquals(1.0, bbox.getYMin(), 0.0); - assertEquals(2.0, bbox.getYMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); - - assertTrue(geospatialTypes.getTypes().contains(1)); - assertTrue(geospatialTypes.getTypes().contains(2)); - assertEquals(2, geospatialTypes.getTypes().size()); - assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}", - geospatialTypes.toString()); - - byte[] polygonWkb = wkbWriter.write(polygon); - stats.updateGeometry(new BytesWritable(polygonWkb)); - stats.updateGeometry(new BytesWritable(lineWkb)); - assertEquals(0.0, bbox.getXMin(), 0.0); - assertEquals(3.0, bbox.getXMax(), 0.0); - assertEquals(0.0, bbox.getYMin(), 0.0); - assertEquals(3.0, bbox.getYMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); - - assertTrue(geospatialTypes.getTypes().contains(1)); - assertTrue(geospatialTypes.getTypes().contains(2)); - assertTrue(geospatialTypes.getTypes().contains(2)); - assertEquals(3, geospatialTypes.getTypes().size()); - assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}", - geospatialTypes.toString()); - - } - - @Test - public void testUpdateGeometryWithZCoordinates() { - TypeDescription desc = TypeDescription.createGeometry(); - ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc); - GeometryFactory geometryFactory = new GeometryFactory(); - WKBWriter wkbWriter = new WKBWriter(3); - - Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2)); - Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0)); - - stats.updateGeometry(new BytesWritable(wkbWriter.write(point1))); - stats.updateGeometry(new BytesWritable(wkbWriter.write(point2))); - - GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats; - BoundingBox bbox = geometryStatistics.getBoundingBox(); - assertEquals(0.0, bbox.getXMin(), 0.0); - assertEquals(2.0, bbox.getXMax(), 0.0); - assertEquals(1.0, bbox.getYMin(), 0.0); - assertEquals(1.0, bbox.getYMax(), 0.0); - assertEquals(0.0, bbox.getZMin(), 0.0); - assertEquals(2.0, bbox.getZMax(), 0.0); - assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0); - assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0); - assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}", - bbox.toString()); - assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}", - geometryStatistics.toString()); - - GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes(); - assertTrue(geospatialTypes.getTypes().contains(1001)); - assertEquals(1, geospatialTypes.getTypes().size()); - } } diff --git a/java/pom.xml b/java/pom.xml index 1166132cf7..f969ea95f1 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -67,7 +67,7 @@ 3.4.1 17 ${project.basedir}/../target/javadoc - 1.19.0 + 1.20.0 5.12.2 3.7.1 3.8.1 From 866bf641663312c146825dac1962d438204bc64c Mon Sep 17 00:00:00 2001 From: ffacs Date: Fri, 6 Jun 2025 01:14:50 +0800 Subject: [PATCH 08/16] Fix style --- java/core/src/java/org/apache/orc/TypeDescription.java | 3 ++- .../src/java/org/apache/orc/impl/ColumnStatisticsImpl.java | 6 +++++- java/core/src/test/org/apache/orc/TestColumnStatistics.java | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 0c7d4a07dc..6dbbe322c6 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -721,7 +721,8 @@ public TypeDescription(Category category) { private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; private String CRS = DEFAULT_CRS; - private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; + private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm + = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; static void printFieldName(StringBuilder buffer, String name) { if (UNQUOTED_NAMES.matcher(name).matches()) { diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index b9936234bc..193f8791e7 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -48,7 +48,11 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.TimeZone; public class ColumnStatisticsImpl implements ColumnStatistics { diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index 13ef5bf698..b9089409dd 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -793,9 +793,9 @@ public void testUpdateGeometryWithDifferentTypes() { Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)}; LineString line = geometryFactory.createLineString(lineCoords); Coordinate[] polygonCoords = new Coordinate[]{ - new Coordinate(0, 0), new Coordinate(3, 0), - new Coordinate(1, 3), new Coordinate(0, 1), - new Coordinate(0, 0) + new Coordinate(0, 0), new Coordinate(3, 0), + new Coordinate(1, 3), new Coordinate(0, 1), + new Coordinate(0, 0) }; LinearRing shell = geometryFactory.createLinearRing(polygonCoords); Polygon polygon = geometryFactory.createPolygon(shell); From f2c1372c8cfb6207be7636b2e4f5b47010b1e6d9 Mon Sep 17 00:00:00 2001 From: ffacs Date: Fri, 6 Jun 2025 10:00:00 +0800 Subject: [PATCH 09/16] Fix building --- java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 193f8791e7..364241d311 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -53,6 +53,7 @@ import java.util.ArrayList; import java.util.Set; import java.util.TimeZone; +import java.util.Collections; public class ColumnStatisticsImpl implements ColumnStatistics { From cedf9b6bcd1d3148f37437abc3d3cd229be91c56 Mon Sep 17 00:00:00 2001 From: ffacs Date: Sat, 7 Jun 2025 01:00:16 +0800 Subject: [PATCH 10/16] Yet another fix --- .../src/java/org/apache/orc/impl/ColumnStatisticsImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 364241d311..1ce236ea44 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -48,12 +48,12 @@ import java.time.chrono.ChronoLocalDate; import java.time.chrono.Chronology; import java.time.chrono.IsoChronology; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.ArrayList; import java.util.Set; import java.util.TimeZone; -import java.util.Collections; public class ColumnStatisticsImpl implements ColumnStatistics { From abb8d01b674e0d6aebdf05b895088c4eb515a934 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 9 Jun 2025 01:06:42 +0800 Subject: [PATCH 11/16] Refactor --- .../orc/GeospatialColumnStatistics.java | 7 +- .../src/java/org/apache/orc/OrcUtils.java | 77 +++++---------- .../java/org/apache/orc/TypeDescription.java | 19 ++-- .../apache/orc/geospatial/BoundingBox.java | 27 +++--- .../orc/geospatial/GeospatialTypes.java | 97 +++++++++---------- .../org/apache/orc/TestOrcGeospatial.java | 3 - .../orc/impl/TestColumnStatisticsImpl.java | 4 +- 7 files changed, 104 insertions(+), 130 deletions(-) diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java index 538eba2b0e..db66084c13 100644 --- a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java +++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - *

- * http://www.apache.org/licenses/LICENSE-2.0 - *

+ * + * http://www.apache.org/licenses/LICENSE-2.0 + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,7 +22,6 @@ import org.apache.orc.geospatial.GeospatialTypes; public interface GeospatialColumnStatistics extends ColumnStatistics { - BoundingBox getBoundingBox(); GeospatialTypes getGeospatialTypes(); } diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java index 3462f23a9b..ded04b8abc 100644 --- a/java/core/src/java/org/apache/orc/OrcUtils.java +++ b/java/core/src/java/org/apache/orc/OrcUtils.java @@ -17,6 +17,7 @@ */ package org.apache.orc; +import org.apache.orc.TypeDescription.EdgeInterpolationAlgorithm; import org.apache.orc.impl.ParserUtils; import org.apache.orc.impl.ReaderImpl; import org.apache.orc.impl.SchemaEvolution; @@ -172,33 +173,21 @@ private static void appendOrcTypes(List result, TypeDescription t type.setScale(typeDescr.getScale()); break; case Geography: + type.setKind(OrcProto.Type.Kind.GEOGRAPHY); + type.setAlgorithm(switch (typeDescr.getEdgeInterpolationAlgorithm()) { + case SPHERICAL -> OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + typeDescr.getEdgeInterpolationAlgorithm()); + }); + type.setCrs(typeDescr.getCrs()); + break; case Geometry: - if (typeDescr.getCategory() == TypeDescription.Category.Geography) { - type.setKind(OrcProto.Type.Kind.GEOGRAPHY); - switch (typeDescr.getEdgeInterpolationAlgorithm()) { - case SPHERICAL: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL); - break; - case VINCENTY: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY); - break; - case THOMAS: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS); - break; - case ANDOYER: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER); - break; - case KARNEY: - type.setAlgorithm(OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY); - break; - default: - throw new IllegalArgumentException("Unknown interpolation algorithm: " + - typeDescr.getEdgeInterpolationAlgorithm()); - } - } else { - type.setKind(OrcProto.Type.Kind.GEOMETRY); - } - type.setCrs(typeDescr.getCRS()); + type.setKind(OrcProto.Type.Kind.GEOMETRY); + type.setCrs(typeDescr.getCrs()); break; case LIST: type.setKind(OrcProto.Type.Kind.LIST); @@ -365,31 +354,17 @@ TypeDescription convertTypeFromProtobuf(List types, if (type.hasCrs()) { result.withCRS(type.getCrs()); } - switch (type.getAlgorithm()) { - case SPHERICAL: - result.withEdgeInterpolationAlgorithm( - TypeDescription.EdgeInterpolationAlgorithm.SPHERICAL); - break; - case VINCENTY: - result.withEdgeInterpolationAlgorithm( - TypeDescription.EdgeInterpolationAlgorithm.VINCENTY); - break; - case THOMAS: - result.withEdgeInterpolationAlgorithm( - TypeDescription.EdgeInterpolationAlgorithm.THOMAS); - break; - case ANDOYER: - result.withEdgeInterpolationAlgorithm( - TypeDescription.EdgeInterpolationAlgorithm.ANDOYER); - break; - case KARNEY: - result.withEdgeInterpolationAlgorithm( - TypeDescription.EdgeInterpolationAlgorithm.KARNEY); - break; - default: - throw new IllegalArgumentException("Unknown interpolation algorithm: " + - type.getAlgorithm()); - } + result.withEdgeInterpolationAlgorithm( + switch (type.getAlgorithm()) { + case SPHERICAL -> EdgeInterpolationAlgorithm.SPHERICAL; + case VINCENTY -> EdgeInterpolationAlgorithm.VINCENTY; + case THOMAS -> EdgeInterpolationAlgorithm.THOMAS; + case ANDOYER -> EdgeInterpolationAlgorithm.ANDOYER; + case KARNEY -> EdgeInterpolationAlgorithm.KARNEY; + default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " + + type.getAlgorithm()); + } + ); break; case LIST: if (type.getSubtypesCount() != 1) { diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index 6dbbe322c6..c4ab6ebbdf 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -63,6 +63,7 @@ public enum EdgeInterpolationAlgorithm { } final String name; } + private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM = EdgeInterpolationAlgorithm.SPHERICAL; @@ -271,7 +272,7 @@ public TypeDescription withCRS(String crs) { throw new IllegalArgumentException("crs is only allowed on Geometry/Geography" + " and not " + category.name); } - this.CRS = crs; + this.crs = crs; return this; } @@ -412,7 +413,7 @@ public TypeDescription clone() { result.maxLength = maxLength; result.precision = precision; result.scale = scale; - result.CRS = CRS; + result.crs = crs; result.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm; if (fieldNames != null) { result.fieldNames.addAll(fieldNames); @@ -605,8 +606,8 @@ public int getScale() { return scale; } - public String getCRS() { - return CRS; + public String getCrs() { + return crs; } public EdgeInterpolationAlgorithm getEdgeInterpolationAlgorithm() { @@ -720,7 +721,7 @@ public TypeDescription(Category category) { private int maxLength = DEFAULT_LENGTH; private int precision = DEFAULT_PRECISION; private int scale = DEFAULT_SCALE; - private String CRS = DEFAULT_CRS; + private String crs = DEFAULT_CRS; private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm = DEFAULT_EDGE_INTERPOLATION_ALGORITHM; @@ -752,12 +753,12 @@ public void printToBuffer(StringBuilder buffer) { break; case Geometry: buffer.append('('); - buffer.append(CRS); + buffer.append(crs); buffer.append(')'); break; case Geography: buffer.append('('); - buffer.append(CRS); + buffer.append(crs); buffer.append(','); buffer.append(edgeInterpolationAlgorithm.name()); buffer.append(')'); @@ -824,11 +825,11 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer, break; case Geometry: buffer.append(", \"crs\": "); - buffer.append(CRS); + buffer.append(crs); break; case Geography: buffer.append(", \"crs\": "); - buffer.append(CRS); + buffer.append(crs); buffer.append(", \"edge_interpolation_algorithm\": "); buffer.append(edgeInterpolationAlgorithm.name()); break; diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java index 61c280636b..2d24783646 100644 --- a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -7,14 +7,13 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.orc.geospatial; @@ -23,6 +22,11 @@ import org.locationtech.jts.geom.Envelope; import org.locationtech.jts.geom.Geometry; +/** + * Bounding box for Geometry or Geography type in the representation of min/max + * value pairs of coordinates from each axis. + * A bounding box is considered valid if none of the X / Y dimensions contain NaN. + */ public class BoundingBox { private double xMin = Double.POSITIVE_INFINITY; @@ -63,8 +67,7 @@ public boolean equals(Object obj) { return true; } return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax && - zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax && - valid == other.valid; + zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax; } @Override @@ -72,8 +75,7 @@ public int hashCode() { return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ Double.hashCode(yMin) ^ Double.hashCode(yMax) ^ Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ - Double.hashCode(mMin) ^ Double.hashCode(mMax) ^ - Boolean.hashCode(valid); + Double.hashCode(mMin) ^ Double.hashCode(mMax); } private void resetBBox() { @@ -85,6 +87,9 @@ private void resetBBox() { zMax = Double.NEGATIVE_INFINITY; mMin = Double.POSITIVE_INFINITY; mMax = Double.NEGATIVE_INFINITY; + + // Update the validity + valid = isXYValid(); } public double getXMin() { diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java index b15c379edb..7172050fdf 100644 --- a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -7,14 +7,13 @@ * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package org.apache.orc.geospatial; @@ -26,6 +25,18 @@ import java.util.Set; import java.util.stream.Collectors; +/** + * A list of geospatial types from all instances in the Geometry or Geography column, + * or an empty list if they are not known. + * + * The GeospatialTypes instance becomes invalid in the following cases: + * - When an unknown or unsupported geometry type is encountered during update + * - When merging with another invalid GeospatialTypes instance + * - When explicitly aborted using abort() + * + * When invalid, the types list is cleared and remains empty. All subsequent + * updates and merges are ignored until reset() is called. + */ public class GeospatialTypes { private static final int UNKNOWN_TYPE_ID = -1; @@ -64,6 +75,12 @@ public Set getTypes() { return types; } + /** + * Updates the types list with the given geometry's type. + * If the geometry type is unknown, the instance becomes invalid. + * + * @param geometry the geometry to process + */ public void update(Geometry geometry) { if (!valid) { return; @@ -142,24 +159,16 @@ public String toString() { } private int getGeometryTypeId(Geometry geometry) { - switch (geometry.getGeometryType()) { - case Geometry.TYPENAME_POINT: - return 1; - case Geometry.TYPENAME_LINESTRING: - return 2; - case Geometry.TYPENAME_POLYGON: - return 3; - case Geometry.TYPENAME_MULTIPOINT: - return 4; - case Geometry.TYPENAME_MULTILINESTRING: - return 5; - case Geometry.TYPENAME_MULTIPOLYGON: - return 6; - case Geometry.TYPENAME_GEOMETRYCOLLECTION: - return 7; - default: - return UNKNOWN_TYPE_ID; - } + return switch (geometry.getGeometryType()) { + case Geometry.TYPENAME_POINT -> 1; + case Geometry.TYPENAME_LINESTRING -> 2; + case Geometry.TYPENAME_POLYGON -> 3; + case Geometry.TYPENAME_MULTIPOINT -> 4; + case Geometry.TYPENAME_MULTILINESTRING -> 5; + case Geometry.TYPENAME_MULTIPOLYGON -> 6; + case Geometry.TYPENAME_GEOMETRYCOLLECTION -> 7; + default -> UNKNOWN_TYPE_ID; + }; } /** @@ -201,31 +210,19 @@ private int getGeometryTypeCode(Geometry geometry) { private String typeIdToString(int typeId) { String typeString; - switch (typeId % 1000) { - case 1: - typeString = Geometry.TYPENAME_POINT; - break; - case 2: - typeString = Geometry.TYPENAME_LINESTRING; - break; - case 3: - typeString = Geometry.TYPENAME_POLYGON; - break; - case 4: - typeString = Geometry.TYPENAME_MULTIPOINT; - break; - case 5: - typeString = Geometry.TYPENAME_MULTILINESTRING; - break; - case 6: - typeString = Geometry.TYPENAME_MULTIPOLYGON; - break; - case 7: - typeString = Geometry.TYPENAME_GEOMETRYCOLLECTION; - break; - default: - return "Unknown"; - } + + typeString = switch (typeId % 1000) { + case 1 -> Geometry.TYPENAME_POINT; + case 2 -> Geometry.TYPENAME_LINESTRING; + case 3 -> Geometry.TYPENAME_POLYGON; + case 4 -> Geometry.TYPENAME_MULTIPOINT; + case 5 -> Geometry.TYPENAME_MULTILINESTRING; + case 6 -> Geometry.TYPENAME_MULTIPOLYGON; + case 7 -> Geometry.TYPENAME_GEOMETRYCOLLECTION; + default -> { + yield "Unknown"; + } + }; if (typeId >= 3000) { typeString += " (XYZM)"; } else if (typeId >= 2000) { diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 05b2118380..6f9a4e948c 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -38,9 +38,6 @@ import static org.junit.jupiter.api.Assertions.*; -/** - * - */ public class TestOrcGeospatial { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index cc692d3ac5..7a00f20144 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -31,9 +31,9 @@ import org.apache.orc.*; import org.apache.orc.geospatial.BoundingBox; import org.apache.orc.geospatial.GeospatialTypes; +import org.apache.orc.TimestampColumnStatistics; +import org.apache.orc.TypeDescription; import org.junit.jupiter.api.Test; -import org.locationtech.jts.geom.*; -import org.locationtech.jts.io.WKBWriter; import java.io.IOException; import java.util.TimeZone; From 5bac508b475f2d6322b252ad0139e7cbc925e632 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 9 Jun 2025 01:08:48 +0800 Subject: [PATCH 12/16] fix indentation --- java/core/src/java/org/apache/orc/TypeDescription.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index c4ab6ebbdf..c5ef48b047 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -65,7 +65,7 @@ public enum EdgeInterpolationAlgorithm { } private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM - = EdgeInterpolationAlgorithm.SPHERICAL; + = EdgeInterpolationAlgorithm.SPHERICAL; @Override public int compareTo(TypeDescription other) { From 0f85db07ecda30ee582b7353d2bf7d0786487e0d Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 9 Jun 2025 01:13:21 +0800 Subject: [PATCH 13/16] revert changes in TestColumnStatisticsImpl.java --- .../org/apache/orc/impl/TestColumnStatisticsImpl.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index 7a00f20144..a816618e6b 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -86,7 +86,7 @@ public void testOldTimestamps() throws IOException { Path file = new Path(exampleDir, "TestOrcFile.testTimestamp.orc"); Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(conf)); TimestampColumnStatistics stats = - (TimestampColumnStatistics) reader.getStatistics()[0]; + (TimestampColumnStatistics) reader.getStatistics()[0]; assertEquals("1995-01-01 00:00:00.688", stats.getMinimum().toString()); // ORC-611: add TS stats nanosecond support for older files by using (max TS + 0.999 ms) assertEquals("2037-01-01 00:00:00.000999999", stats.getMaximum().toString()); @@ -140,9 +140,9 @@ public void testTimestamps() { public void testDecimal64Overflow() { TypeDescription schema = TypeDescription.fromString("decimal(18,6)"); OrcProto.ColumnStatistics.Builder pb = - OrcProto.ColumnStatistics.newBuilder(); + OrcProto.ColumnStatistics.newBuilder(); OrcProto.DecimalStatistics.Builder decimalBuilder = - OrcProto.DecimalStatistics.newBuilder(); + OrcProto.DecimalStatistics.newBuilder(); decimalBuilder.setMaximum("1000.0"); decimalBuilder.setMinimum("1.010"); decimalBuilder.setSum("123456789.123456"); @@ -152,7 +152,7 @@ public void testDecimal64Overflow() { // the base case doesn't overflow DecimalColumnStatistics stats1 = (DecimalColumnStatistics) - ColumnStatisticsImpl.deserialize(schema, pb.build()); + ColumnStatisticsImpl.deserialize(schema, pb.build()); ColumnStatisticsImpl updateStats1 = (ColumnStatisticsImpl) stats1; assertEquals("1.01", stats1.getMinimum().toString()); assertEquals("1000", stats1.getMaximum().toString()); @@ -163,7 +163,7 @@ public void testDecimal64Overflow() { decimalBuilder.setSum("1234567890123.45"); pb.setDecimalStatistics(decimalBuilder); DecimalColumnStatistics stats2 = (DecimalColumnStatistics) - ColumnStatisticsImpl.deserialize(schema, pb.build()); + ColumnStatisticsImpl.deserialize(schema, pb.build()); assertNull(stats2.getSum()); // merge them together From 2219a5838454846ecb0328f4e43f8995d594a31c Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 9 Jun 2025 23:26:02 +0800 Subject: [PATCH 14/16] Refactor serdes --- .../apache/orc/geospatial/BoundingBox.java | 20 +++----- .../orc/geospatial/GeospatialTypes.java | 5 -- .../apache/orc/impl/ColumnStatisticsImpl.java | 50 ++++++++++--------- .../org/apache/orc/TestColumnStatistics.java | 6 +++ .../org/apache/orc/TestOrcGeospatial.java | 12 +++-- 5 files changed, 48 insertions(+), 45 deletions(-) diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java index 2d24783646..fb6cfe96e0 100644 --- a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -66,8 +66,11 @@ public boolean equals(Object obj) { if (obj == this) { return true; } + + // Valid flag must be checked since invalid bounding boxes may have equal coordinates with the initial one return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax && - zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax; + zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax && + valid == other.valid; } @Override @@ -75,9 +78,11 @@ public int hashCode() { return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^ Double.hashCode(yMin) ^ Double.hashCode(yMax) ^ Double.hashCode(zMin) ^ Double.hashCode(zMax) ^ - Double.hashCode(mMin) ^ Double.hashCode(mMax); + Double.hashCode(mMin) ^ Double.hashCode(mMax) ^ + Boolean.hashCode(valid); } + // Don't change `valid` here and let the caller maintain it private void resetBBox() { xMin = Double.POSITIVE_INFINITY; xMax = Double.NEGATIVE_INFINITY; @@ -87,9 +92,6 @@ private void resetBBox() { zMax = Double.NEGATIVE_INFINITY; mMin = Double.POSITIVE_INFINITY; mMax = Double.NEGATIVE_INFINITY; - - // Update the validity - valid = isXYValid(); } public double getXMin() { @@ -315,14 +317,6 @@ private void updateBounds(double minX, double maxX, double minY, double maxY) { } } - /** - * Aborts the bounding box by resetting it to its initial state. - */ - public void abort() { - valid = false; - resetBBox(); - } - /** * Resets the bounding box to its initial state. */ diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java index 7172050fdf..c1067adad4 100644 --- a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java +++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java @@ -117,11 +117,6 @@ public void reset() { valid = true; } - public void abort() { - valid = false; - types.clear(); - } - public boolean isValid() { return valid; } diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java index 1ce236ea44..46b87bfdef 100644 --- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java +++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java @@ -1871,7 +1871,7 @@ public Timestamp getMaximum() { private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl implements GeospatialColumnStatistics { - private BoundingBox boundingBox; + private final BoundingBox boundingBox; private final GeospatialTypes geospatialTypes; private final WKBReader reader = new WKBReader(); @@ -1882,24 +1882,29 @@ private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl GeospatialStatisticsImpl(OrcProto.ColumnStatistics stats) { super(stats); - this.boundingBox = new BoundingBox(); + BoundingBox boundingBoxOut = null; + GeospatialTypes geospatialTypesOut = null; OrcProto.GeospatialStatistics geoStatistics = stats.getGeospatialStatistics(); if (geoStatistics.hasBbox()) { OrcProto.BoundingBox bbox = geoStatistics.getBbox(); - this.boundingBox = new BoundingBox( - bbox.hasXmin() ? bbox.getXmin() : Double.POSITIVE_INFINITY, - bbox.hasXmax() ? bbox.getXmax() : Double.NEGATIVE_INFINITY, - bbox.hasYmin() ? bbox.getYmin() : Double.POSITIVE_INFINITY, - bbox.hasYmax() ? bbox.getYmax() : Double.NEGATIVE_INFINITY, - bbox.hasZmin() ? bbox.getZmin() : Double.POSITIVE_INFINITY, - bbox.hasZmax() ? bbox.getZmax() : Double.NEGATIVE_INFINITY, - bbox.hasMmin() ? bbox.getMmin() : Double.POSITIVE_INFINITY, - bbox.hasMmax() ? bbox.getMmax() : Double.NEGATIVE_INFINITY); + boundingBoxOut = new BoundingBox( + bbox.hasXmin() ? bbox.getXmin() : Double.NaN, + bbox.hasXmax() ? bbox.getXmax() : Double.NaN, + bbox.hasYmin() ? bbox.getYmin() : Double.NaN, + bbox.hasYmax() ? bbox.getYmax() : Double.NaN, + bbox.hasZmin() ? bbox.getZmin() : Double.NaN, + bbox.hasZmax() ? bbox.getZmax() : Double.NaN, + bbox.hasMmin() ? bbox.getMmin() : Double.NaN, + bbox.hasMmax() ? bbox.getMmax() : Double.NaN); } - Set types = new HashSet<>(geoStatistics.getGeospatialTypesList()); - this.geospatialTypes = new GeospatialTypes(types); + if (!geoStatistics.getGeospatialTypesList().isEmpty()) { + Set types = new HashSet<>(geoStatistics.getGeospatialTypesList()); + geospatialTypesOut = new GeospatialTypes(types); + } + this.boundingBox = boundingBoxOut; + this.geospatialTypes = geospatialTypesOut; } @Override @@ -1951,28 +1956,27 @@ public OrcProto.ColumnStatistics.Builder serialize() { OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder(); OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder(); - boolean hasStats = false; if (boundingBox.isValid() && !boundingBox.isXYEmpty()) { bboxBuilder.setXmin(boundingBox.getXMin()); bboxBuilder.setXmax(boundingBox.getXMax()); bboxBuilder.setYmin(boundingBox.getYMin()); bboxBuilder.setYmax(boundingBox.getYMax()); - bboxBuilder.setZmin(boundingBox.getZMin()); - bboxBuilder.setZmax(boundingBox.getZMax()); - bboxBuilder.setMmin(boundingBox.getMMin()); - bboxBuilder.setMmax(boundingBox.getMMax()); - hasStats = true; + if (boundingBox.isZValid() && !boundingBox.isZEmpty()) { + bboxBuilder.setZmin(boundingBox.getZMin()); + bboxBuilder.setZmax(boundingBox.getZMax()); + } + if (boundingBox.isMValid() && !boundingBox.isMEmpty()) { + bboxBuilder.setMmin(boundingBox.getMMin()); + bboxBuilder.setMmax(boundingBox.getMMax()); + } geoStats.setBbox(bboxBuilder); } if (geospatialTypes.isValid() && !geospatialTypes.getTypes().isEmpty()) { List sortedTypes = new ArrayList<>(geospatialTypes.getTypes()); Collections.sort(sortedTypes); geoStats.addAllGeospatialTypes(sortedTypes); - hasStats = true; - } - if (hasStats) { - builder.setGeospatialStatistics(geoStats); } + builder.setGeospatialStatistics(geoStats); return builder; } diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java index b9089409dd..dea3359d92 100644 --- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java +++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java @@ -917,6 +917,12 @@ public void TestGeospatialMerge() { stats0.merge(stats1); BoundingBox bbox = geometryStatistics.getBoundingBox(); + assertTrue(bbox.isXYValid()); + assertFalse(bbox.isXYEmpty()); + assertTrue(bbox.isZValid()); + assertTrue(bbox.isMValid()); + assertTrue(bbox.isZEmpty()); + assertTrue(bbox.isMEmpty()); assertEquals(1.0, bbox.getXMin(), 0.0); assertEquals(2.0, bbox.getXMax(), 0.0); assertEquals(1.0, bbox.getYMin(), 0.0); diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 6f9a4e948c..74d68229c4 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -97,7 +97,10 @@ public void testGeometryWriterWithNulls() throws Exception { assertEquals(50, stats[0].getNumberOfValues()); assertTrue(stats[0].hasNull()); assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); - assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString()); + assertTrue(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isXYValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isZValid()); + assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isMValid()); + assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=NaN, zMax=NaN, mMin=NaN, mMax=NaN}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString()); assertEquals("GeospatialTypes{types=[Point (XY)]}", ((GeospatialColumnStatistics) stats[0]).getGeospatialTypes().toString()); // Verify data @@ -148,13 +151,14 @@ public void testGeographyWriterWithNulls() throws Exception { assertEquals("geography(OGC:CRS84,SPHERICAL)", reader.getSchema().toString()); assertEquals(100, reader.getNumberOfRows()); - // Verify statistics, make sure there is no geospatial stats + // Verify statistics, make sure there are no bounding box and geospatial types ColumnStatistics[] stats = reader.getStatistics(); assertEquals(1, stats.length); assertEquals(50, stats[0].getNumberOfValues()); assertTrue(stats[0].hasNull()); - assertInstanceOf(ColumnStatisticsImpl.class, stats[0]); - assertFalse(stats[0] instanceof GeospatialColumnStatistics); + assertInstanceOf(GeospatialColumnStatistics.class, stats[0]); + assertNull(((GeospatialColumnStatistics) stats[0]).getBoundingBox()); + assertNull(((GeospatialColumnStatistics) stats[0]).getGeospatialTypes()); // Verify Data RecordReader rows = reader.rows(); From 79ac6ed9b7aad6976d4a3b4b5404aca25d36c226 Mon Sep 17 00:00:00 2001 From: ffacs Date: Tue, 10 Jun 2025 23:26:54 +0800 Subject: [PATCH 15/16] inline updateBounds --- .../apache/orc/geospatial/BoundingBox.java | 33 ++++++++----------- .../orc/impl/TestColumnStatisticsImpl.java | 6 ---- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java index fb6cfe96e0..093e2c96c8 100644 --- a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java +++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java @@ -280,8 +280,20 @@ public void update(Geometry geometry) { return; } + // Updates the X and Y bounds of this bounding box with the given coordinates. + // Updates are conditional: + // - X bounds are only updated if both minX and maxX are not NaN + // - Y bounds are only updated if both minY and maxY are not NaN + // This allows partial updates while preserving valid dimensions. Envelope envelope = geometry.getEnvelopeInternal(); - updateBounds(envelope.getMinX(), envelope.getMaxX(), envelope.getMinY(), envelope.getMaxY()); + if (!Double.isNaN(envelope.getMinX()) && !Double.isNaN(envelope.getMaxX())) { + xMin = Math.min(xMin, envelope.getMinX()); + xMax = Math.max(xMax, envelope.getMaxX()); + } + if (!Double.isNaN(envelope.getMinY()) && !Double.isNaN(envelope.getMaxY())) { + yMin = Math.min(yMin, envelope.getMinY()); + yMax = Math.max(yMax, envelope.getMaxY()); + } for (Coordinate coord : geometry.getCoordinates()) { if (!Double.isNaN(coord.getZ())) { @@ -298,25 +310,6 @@ public void update(Geometry geometry) { valid = isXYValid(); } - /** - * Updates the X and Y bounds of this bounding box with the given coordinates. - * Updates are conditional: - * - X bounds are only updated if both minX and maxX are not NaN - * - Y bounds are only updated if both minY and maxY are not NaN - * This allows partial updates while preserving valid dimensions. - */ - private void updateBounds(double minX, double maxX, double minY, double maxY) { - if (!Double.isNaN(minX) && !Double.isNaN(maxX)) { - xMin = Math.min(xMin, minX); - xMax = Math.max(xMax, maxX); - } - - if (!Double.isNaN(minY) && !Double.isNaN(maxY)) { - yMin = Math.min(yMin, minY); - yMax = Math.max(yMax, maxY); - } - } - /** * Resets the bounding box to its initial state. */ diff --git a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java index a816618e6b..f16d042fdb 100644 --- a/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java +++ b/java/core/src/test/org/apache/orc/impl/TestColumnStatisticsImpl.java @@ -27,12 +27,6 @@ import org.apache.orc.TestConf; import org.apache.orc.TimestampColumnStatistics; import org.apache.orc.TypeDescription; -import org.apache.hadoop.io.BytesWritable; -import org.apache.orc.*; -import org.apache.orc.geospatial.BoundingBox; -import org.apache.orc.geospatial.GeospatialTypes; -import org.apache.orc.TimestampColumnStatistics; -import org.apache.orc.TypeDescription; import org.junit.jupiter.api.Test; import java.io.IOException; From 95c59a5c0624d616386f1f7b859210cb330dec61 Mon Sep 17 00:00:00 2001 From: ffacs Date: Wed, 11 Jun 2025 11:52:46 +0800 Subject: [PATCH 16/16] support Java 25-ea --- java/core/src/test/org/apache/orc/TestOrcGeospatial.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java index 74d68229c4..f0e148fb0d 100644 --- a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java +++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java @@ -38,19 +38,14 @@ import static org.junit.jupiter.api.Assertions.*; -public class TestOrcGeospatial { +public class TestOrcGeospatial implements TestConf { Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); - Configuration conf; FileSystem fs; Path testFilePath; - public TestOrcGeospatial() { - } - @BeforeEach public void openFileSystem(TestInfo testInfo) throws Exception { - conf = new Configuration(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestOrcGeospatial." + testInfo.getTestMethod().get().getName() + ".orc");