diff --git a/java/core/pom.xml b/java/core/pom.xml
index cdfbd5db9a..487a524398 100644
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -80,6 +80,11 @@
com.aayushatharva.brotli4j
brotli4j
+
+ org.locationtech.jts
+ jts-core
+ ${jts.version}
+
diff --git a/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java
new file mode 100644
index 0000000000..db66084c13
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/GeospatialColumnStatistics.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.orc.geospatial.BoundingBox;
+import org.apache.orc.geospatial.GeospatialTypes;
+
+public interface GeospatialColumnStatistics extends ColumnStatistics {
+ BoundingBox getBoundingBox();
+ GeospatialTypes getGeospatialTypes();
+}
diff --git a/java/core/src/java/org/apache/orc/OrcUtils.java b/java/core/src/java/org/apache/orc/OrcUtils.java
index 7dde0bc0fd..ded04b8abc 100644
--- a/java/core/src/java/org/apache/orc/OrcUtils.java
+++ b/java/core/src/java/org/apache/orc/OrcUtils.java
@@ -17,6 +17,7 @@
*/
package org.apache.orc;
+import org.apache.orc.TypeDescription.EdgeInterpolationAlgorithm;
import org.apache.orc.impl.ParserUtils;
import org.apache.orc.impl.ReaderImpl;
import org.apache.orc.impl.SchemaEvolution;
@@ -171,6 +172,23 @@ private static void appendOrcTypes(List result, TypeDescription t
type.setPrecision(typeDescr.getPrecision());
type.setScale(typeDescr.getScale());
break;
+ case Geography:
+ type.setKind(OrcProto.Type.Kind.GEOGRAPHY);
+ type.setAlgorithm(switch (typeDescr.getEdgeInterpolationAlgorithm()) {
+ case SPHERICAL -> OrcProto.Type.EdgeInterpolationAlgorithm.SPHERICAL;
+ case VINCENTY -> OrcProto.Type.EdgeInterpolationAlgorithm.VINCENTY;
+ case THOMAS -> OrcProto.Type.EdgeInterpolationAlgorithm.THOMAS;
+ case ANDOYER -> OrcProto.Type.EdgeInterpolationAlgorithm.ANDOYER;
+ case KARNEY -> OrcProto.Type.EdgeInterpolationAlgorithm.KARNEY;
+ default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " +
+ typeDescr.getEdgeInterpolationAlgorithm());
+ });
+ type.setCrs(typeDescr.getCrs());
+ break;
+ case Geometry:
+ type.setKind(OrcProto.Type.Kind.GEOMETRY);
+ type.setCrs(typeDescr.getCrs());
+ break;
case LIST:
type.setKind(OrcProto.Type.Kind.LIST);
type.addSubtypes(children.get(0).getId());
@@ -325,6 +343,29 @@ TypeDescription convertTypeFromProtobuf(List types,
result.withPrecision(type.getPrecision());
}
break;
+ case GEOMETRY:
+ result = TypeDescription.createGeometry();
+ if (type.hasCrs()) {
+ result.withCRS(type.getCrs());
+ }
+ break;
+ case GEOGRAPHY:
+ result = TypeDescription.createGeography();
+ if (type.hasCrs()) {
+ result.withCRS(type.getCrs());
+ }
+ result.withEdgeInterpolationAlgorithm(
+ switch (type.getAlgorithm()) {
+ case SPHERICAL -> EdgeInterpolationAlgorithm.SPHERICAL;
+ case VINCENTY -> EdgeInterpolationAlgorithm.VINCENTY;
+ case THOMAS -> EdgeInterpolationAlgorithm.THOMAS;
+ case ANDOYER -> EdgeInterpolationAlgorithm.ANDOYER;
+ case KARNEY -> EdgeInterpolationAlgorithm.KARNEY;
+ default -> throw new IllegalArgumentException("Unknown interpolation algorithm: " +
+ type.getAlgorithm());
+ }
+ );
+ break;
case LIST:
if (type.getSubtypesCount() != 1) {
throw new FileFormatException("LIST type should contain exactly " +
diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java
index 8ea9fca1b2..c5ef48b047 100644
--- a/java/core/src/java/org/apache/orc/TypeDescription.java
+++ b/java/core/src/java/org/apache/orc/TypeDescription.java
@@ -44,12 +44,29 @@ public class TypeDescription
public static final long MAX_DECIMAL64 = 999_999_999_999_999_999L;
public static final long MIN_DECIMAL64 = -MAX_DECIMAL64;
private static final int DEFAULT_LENGTH = 256;
+ private static final String DEFAULT_CRS = "OGC:CRS84";
static final Pattern UNQUOTED_NAMES = Pattern.compile("^[a-zA-Z0-9_]+$");
// type attributes
public static final String ENCRYPT_ATTRIBUTE = "encrypt";
public static final String MASK_ATTRIBUTE = "mask";
+ public enum EdgeInterpolationAlgorithm {
+ SPHERICAL("spherical"),
+ VINCENTY("vincenty"),
+ THOMAS("thomas"),
+ ANDOYER("andoyer"),
+ KARNEY("karney");
+
+ EdgeInterpolationAlgorithm(String name) {
+ this.name = name;
+ }
+ final String name;
+ }
+
+ private static final EdgeInterpolationAlgorithm DEFAULT_EDGE_INTERPOLATION_ALGORITHM
+ = EdgeInterpolationAlgorithm.SPHERICAL;
+
@Override
public int compareTo(TypeDescription other) {
if (this == other) {
@@ -116,7 +133,9 @@ public enum Category {
MAP("map", false),
STRUCT("struct", false),
UNION("uniontype", false),
- TIMESTAMP_INSTANT("timestamp with local time zone", true);
+ TIMESTAMP_INSTANT("timestamp with local time zone", true),
+ Geometry("geometry", true),
+ Geography("geography", true);
Category(String name, boolean isPrimitive) {
this.name = name;
@@ -187,6 +206,14 @@ public static TypeDescription createDecimal() {
return new TypeDescription(Category.DECIMAL);
}
+ public static TypeDescription createGeometry() {
+ return new TypeDescription(Category.Geometry);
+ }
+
+ public static TypeDescription createGeography() {
+ return new TypeDescription(Category.Geography);
+ }
+
/**
* Parse TypeDescription from the Hive type names. This is the inverse
* of TypeDescription.toString()
@@ -239,6 +266,26 @@ public TypeDescription withScale(int scale) {
return this;
}
+ public TypeDescription withCRS(String crs) {
+ if (category != Category.Geometry &&
+ category != Category.Geography) {
+ throw new IllegalArgumentException("crs is only allowed on Geometry/Geography" +
+ " and not " + category.name);
+ }
+ this.crs = crs;
+ return this;
+ }
+
+ public TypeDescription withEdgeInterpolationAlgorithm(
+ EdgeInterpolationAlgorithm edgeInterpolationAlgorithm) {
+ if (category != Category.Geography) {
+ throw new IllegalArgumentException("edgeInterpolationAlgorithm is only allowed on Geography" +
+ " and not " + category.name);
+ }
+ this.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm;
+ return this;
+ }
+
/**
* Set an attribute on this type.
* @param key the attribute name
@@ -366,6 +413,8 @@ public TypeDescription clone() {
result.maxLength = maxLength;
result.precision = precision;
result.scale = scale;
+ result.crs = crs;
+ result.edgeInterpolationAlgorithm = edgeInterpolationAlgorithm;
if (fieldNames != null) {
result.fieldNames.addAll(fieldNames);
}
@@ -557,6 +606,14 @@ public int getScale() {
return scale;
}
+ public String getCrs() {
+ return crs;
+ }
+
+ public EdgeInterpolationAlgorithm getEdgeInterpolationAlgorithm() {
+ return edgeInterpolationAlgorithm;
+ }
+
/**
* For struct types, get the list of field names.
* @return the list of field names.
@@ -664,6 +721,9 @@ public TypeDescription(Category category) {
private int maxLength = DEFAULT_LENGTH;
private int precision = DEFAULT_PRECISION;
private int scale = DEFAULT_SCALE;
+ private String crs = DEFAULT_CRS;
+ private EdgeInterpolationAlgorithm edgeInterpolationAlgorithm
+ = DEFAULT_EDGE_INTERPOLATION_ALGORITHM;
static void printFieldName(StringBuilder buffer, String name) {
if (UNQUOTED_NAMES.matcher(name).matches()) {
@@ -691,6 +751,18 @@ public void printToBuffer(StringBuilder buffer) {
buffer.append(maxLength);
buffer.append(')');
break;
+ case Geometry:
+ buffer.append('(');
+ buffer.append(crs);
+ buffer.append(')');
+ break;
+ case Geography:
+ buffer.append('(');
+ buffer.append(crs);
+ buffer.append(',');
+ buffer.append(edgeInterpolationAlgorithm.name());
+ buffer.append(')');
+ break;
case LIST:
case MAP:
case UNION:
@@ -751,6 +823,16 @@ private void printJsonToBuffer(String prefix, StringBuilder buffer,
buffer.append(", \"length\": ");
buffer.append(maxLength);
break;
+ case Geometry:
+ buffer.append(", \"crs\": ");
+ buffer.append(crs);
+ break;
+ case Geography:
+ buffer.append(", \"crs\": ");
+ buffer.append(crs);
+ buffer.append(", \"edge_interpolation_algorithm\": ");
+ buffer.append(edgeInterpolationAlgorithm.name());
+ break;
case LIST:
case MAP:
case UNION:
diff --git a/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java
new file mode 100644
index 0000000000..093e2c96c8
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/geospatial/BoundingBox.java
@@ -0,0 +1,361 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.geospatial;
+
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.geom.Geometry;
+
+/**
+ * Bounding box for Geometry or Geography type in the representation of min/max
+ * value pairs of coordinates from each axis.
+ * A bounding box is considered valid if none of the X / Y dimensions contain NaN.
+ */
+public class BoundingBox {
+
+ private double xMin = Double.POSITIVE_INFINITY;
+ private double xMax = Double.NEGATIVE_INFINITY;
+ private double yMin = Double.POSITIVE_INFINITY;
+ private double yMax = Double.NEGATIVE_INFINITY;
+ private double zMin = Double.POSITIVE_INFINITY;
+ private double zMax = Double.NEGATIVE_INFINITY;
+ private double mMin = Double.POSITIVE_INFINITY;
+ private double mMax = Double.NEGATIVE_INFINITY;
+ private boolean valid = true;
+
+ public BoundingBox() {
+ }
+
+ public BoundingBox(
+ double xMin, double xMax, double yMin, double yMax,
+ double zMin, double zMax, double mMin, double mMax) {
+ this.xMin = xMin;
+ this.xMax = xMax;
+ this.yMin = yMin;
+ this.yMax = yMax;
+ this.zMin = zMin;
+ this.zMax = zMax;
+ this.mMin = mMin;
+ this.mMax = mMax;
+
+ // Update the validity
+ valid = isXYValid();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof BoundingBox other)) {
+ return false;
+ }
+ if (obj == this) {
+ return true;
+ }
+
+ // Valid flag must be checked since invalid bounding boxes may have equal coordinates with the initial one
+ return xMin == other.xMin && xMax == other.xMax && yMin == other.yMin && yMax == other.yMax &&
+ zMin == other.zMin && zMax == other.zMax && mMin == other.mMin && mMax == other.mMax &&
+ valid == other.valid;
+ }
+
+ @Override
+ public int hashCode() {
+ return Double.hashCode(xMin) ^ Double.hashCode(xMax) ^
+ Double.hashCode(yMin) ^ Double.hashCode(yMax) ^
+ Double.hashCode(zMin) ^ Double.hashCode(zMax) ^
+ Double.hashCode(mMin) ^ Double.hashCode(mMax) ^
+ Boolean.hashCode(valid);
+ }
+
+ // Don't change `valid` here and let the caller maintain it
+ private void resetBBox() {
+ xMin = Double.POSITIVE_INFINITY;
+ xMax = Double.NEGATIVE_INFINITY;
+ yMin = Double.POSITIVE_INFINITY;
+ yMax = Double.NEGATIVE_INFINITY;
+ zMin = Double.POSITIVE_INFINITY;
+ zMax = Double.NEGATIVE_INFINITY;
+ mMin = Double.POSITIVE_INFINITY;
+ mMax = Double.NEGATIVE_INFINITY;
+ }
+
+ public double getXMin() {
+ return xMin;
+ }
+
+ public double getXMax() {
+ return xMax;
+ }
+
+ public double getYMin() {
+ return yMin;
+ }
+
+ public double getYMax() {
+ return yMax;
+ }
+
+ public double getZMin() {
+ return zMin;
+ }
+
+ public double getZMax() {
+ return zMax;
+ }
+
+ public double getMMin() {
+ return mMin;
+ }
+
+ public double getMMax() {
+ return mMax;
+ }
+
+ /**
+ * Checks if the bounding box is valid.
+ * A bounding box is considered valid if none of the X / Y dimensions contain NaN.
+ *
+ * @return true if the bounding box is valid, false otherwise.
+ */
+ public boolean isValid() {
+ return valid;
+ }
+
+ /**
+ * Checks if the X and Y dimensions of the bounding box are valid.
+ * The X and Y dimensions are considered valid if none of the bounds contain NaN.
+ *
+ * @return true if the X and Y dimensions are valid, false otherwise.
+ */
+ public boolean isXYValid() {
+ return isXValid() && isYValid();
+ }
+
+ /**
+ * Checks if the X dimension of the bounding box is valid.
+ * The X dimension is considered valid if neither bound contains NaN.
+ *
+ * @return true if the X dimension is valid, false otherwise.
+ */
+ public boolean isXValid() {
+ return !(Double.isNaN(xMin) || Double.isNaN(xMax));
+ }
+
+ /**
+ * Checks if the Y dimension of the bounding box is valid.
+ * The Y dimension is considered valid if neither bound contains NaN.
+ *
+ * @return true if the Y dimension is valid, false otherwise.
+ */
+ public boolean isYValid() {
+ return !(Double.isNaN(yMin) || Double.isNaN(yMax));
+ }
+
+ /**
+ * Checks if the Z dimension of the bounding box is valid.
+ * The Z dimension is considered valid if none of the bounds contain NaN.
+ *
+ * @return true if the Z dimension is valid, false otherwise.
+ */
+ public boolean isZValid() {
+ return !(Double.isNaN(zMin) || Double.isNaN(zMax));
+ }
+
+ /**
+ * Checks if the M dimension of the bounding box is valid.
+ * The M dimension is considered valid if none of the bounds contain NaN.
+ *
+ * @return true if the M dimension is valid, false otherwise.
+ */
+ public boolean isMValid() {
+ return !(Double.isNaN(mMin) || Double.isNaN(mMax));
+ }
+
+ /**
+ * Checks if the bounding box is empty in the X / Y dimension.
+ *
+ * @return true if the bounding box is empty, false otherwise.
+ */
+ public boolean isXYEmpty() {
+ return isXEmpty() || isYEmpty();
+ }
+
+ /**
+ * Checks if the bounding box is empty in the X dimension.
+ *
+ * @return true if the X dimension is empty, false otherwise.
+ */
+ public boolean isXEmpty() {
+ return Double.isInfinite(xMin) && Double.isInfinite(xMax);
+ }
+
+ /**
+ * Checks if the bounding box is empty in the Y dimension.
+ *
+ * @return true if the Y dimension is empty, false otherwise.
+ */
+ public boolean isYEmpty() {
+ return Double.isInfinite(yMin) && Double.isInfinite(yMax);
+ }
+
+ /**
+ * Checks if the bounding box is empty in the Z dimension.
+ *
+ * @return true if the Z dimension is empty, false otherwise.
+ */
+ public boolean isZEmpty() {
+ return Double.isInfinite(zMin) && Double.isInfinite(zMax);
+ }
+
+ /**
+ * Checks if the bounding box is empty in the M dimension.
+ *
+ * @return true if the M dimension is empty, false otherwise.
+ */
+ public boolean isMEmpty() {
+ return Double.isInfinite(mMin) && Double.isInfinite(mMax);
+ }
+
+ /**
+ * Expands this bounding box to include the bounds of another box.
+ * After merging, this bounding box will contain both its original extent
+ * and the extent of the other bounding box.
+ *
+ * @param other the other BoundingBox whose bounds will be merged into this one
+ */
+ public void merge(BoundingBox other) {
+ if (!valid) {
+ return;
+ }
+
+ // If other is null or invalid, mark this as invalid
+ if (other == null || !other.valid) {
+ valid = false;
+ resetBBox();
+ return;
+ }
+
+ this.xMin = Math.min(this.xMin, other.xMin);
+ this.xMax = Math.max(this.xMax, other.xMax);
+ this.yMin = Math.min(this.yMin, other.yMin);
+ this.yMax = Math.max(this.yMax, other.yMax);
+ this.zMin = Math.min(this.zMin, other.zMin);
+ this.zMax = Math.max(this.zMax, other.zMax);
+ this.mMin = Math.min(this.mMin, other.mMin);
+ this.mMax = Math.max(this.mMax, other.mMax);
+
+ // Update the validity of this bounding box based on the other bounding box
+ valid = isXYValid();
+ }
+
+ /**
+ * Extends this bounding box to include the spatial extent of the provided geometry.
+ * The bounding box coordinates (min/max values for x, y, z, m) will be adjusted
+ * to encompass both the current bounds and the geometry's bounds.
+ *
+ * @param geometry The geometry whose coordinates will be used to update this bounding box.
+ * If null or empty, the method returns without making any changes.
+ */
+ public void update(Geometry geometry) {
+ if (!valid) {
+ return;
+ }
+
+ if (geometry == null || geometry.isEmpty()) {
+ return;
+ }
+
+ // Updates the X and Y bounds of this bounding box with the given coordinates.
+ // Updates are conditional:
+ // - X bounds are only updated if both minX and maxX are not NaN
+ // - Y bounds are only updated if both minY and maxY are not NaN
+ // This allows partial updates while preserving valid dimensions.
+ Envelope envelope = geometry.getEnvelopeInternal();
+ if (!Double.isNaN(envelope.getMinX()) && !Double.isNaN(envelope.getMaxX())) {
+ xMin = Math.min(xMin, envelope.getMinX());
+ xMax = Math.max(xMax, envelope.getMaxX());
+ }
+ if (!Double.isNaN(envelope.getMinY()) && !Double.isNaN(envelope.getMaxY())) {
+ yMin = Math.min(yMin, envelope.getMinY());
+ yMax = Math.max(yMax, envelope.getMaxY());
+ }
+
+ for (Coordinate coord : geometry.getCoordinates()) {
+ if (!Double.isNaN(coord.getZ())) {
+ zMin = Math.min(zMin, coord.getZ());
+ zMax = Math.max(zMax, coord.getZ());
+ }
+ if (!Double.isNaN(coord.getM())) {
+ mMin = Math.min(mMin, coord.getM());
+ mMax = Math.max(mMax, coord.getM());
+ }
+ }
+
+ // Update the validity of this bounding box based on the other bounding box
+ valid = isXYValid();
+ }
+
+ /**
+ * Resets the bounding box to its initial state.
+ */
+ public void reset() {
+ resetBBox();
+ valid = true;
+ }
+
+ /**
+ * Creates a copy of the current bounding box.
+ *
+ * @return a new BoundingBox instance with the same values as this one.
+ */
+ public BoundingBox copy() {
+ return new BoundingBox(
+ this.xMin, this.xMax,
+ this.yMin, this.yMax,
+ this.zMin, this.zMax,
+ this.mMin, this.mMax);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder("BoundingBox{xMin=")
+ .append(xMin)
+ .append(", xMax=")
+ .append(xMax)
+ .append(", yMin=")
+ .append(yMin)
+ .append(", yMax=")
+ .append(yMax)
+ .append(", zMin=")
+ .append(zMin)
+ .append(", zMax=")
+ .append(zMax)
+ .append(", mMin=")
+ .append(mMin)
+ .append(", mMax=")
+ .append(mMax);
+
+ // Only include the valid flag when it's false
+ if (!valid) {
+ sb.append(", valid=false");
+ }
+
+ sb.append('}');
+ return sb.toString();
+ }
+}
diff --git a/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java
new file mode 100644
index 0000000000..c1067adad4
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/geospatial/GeospatialTypes.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.geospatial;
+
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Geometry;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * A list of geospatial types from all instances in the Geometry or Geography column,
+ * or an empty list if they are not known.
+ *
+ * The GeospatialTypes instance becomes invalid in the following cases:
+ * - When an unknown or unsupported geometry type is encountered during update
+ * - When merging with another invalid GeospatialTypes instance
+ * - When explicitly aborted using abort()
+ *
+ * When invalid, the types list is cleared and remains empty. All subsequent
+ * updates and merges are ignored until reset() is called.
+ */
+public class GeospatialTypes {
+
+ private static final int UNKNOWN_TYPE_ID = -1;
+ private Set types = new HashSet<>();
+ private boolean valid = true;
+
+ public GeospatialTypes(Set types) {
+ this.types = types;
+ this.valid = true;
+ }
+
+ public GeospatialTypes(Set types, boolean valid) {
+ this.types = types;
+ this.valid = valid;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (!(obj instanceof GeospatialTypes other)) {
+ return false;
+ }
+ if (obj == this) {
+ return true;
+ }
+ return valid == other.valid && types.equals(other.types);
+ }
+
+ @Override
+ public int hashCode() {
+ return types.hashCode() ^ Boolean.hashCode(valid);
+ }
+
+ public GeospatialTypes() {}
+
+ public Set getTypes() {
+ return types;
+ }
+
+ /**
+ * Updates the types list with the given geometry's type.
+ * If the geometry type is unknown, the instance becomes invalid.
+ *
+ * @param geometry the geometry to process
+ */
+ public void update(Geometry geometry) {
+ if (!valid) {
+ return;
+ }
+
+ if (geometry == null || geometry.isEmpty()) {
+ return;
+ }
+
+ int code = getGeometryTypeCode(geometry);
+ if (code != UNKNOWN_TYPE_ID) {
+ types.add(code);
+ } else {
+ valid = false;
+ types.clear();
+ }
+ }
+
+ public void merge(GeospatialTypes other) {
+ if (!valid) {
+ return;
+ }
+
+ if (other == null || !other.valid) {
+ valid = false;
+ types.clear();
+ return;
+ }
+ types.addAll(other.types);
+ }
+
+ public void reset() {
+ types.clear();
+ valid = true;
+ }
+
+ public boolean isValid() {
+ return valid;
+ }
+
+ public GeospatialTypes copy() {
+ return new GeospatialTypes(new HashSet<>(types), valid);
+ }
+
+ /**
+ * Extracts the base geometry type code from a full type code.
+ * For example: 1001 (XYZ Point) -> 1 (Point)
+ *
+ * @param typeId the full geometry type code
+ * @return the base type code (1-7)
+ */
+ private int getBaseTypeCode(int typeId) {
+ return typeId % 1000;
+ }
+
+ /**
+ * Extracts the dimension prefix from a full type code.
+ * For example: 1001 (XYZ Point) -> 1000 (XYZ)
+ *
+ * @param typeId the full geometry type code
+ * @return the dimension prefix (0, 1000, 2000, or 3000)
+ */
+ private int getDimensionPrefix(int typeId) {
+ return (typeId / 1000) * 1000;
+ }
+
+ @Override
+ public String toString() {
+ return "GeospatialTypes{" + "types="
+ + types.stream().map(this::typeIdToString).collect(Collectors.toSet()) + '}';
+ }
+
+ private int getGeometryTypeId(Geometry geometry) {
+ return switch (geometry.getGeometryType()) {
+ case Geometry.TYPENAME_POINT -> 1;
+ case Geometry.TYPENAME_LINESTRING -> 2;
+ case Geometry.TYPENAME_POLYGON -> 3;
+ case Geometry.TYPENAME_MULTIPOINT -> 4;
+ case Geometry.TYPENAME_MULTILINESTRING -> 5;
+ case Geometry.TYPENAME_MULTIPOLYGON -> 6;
+ case Geometry.TYPENAME_GEOMETRYCOLLECTION -> 7;
+ default -> UNKNOWN_TYPE_ID;
+ };
+ }
+
+ /**
+ * Geospatial type codes:
+ *
+ * | Type | XY | XYZ | XYM | XYZM |
+ * | :----------------- | :--- | :--- | :--- | :--: |
+ * | Point | 0001 | 1001 | 2001 | 3001 |
+ * | LineString | 0002 | 1002 | 2002 | 3002 |
+ * | Polygon | 0003 | 1003 | 2003 | 3003 |
+ * | MultiPoint | 0004 | 1004 | 2004 | 3004 |
+ * | MultiLineString | 0005 | 1005 | 2005 | 3005 |
+ * | MultiPolygon | 0006 | 1006 | 2006 | 3006 |
+ * | GeometryCollection | 0007 | 1007 | 2007 | 3007 |
+ *
+ * See https://github.com/apache/parquet-format/blob/master/Geospatial.md#geospatial-types
+ */
+ private int getGeometryTypeCode(Geometry geometry) {
+ int typeId = getGeometryTypeId(geometry);
+ if (typeId == UNKNOWN_TYPE_ID) {
+ return UNKNOWN_TYPE_ID;
+ }
+ Coordinate[] coordinates = geometry.getCoordinates();
+ boolean hasZ = false;
+ boolean hasM = false;
+ if (coordinates.length > 0) {
+ Coordinate firstCoord = coordinates[0];
+ hasZ = !Double.isNaN(firstCoord.getZ());
+ hasM = !Double.isNaN(firstCoord.getM());
+ }
+ if (hasZ) {
+ typeId += 1000;
+ }
+ if (hasM) {
+ typeId += 2000;
+ }
+ return typeId;
+ }
+
+ private String typeIdToString(int typeId) {
+ String typeString;
+
+ typeString = switch (typeId % 1000) {
+ case 1 -> Geometry.TYPENAME_POINT;
+ case 2 -> Geometry.TYPENAME_LINESTRING;
+ case 3 -> Geometry.TYPENAME_POLYGON;
+ case 4 -> Geometry.TYPENAME_MULTIPOINT;
+ case 5 -> Geometry.TYPENAME_MULTILINESTRING;
+ case 6 -> Geometry.TYPENAME_MULTIPOLYGON;
+ case 7 -> Geometry.TYPENAME_GEOMETRYCOLLECTION;
+ default -> {
+ yield "Unknown";
+ }
+ };
+ if (typeId >= 3000) {
+ typeString += " (XYZM)";
+ } else if (typeId >= 2000) {
+ typeString += " (XYM)";
+ } else if (typeId >= 1000) {
+ typeString += " (XYZ)";
+ } else {
+ typeString += " (XY)";
+ }
+ return typeString;
+ }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
index d6147050ec..46b87bfdef 100644
--- a/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
+++ b/java/core/src/java/org/apache/orc/impl/ColumnStatisticsImpl.java
@@ -30,11 +30,17 @@
import org.apache.orc.DateColumnStatistics;
import org.apache.orc.DecimalColumnStatistics;
import org.apache.orc.DoubleColumnStatistics;
+import org.apache.orc.GeospatialColumnStatistics;
import org.apache.orc.IntegerColumnStatistics;
import org.apache.orc.OrcProto;
import org.apache.orc.StringColumnStatistics;
import org.apache.orc.TimestampColumnStatistics;
import org.apache.orc.TypeDescription;
+import org.apache.orc.geospatial.BoundingBox;
+import org.apache.orc.geospatial.GeospatialTypes;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.ParseException;
+import org.locationtech.jts.io.WKBReader;
import org.threeten.extra.chrono.HybridChronology;
import java.sql.Date;
@@ -42,6 +48,11 @@
import java.time.chrono.ChronoLocalDate;
import java.time.chrono.Chronology;
import java.time.chrono.IsoChronology;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
import java.util.TimeZone;
@@ -1858,6 +1869,167 @@ public Timestamp getMaximum() {
private boolean hasNull = false;
private long bytesOnDisk = 0;
+ private static final class GeospatialStatisticsImpl extends ColumnStatisticsImpl
+ implements GeospatialColumnStatistics {
+ private final BoundingBox boundingBox;
+ private final GeospatialTypes geospatialTypes;
+ private final WKBReader reader = new WKBReader();
+
+ GeospatialStatisticsImpl() {
+ this.boundingBox = new BoundingBox();
+ this.geospatialTypes = new GeospatialTypes();
+ }
+
+ GeospatialStatisticsImpl(OrcProto.ColumnStatistics stats) {
+ super(stats);
+ BoundingBox boundingBoxOut = null;
+ GeospatialTypes geospatialTypesOut = null;
+
+ OrcProto.GeospatialStatistics geoStatistics = stats.getGeospatialStatistics();
+ if (geoStatistics.hasBbox()) {
+ OrcProto.BoundingBox bbox = geoStatistics.getBbox();
+ boundingBoxOut = new BoundingBox(
+ bbox.hasXmin() ? bbox.getXmin() : Double.NaN,
+ bbox.hasXmax() ? bbox.getXmax() : Double.NaN,
+ bbox.hasYmin() ? bbox.getYmin() : Double.NaN,
+ bbox.hasYmax() ? bbox.getYmax() : Double.NaN,
+ bbox.hasZmin() ? bbox.getZmin() : Double.NaN,
+ bbox.hasZmax() ? bbox.getZmax() : Double.NaN,
+ bbox.hasMmin() ? bbox.getMmin() : Double.NaN,
+ bbox.hasMmax() ? bbox.getMmax() : Double.NaN);
+ }
+
+ if (!geoStatistics.getGeospatialTypesList().isEmpty()) {
+ Set types = new HashSet<>(geoStatistics.getGeospatialTypesList());
+ geospatialTypesOut = new GeospatialTypes(types);
+ }
+ this.boundingBox = boundingBoxOut;
+ this.geospatialTypes = geospatialTypesOut;
+ }
+
+ @Override
+ public void updateGeometry(BytesWritable value) {
+ if (value == null) {
+ return;
+ }
+
+ try {
+ Geometry geom = reader.read(value.getBytes());
+ boundingBox.update(geom);
+ geospatialTypes.update(geom);
+ } catch (ParseException e) {
+ throw new IllegalArgumentException("Invalid geospatial data - failed to parse WKB format", e);
+ }
+ }
+
+ @Override
+ public void updateGeometry(byte[] bytes, int offset, int length) {
+ if (bytes == null) {
+ return;
+ }
+ BytesWritable value = new BytesWritable();
+ value.set(bytes, offset, length);
+ updateGeometry(value);
+ }
+
+ @Override
+ public void reset() {
+ super.reset();
+ boundingBox.reset();;
+ geospatialTypes.reset();
+ }
+
+ @Override
+ public void merge(ColumnStatisticsImpl other) {
+ if (other instanceof GeospatialStatisticsImpl geoStats) {
+ boundingBox.merge(geoStats.boundingBox);
+ geospatialTypes.merge(geoStats.geospatialTypes);
+ } else {
+ throw new IllegalArgumentException("Incompatible merging of geospatial column statistics");
+ }
+ super.merge(other);
+ }
+
+ @Override
+ public OrcProto.ColumnStatistics.Builder serialize() {
+ OrcProto.ColumnStatistics.Builder builder = super.serialize();
+ OrcProto.GeospatialStatistics.Builder geoStats = OrcProto.GeospatialStatistics.newBuilder();
+
+ OrcProto.BoundingBox.Builder bboxBuilder = OrcProto.BoundingBox.newBuilder();
+ if (boundingBox.isValid() && !boundingBox.isXYEmpty()) {
+ bboxBuilder.setXmin(boundingBox.getXMin());
+ bboxBuilder.setXmax(boundingBox.getXMax());
+ bboxBuilder.setYmin(boundingBox.getYMin());
+ bboxBuilder.setYmax(boundingBox.getYMax());
+ if (boundingBox.isZValid() && !boundingBox.isZEmpty()) {
+ bboxBuilder.setZmin(boundingBox.getZMin());
+ bboxBuilder.setZmax(boundingBox.getZMax());
+ }
+ if (boundingBox.isMValid() && !boundingBox.isMEmpty()) {
+ bboxBuilder.setMmin(boundingBox.getMMin());
+ bboxBuilder.setMmax(boundingBox.getMMax());
+ }
+ geoStats.setBbox(bboxBuilder);
+ }
+ if (geospatialTypes.isValid() && !geospatialTypes.getTypes().isEmpty()) {
+ List sortedTypes = new ArrayList<>(geospatialTypes.getTypes());
+ Collections.sort(sortedTypes);
+ geoStats.addAllGeospatialTypes(sortedTypes);
+ }
+ builder.setGeospatialStatistics(geoStats);
+ return builder;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder buf = new StringBuilder(super.toString());
+ if (boundingBox.isValid()) {
+ buf.append(" bbox: ");
+ buf.append(boundingBox.toString());
+ }
+ if (geospatialTypes.isValid()) {
+ buf.append(" types: ");
+ buf.append(geospatialTypes.toString());
+ }
+ return buf.toString();
+ }
+
+ @Override
+ public BoundingBox getBoundingBox() {
+ return boundingBox;
+ }
+
+ @Override
+ public GeospatialTypes getGeospatialTypes() {
+ return geospatialTypes;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof GeospatialStatisticsImpl that)) {
+ return false;
+ }
+ if (!super.equals(o)) {
+ return false;
+ }
+
+ return boundingBox.equals(that.boundingBox) &&
+ geospatialTypes.equals(that.geospatialTypes);
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + boundingBox.hashCode();
+ result = prime * result + geospatialTypes.hashCode();
+ return result;
+ }
+ }
+
ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) {
if (stats.hasNumberOfValues()) {
count = stats.getNumberOfValues();
@@ -1955,6 +2127,14 @@ public void updateTimestamp(long value, int nanos) {
throw new UnsupportedOperationException("Can't update timestamp");
}
+ public void updateGeometry(BytesWritable value) {
+ throw new UnsupportedOperationException("Can't update Geometry");
+ }
+
+ public void updateGeometry(byte[] bytes, int offset, int length) {
+ throw new UnsupportedOperationException("Can't update Geometry");
+ }
+
public boolean isStatsExists() {
return (count > 0 || hasNull == true);
}
@@ -2046,6 +2226,9 @@ public static ColumnStatisticsImpl create(TypeDescription schema,
return new TimestampInstantStatisticsImpl();
case BINARY:
return new BinaryStatisticsImpl();
+ case Geography:
+ case Geometry:
+ return new GeospatialStatisticsImpl();
default:
return new ColumnStatisticsImpl();
}
@@ -2089,6 +2272,8 @@ public static ColumnStatisticsImpl deserialize(TypeDescription schema,
writerUsedProlepticGregorian, convertToProlepticGregorian);
} else if(stats.hasBinaryStatistics()) {
return new BinaryStatisticsImpl(stats);
+ } else if (stats.hasGeospatialStatistics()) {
+ return new GeospatialStatisticsImpl(stats);
} else {
return new ColumnStatisticsImpl(stats);
}
diff --git a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
index 09b4b2ae61..eacff4b063 100644
--- a/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
+++ b/java/core/src/java/org/apache/orc/impl/SchemaEvolution.java
@@ -461,6 +461,8 @@ void buildConversion(TypeDescription fileType,
case TIMESTAMP_INSTANT:
case BINARY:
case DATE:
+ case Geometry:
+ case Geography:
// these are always a match
break;
case CHAR:
diff --git a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
index 418b9c9561..785f568ff4 100644
--- a/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
+++ b/java/core/src/java/org/apache/orc/impl/TreeReaderFactory.java
@@ -1154,6 +1154,12 @@ public void skipRows(long items, ReadPhase readPhase) throws IOException {
}
}
+ public static class GeospatialTreeReader extends BinaryTreeReader {
+ GeospatialTreeReader(int columnId, Context context) throws IOException {
+ super(columnId, context);
+ }
+ }
+
public static class TimestampTreeReader extends TreeReader {
protected IntegerReader data = null;
protected IntegerReader nanos = null;
@@ -3028,6 +3034,9 @@ public static TypeReader createTreeReader(TypeDescription readerType,
}
return new DecimalTreeReader(fileType.getId(), fileType.getPrecision(),
fileType.getScale(), context);
+ case Geography:
+ case Geometry:
+ return new GeospatialTreeReader(fileType.getId(), context);
case STRUCT:
return new StructTreeReader(fileType.getId(), readerType, context);
case LIST:
diff --git a/java/core/src/java/org/apache/orc/impl/TypeUtils.java b/java/core/src/java/org/apache/orc/impl/TypeUtils.java
index a5daa89572..40d22e2c43 100644
--- a/java/core/src/java/org/apache/orc/impl/TypeUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/TypeUtils.java
@@ -69,6 +69,8 @@ public static ColumnVector createColumn(TypeDescription schema,
case BINARY:
case CHAR:
case VARCHAR:
+ case Geometry:
+ case Geography:
return new BytesColumnVector(maxSize);
case STRUCT: {
List children = schema.getChildren();
diff --git a/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
new file mode 100644
index 0000000000..e9a0aa70bf
--- /dev/null
+++ b/java/core/src/java/org/apache/orc/impl/writer/GeospatialTreeWriter.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.impl.writer;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.orc.OrcProto;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.CryptoUtils;
+import org.apache.orc.impl.IntegerWriter;
+import org.apache.orc.impl.PositionRecorder;
+import org.apache.orc.impl.PositionedOutputStream;
+import org.apache.orc.impl.StreamName;
+
+import java.io.IOException;
+import java.util.function.Consumer;
+
+public class GeospatialTreeWriter extends TreeWriterBase {
+ private final PositionedOutputStream stream;
+ private final IntegerWriter length;
+ private boolean isDirectV2 = true;
+ private long rawDataSize = 0;
+ private boolean isGeometry = false;
+
+ public GeospatialTreeWriter(TypeDescription schema,
+ WriterEncryptionVariant encryption,
+ WriterContext context) throws IOException {
+ super(schema, encryption, context);
+ this.isGeometry = schema.getCategory() == TypeDescription.Category.Geometry;
+ this.stream = context.createStream(
+ new StreamName(id, OrcProto.Stream.Kind.DATA, encryption));
+ this.isDirectV2 = isNewWriteFormat(context);
+ this.length = createIntegerWriter(context.createStream(
+ new StreamName(id, OrcProto.Stream.Kind.LENGTH, encryption)),
+ false, isDirectV2, context);
+ if (rowIndexPosition != null) {
+ recordPosition(rowIndexPosition);
+ }
+ }
+
+ @Override
+ OrcProto.ColumnEncoding.Builder getEncoding() {
+ OrcProto.ColumnEncoding.Builder result = super.getEncoding();
+ if (isDirectV2) {
+ result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2);
+ } else {
+ result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT);
+ }
+ return result;
+ }
+
+ @Override
+ public void writeBatch(ColumnVector vector, int offset,
+ int length) throws IOException {
+ super.writeBatch(vector, offset, length);
+ BytesColumnVector vec = (BytesColumnVector) vector;
+ if (vector.isRepeating) {
+ if (vector.noNulls || !vector.isNull[0]) {
+ for (int i = 0; i < length; ++i) {
+ stream.write(vec.vector[0], vec.start[0],
+ vec.length[0]);
+ this.length.write(vec.length[0]);
+ }
+ rawDataSize += (long) length * vec.length[0];
+ if (isGeometry) {
+ indexStatistics.updateGeometry(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ if (createBloomFilter) {
+ if (bloomFilter != null) {
+ bloomFilter.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ bloomFilterUtf8.addBytes(vec.vector[0], vec.start[0], vec.length[0]);
+ }
+ }
+ } else {
+ for (int i = 0; i < length; ++i) {
+ if (vec.noNulls || !vec.isNull[i + offset]) {
+ stream.write(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ this.length.write(vec.length[offset + i]);
+ rawDataSize += vec.length[offset + i];
+ BytesWritable bw = new BytesWritable();
+ bw.set(vec.vector[offset + i], vec.start[offset + i], vec.length[offset + i]);
+ if (isGeometry) {
+ indexStatistics.updateGeometry(vec.vector[i], vec.start[i], vec.length[i]);
+ }
+ if (createBloomFilter) {
+ if (bloomFilter != null) {
+ bloomFilter.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ }
+ bloomFilterUtf8.addBytes(vec.vector[offset + i],
+ vec.start[offset + i], vec.length[offset + i]);
+ }
+ }
+ }
+ }
+ }
+
+ @Override
+ public void writeStripe(int requiredIndexEntries) throws IOException {
+ super.writeStripe(requiredIndexEntries);
+ if (rowIndexPosition != null) {
+ recordPosition(rowIndexPosition);
+ }
+ }
+
+ @Override
+ void recordPosition(PositionRecorder recorder) throws IOException {
+ super.recordPosition(recorder);
+ stream.getPosition(recorder);
+ length.getPosition(recorder);
+ }
+
+ @Override
+ public long estimateMemory() {
+ return super.estimateMemory() + stream.getBufferSize() +
+ length.estimateMemory();
+ }
+
+ @Override
+ public long getRawDataSize() {
+ return rawDataSize;
+ }
+
+ @Override
+ public void flushStreams() throws IOException {
+ super.flushStreams();
+ stream.flush();
+ length.flush();
+ }
+
+ @Override
+ public void prepareStripe(int stripeId) {
+ super.prepareStripe(stripeId);
+ Consumer updater = CryptoUtils.modifyIvForStripe(stripeId);
+ stream.changeIv(updater);
+ length.changeIv(updater);
+ }
+}
diff --git a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java
index 71eb3a5648..de63f9efb6 100644
--- a/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java
+++ b/java/core/src/java/org/apache/orc/impl/writer/TreeWriter.java
@@ -185,6 +185,9 @@ static TreeWriter createSubtree(TypeDescription schema,
return new ListTreeWriter(schema, encryption, streamFactory);
case UNION:
return new UnionTreeWriter(schema, encryption, streamFactory);
+ case Geometry:
+ case Geography:
+ return new GeospatialTreeWriter(schema, encryption, streamFactory);
default:
throw new IllegalArgumentException("Bad category: " +
schema.getCategory());
diff --git a/java/core/src/test/org/apache/orc/TestColumnStatistics.java b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
index 5fc429199e..dea3359d92 100644
--- a/java/core/src/test/org/apache/orc/TestColumnStatistics.java
+++ b/java/core/src/test/org/apache/orc/TestColumnStatistics.java
@@ -28,10 +28,14 @@
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
+import org.apache.orc.geospatial.BoundingBox;
+import org.apache.orc.geospatial.GeospatialTypes;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
+import org.locationtech.jts.geom.*;
+import org.locationtech.jts.io.WKBWriter;
import java.io.File;
import java.math.BigDecimal;
@@ -742,6 +746,201 @@ public void testMergeIncompatible() {
assertEquals(0, ((DoubleColumnStatistics) doubleStats).getNumberOfValues());
}
+ @Test
+ public void testUpdateGeometry() {
+ TypeDescription desc = TypeDescription.createGeometry();
+ ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc);
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter();
+
+ byte[][] points = {
+ wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))),
+ wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))),
+ };
+
+ for (byte[] point : points) {
+ stats.updateGeometry(new BytesWritable(point));
+ }
+
+ GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats;
+ BoundingBox bbox = geometryStatistics.getBoundingBox();
+ assertEquals(1.0, bbox.getXMin(), 0.0);
+ assertEquals(2.0, bbox.getXMax(), 0.0);
+ assertEquals(1.0, bbox.getYMin(), 0.0);
+ assertEquals(2.0, bbox.getYMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+ assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}",
+ bbox.toString());
+ assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}",
+ geometryStatistics.toString());
+
+ GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes();
+ assertTrue(geospatialTypes.getTypes().contains(1));
+ assertEquals(1, geospatialTypes.getTypes().size());
+ }
+
+ @Test
+ public void testUpdateGeometryWithDifferentTypes() {
+ TypeDescription desc = TypeDescription.createGeometry();
+ ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc);
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter();
+
+ Point point = geometryFactory.createPoint(new Coordinate(1, 1));
+ Coordinate[] lineCoords = new Coordinate[]{new Coordinate(1, 1), new Coordinate(2, 2)};
+ LineString line = geometryFactory.createLineString(lineCoords);
+ Coordinate[] polygonCoords = new Coordinate[]{
+ new Coordinate(0, 0), new Coordinate(3, 0),
+ new Coordinate(1, 3), new Coordinate(0, 1),
+ new Coordinate(0, 0)
+ };
+ LinearRing shell = geometryFactory.createLinearRing(polygonCoords);
+ Polygon polygon = geometryFactory.createPolygon(shell);
+
+ GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats;
+ BoundingBox bbox = geometryStatistics.getBoundingBox();
+ GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes();
+ // Generate WKB and update stats
+ byte[] pointWkb = wkbWriter.write(point);
+ stats.updateGeometry(new BytesWritable(pointWkb));
+
+ assertEquals(1.0, bbox.getXMin(), 0.0);
+ assertEquals(1.0, bbox.getXMax(), 0.0);
+ assertEquals(1.0, bbox.getYMin(), 0.0);
+ assertEquals(1.0, bbox.getYMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+ assertEquals("BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}",
+ bbox.toString());
+
+ assertTrue(geospatialTypes.getTypes().contains(1));
+ assertEquals(1, geospatialTypes.getTypes().size());
+ assertEquals("GeospatialTypes{types=[Point (XY)]}", geospatialTypes.toString());
+
+
+ assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=1.0, yMin=1.0, yMax=1.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}",
+ geometryStatistics.toString());
+
+ byte[] lineWkb = wkbWriter.write(line);
+ stats.updateGeometry(new BytesWritable(lineWkb));
+ assertEquals(1.0, bbox.getXMin(), 0.0);
+ assertEquals(2.0, bbox.getXMax(), 0.0);
+ assertEquals(1.0, bbox.getYMin(), 0.0);
+ assertEquals(2.0, bbox.getYMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+
+ assertTrue(geospatialTypes.getTypes().contains(1));
+ assertTrue(geospatialTypes.getTypes().contains(2));
+ assertEquals(2, geospatialTypes.getTypes().size());
+ assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY)]}",
+ geospatialTypes.toString());
+
+ byte[] polygonWkb = wkbWriter.write(polygon);
+ stats.updateGeometry(new BytesWritable(polygonWkb));
+ stats.updateGeometry(new BytesWritable(lineWkb));
+ assertEquals(0.0, bbox.getXMin(), 0.0);
+ assertEquals(3.0, bbox.getXMax(), 0.0);
+ assertEquals(0.0, bbox.getYMin(), 0.0);
+ assertEquals(3.0, bbox.getYMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+
+ assertTrue(geospatialTypes.getTypes().contains(1));
+ assertTrue(geospatialTypes.getTypes().contains(2));
+ assertTrue(geospatialTypes.getTypes().contains(2));
+ assertEquals(3, geospatialTypes.getTypes().size());
+ assertEquals("GeospatialTypes{types=[Point (XY), LineString (XY), Polygon (XY)]}",
+ geospatialTypes.toString());
+
+ }
+
+ @Test
+ public void testUpdateGeometryWithZCoordinates() {
+ TypeDescription desc = TypeDescription.createGeometry();
+ ColumnStatisticsImpl stats = ColumnStatisticsImpl.create(desc);
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter(3);
+
+ Point point1 = geometryFactory.createPoint(new Coordinate(0, 1, 2));
+ Point point2 = geometryFactory.createPoint(new Coordinate(2, 1, 0));
+
+ stats.updateGeometry(new BytesWritable(wkbWriter.write(point1)));
+ stats.updateGeometry(new BytesWritable(wkbWriter.write(point2)));
+
+ GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats;
+ BoundingBox bbox = geometryStatistics.getBoundingBox();
+ assertEquals(0.0, bbox.getXMin(), 0.0);
+ assertEquals(2.0, bbox.getXMax(), 0.0);
+ assertEquals(1.0, bbox.getYMin(), 0.0);
+ assertEquals(1.0, bbox.getYMax(), 0.0);
+ assertEquals(0.0, bbox.getZMin(), 0.0);
+ assertEquals(2.0, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+ assertEquals("BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity}",
+ bbox.toString());
+ assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=0.0, xMax=2.0, yMin=1.0, yMax=1.0, zMin=0.0, zMax=2.0, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XYZ)]}",
+ geometryStatistics.toString());
+
+ GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes();
+ assertTrue(geospatialTypes.getTypes().contains(1001));
+ assertEquals(1, geospatialTypes.getTypes().size());
+ }
+
+ @Test
+ public void TestGeospatialMerge() {
+ TypeDescription desc = TypeDescription.createGeometry();
+ ColumnStatisticsImpl stats0 = ColumnStatisticsImpl.create(desc);
+ ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(desc);
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter();
+
+ byte[][] points = {
+ wkbWriter.write(geometryFactory.createPoint(new Coordinate(1.0, 1.0))),
+ wkbWriter.write(geometryFactory.createPoint(new Coordinate(2.0, 2.0))),
+ };
+
+ stats0.updateGeometry(new BytesWritable(points[0]));
+ stats1.updateGeometry(new BytesWritable(points[1]));
+
+ GeospatialColumnStatistics geometryStatistics = (GeospatialColumnStatistics) stats0;
+ stats0.merge(stats1);
+
+ BoundingBox bbox = geometryStatistics.getBoundingBox();
+ assertTrue(bbox.isXYValid());
+ assertFalse(bbox.isXYEmpty());
+ assertTrue(bbox.isZValid());
+ assertTrue(bbox.isMValid());
+ assertTrue(bbox.isZEmpty());
+ assertTrue(bbox.isMEmpty());
+ assertEquals(1.0, bbox.getXMin(), 0.0);
+ assertEquals(2.0, bbox.getXMax(), 0.0);
+ assertEquals(1.0, bbox.getYMin(), 0.0);
+ assertEquals(2.0, bbox.getYMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getZMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getZMax(), 0.0);
+ assertEquals(Double.POSITIVE_INFINITY, bbox.getMMin(), 0.0);
+ assertEquals(Double.NEGATIVE_INFINITY, bbox.getMMax(), 0.0);
+ assertEquals("BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity}",
+ bbox.toString());
+ assertEquals("count: 0 hasNull: false bbox: BoundingBox{xMin=1.0, xMax=2.0, yMin=1.0, yMax=2.0, zMin=Infinity, zMax=-Infinity, mMin=Infinity, mMax=-Infinity} types: GeospatialTypes{types=[Point (XY)]}",
+ geometryStatistics.toString());
+
+ GeospatialTypes geospatialTypes = geometryStatistics.getGeospatialTypes();
+ assertTrue(geospatialTypes.getTypes().contains(1));
+ assertEquals(1, geospatialTypes.getTypes().size());
+ }
+
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
diff --git a/java/core/src/test/org/apache/orc/TestOrcGeospatial.java b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java
new file mode 100644
index 0000000000..f0e148fb0d
--- /dev/null
+++ b/java/core/src/test/org/apache/orc/TestOrcGeospatial.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInfo;
+import org.locationtech.jts.geom.Coordinate;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.geom.GeometryFactory;
+import org.locationtech.jts.io.WKBReader;
+import org.locationtech.jts.io.WKBWriter;
+
+import java.io.File;
+import java.util.Arrays;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class TestOrcGeospatial implements TestConf {
+ Path workDir = new Path(System.getProperty("test.tmp.dir",
+ "target" + File.separator + "test" + File.separator + "tmp"));
+ FileSystem fs;
+ Path testFilePath;
+
+ @BeforeEach
+ public void openFileSystem(TestInfo testInfo) throws Exception {
+ fs = FileSystem.getLocal(conf);
+ testFilePath = new Path(workDir, "TestOrcGeospatial." +
+ testInfo.getTestMethod().get().getName() + ".orc");
+ fs.delete(testFilePath, false);
+ }
+
+ @Test
+ public void testGeometryWriterWithNulls() throws Exception {
+ // Create a geometry schema and ORC file writer
+ TypeDescription schema = TypeDescription.createGeometry();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter();
+ WKBReader wkbReader = new WKBReader();
+
+ // Add data
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector geos = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 100; i++) {
+ if (i % 2 == 0) {
+ byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i)));
+ geos.setVal(batch.size++, bytes);
+ } else {
+ geos.noNulls = false;
+ geos.isNull[batch.size++] = true;
+ }
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ // Verify reader schema
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals("geometry(OGC:CRS84)", reader.getSchema().toString());
+ assertEquals(100, reader.getNumberOfRows());
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ geos = (BytesColumnVector) batch.cols[0];
+
+ // Verify statistics
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(1, stats.length);
+ assertEquals(50, stats[0].getNumberOfValues());
+ assertTrue(stats[0].hasNull());
+ assertInstanceOf(GeospatialColumnStatistics.class, stats[0]);
+ assertTrue(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isXYValid());
+ assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isZValid());
+ assertFalse(((GeospatialColumnStatistics) stats[0]).getBoundingBox().isMValid());
+ assertEquals("BoundingBox{xMin=0.0, xMax=98.0, yMin=0.0, yMax=98.0, zMin=NaN, zMax=NaN, mMin=NaN, mMax=NaN}", ((GeospatialColumnStatistics) stats[0]).getBoundingBox().toString());
+ assertEquals("GeospatialTypes{types=[Point (XY)]}", ((GeospatialColumnStatistics) stats[0]).getGeospatialTypes().toString());
+
+ // Verify data
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for (int r = 0; r < batch.size; ++r) {
+ if (idx % 2 == 0) {
+ Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r]));
+ assertEquals("Point", geom.getGeometryType());
+ assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx)));
+ } else {
+ assertTrue(geos.isNull[r]);
+ }
+ idx += 1;
+ }
+ }
+ rows.close();
+ }
+
+ @Test
+ public void testGeographyWriterWithNulls() throws Exception {
+ // Create geography schema and ORC file writer
+ TypeDescription schema = TypeDescription.createGeography();
+ Writer writer = OrcFile.createWriter(testFilePath,
+ OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+ .bufferSize(10000));
+ GeometryFactory geometryFactory = new GeometryFactory();
+ WKBWriter wkbWriter = new WKBWriter();
+ WKBReader wkbReader = new WKBReader();
+
+ // Add data
+ VectorizedRowBatch batch = schema.createRowBatch();
+ BytesColumnVector geos = (BytesColumnVector) batch.cols[0];
+ for (int i = 0; i < 100; i++) {
+ if (i % 2 == 0) {
+ byte[] bytes = wkbWriter.write(geometryFactory.createPoint(new Coordinate(i, i)));
+ geos.setVal(batch.size++, bytes);
+ } else {
+ geos.noNulls = false;
+ geos.isNull[batch.size++] = true;
+ }
+ }
+ writer.addRowBatch(batch);
+ writer.close();
+
+ // Verify reader schema
+ Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
+ assertEquals("geography(OGC:CRS84,SPHERICAL)", reader.getSchema().toString());
+ assertEquals(100, reader.getNumberOfRows());
+
+ // Verify statistics, make sure there are no bounding box and geospatial types
+ ColumnStatistics[] stats = reader.getStatistics();
+ assertEquals(1, stats.length);
+ assertEquals(50, stats[0].getNumberOfValues());
+ assertTrue(stats[0].hasNull());
+ assertInstanceOf(GeospatialColumnStatistics.class, stats[0]);
+ assertNull(((GeospatialColumnStatistics) stats[0]).getBoundingBox());
+ assertNull(((GeospatialColumnStatistics) stats[0]).getGeospatialTypes());
+
+ // Verify Data
+ RecordReader rows = reader.rows();
+ batch = reader.getSchema().createRowBatch();
+ geos = (BytesColumnVector) batch.cols[0];
+ int idx = 0;
+ while (rows.nextBatch(batch)) {
+ for (int r = 0; r < batch.size; ++r) {
+ if (idx % 2 == 0) {
+ Geometry geom = wkbReader.read(Arrays.copyOfRange(geos.vector[r], geos.start[r], geos.start[r] + geos.length[r]));
+ assertEquals("Point", geom.getGeometryType());
+ assertEquals(geom, geometryFactory.createPoint(new Coordinate(idx, idx)));
+ } else {
+ assertTrue(geos.isNull[r]);
+ }
+ idx += 1;
+ }
+ }
+ rows.close();
+ }
+}
diff --git a/java/pom.xml b/java/pom.xml
index 50df3df920..f969ea95f1 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -67,6 +67,7 @@
3.4.1
17
${project.basedir}/../target/javadoc
+ 1.20.0
5.12.2
3.7.1
3.8.1