Skip to content
196 changes: 196 additions & 0 deletions c++/include/orc/Geospatial.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/*
* This file contains code adapted from the Apache Arrow project.
*
* Original source:
* https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h
* NOTE: this URL tracks the main branch, whose contents change over time.
* Replace it with a tag-based URL once a release tag contains this code;
* no tag contained it when this file was added.
*
* The original code is licensed under the Apache License, Version 2.0.
*
* Modifications may have been made from the original source.
*/

#ifndef ORC_GEOSPATIAL_HH
#define ORC_GEOSPATIAL_HH

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <limits>
#include <ostream>
#include <string>

namespace orc::geospatial {

// Positive infinity. BoundingBox uses +INF / -INF as the min / max values of a
// dimension that has not recorded any coordinate yet.
// Declared `inline constexpr`, like MAX_DIMENSIONS below, so that every
// translation unit including this header refers to a single entity.
inline constexpr double INF = std::numeric_limits<double>::infinity();
// The maximum number of dimensions supported (X, Y, Z, M)
inline constexpr int MAX_DIMENSIONS = 4;

// Coordinate layouts a geometry can carry. Every layout includes X and Y;
// Z and M are optional. VALUE_MIN / VALUE_MAX alias the first and last layout
// so the valid range can be checked without repeating the literals.
enum class Dimensions {
  XY = 0,    // X and Y only
  XYZ = 1,   // X, Y, and Z
  XYM = 2,   // X, Y, and M
  XYZM = 3,  // X, Y, Z, and M
  VALUE_MIN = XY,
  VALUE_MAX = XYZM
};

// Supported geometry types according to ISO WKB.
//
// There is intentionally no "unknown" enumerator here (the Java implementation
// has UNKNOWN_TYPE_ID = -1): per the review discussion on this change, the C++
// implementation instead invalidates the statistics when an unknown type is
// encountered.
enum class GeometryType {
  POINT = 1,
  LINESTRING = 2,
  POLYGON = 3,
  MULTIPOINT = 4,
  MULTILINESTRING = 5,
  MULTIPOLYGON = 6,
  GEOMETRYCOLLECTION = 7,
  VALUE_MIN = 1,
  VALUE_MAX = 7
};

// BoundingBox represents the minimum bounding rectangle (or box) for a geometry.
// It supports up to 4 dimensions, stored in slot order X, Y, Z, M.
//
// Each dimension is in one of three states:
//   - empty:   min = +INF and max = -INF, i.e. nothing recorded (see reset())
//   - invalid: min or max is NaN (see invalidate()); updates leave such a
//              dimension untouched and merge() propagates the NaN
//   - otherwise min/max hold the smallest/largest value recorded so far
struct BoundingBox {
  using XY = std::array<double, 2>;
  using XYZ = std::array<double, 3>;
  using XYM = std::array<double, 3>;
  using XYZM = std::array<double, 4>;

  // Default constructor: every dimension starts out empty.
  BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {}
  // Constructor with explicit min/max values, given in X, Y, Z, M order.
  BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {}
  BoundingBox(const BoundingBox& other) = default;
  BoundingBox& operator=(const BoundingBox&) = default;

  // Update the bounding box to include a 2D coordinate.
  void updateXY(const XY& coord) {
    updateInternal(coord);
  }
  // Update the bounding box to include a 3D coordinate (XYZ).
  void updateXYZ(const XYZ& coord) {
    updateInternal(coord);
  }
  // Update the bounding box to include a 3D coordinate (XYM).
  // The third component is M, which lives in slot 3; slot 2 (Z) is not touched.
  void updateXYM(const XYM& coord) {
    extendDimension(0, coord[0]);
    extendDimension(1, coord[1]);
    extendDimension(3, coord[2]);
  }
  // Update the bounding box to include a 4D coordinate (XYZM).
  void updateXYZM(const XYZM& coord) {
    updateInternal(coord);
  }

  // Reset the bounding box to its initial empty state.
  void reset() {
    min.fill(INF);
    max.fill(-INF);
  }

  // Invalidate the bounding box (set all values to NaN).
  void invalidate() {
    min.fill(std::numeric_limits<double>::quiet_NaN());
    max.fill(std::numeric_limits<double>::quiet_NaN());
  }

  // Check if the bound for a given dimension is empty.
  // `dim` must be in [0, MAX_DIMENSIONS); it is not range-checked.
  //
  // An empty bound is inverted (min > max) with at least one side still at its
  // infinite sentinel. The sentinels are tested directly instead of checking
  // isinf(min - max): that difference is also infinite for recorded ranges
  // such as [-inf, 5], [-inf, +inf] or [-1.5e308, 1.5e308] (overflow), which
  // must not be reported as empty. An invalid (NaN) bound is never empty.
  bool boundEmpty(int dim) const {
    const auto slot = static_cast<std::size_t>(dim);
    return min[slot] > max[slot] && (std::isinf(min[slot]) || std::isinf(max[slot]));
  }

  // Check if the bound for a given dimension is valid (not NaN).
  // `dim` must be in [0, MAX_DIMENSIONS); it is not range-checked.
  bool boundValid(int dim) const {
    const auto slot = static_cast<std::size_t>(dim);
    return !(std::isnan(min[slot]) || std::isnan(max[slot]));
  }

  // Get the lower bound (min values).
  const XYZM& lowerBound() const {
    return min;
  }
  // Get the upper bound (max values).
  const XYZM& upperBound() const {
    return max;
  }

  // Get validity for each dimension, in X, Y, Z, M order.
  std::array<bool, MAX_DIMENSIONS> dimensionValid() const {
    return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)};
  }
  // Get emptiness for each dimension, in X, Y, Z, M order.
  std::array<bool, MAX_DIMENSIONS> dimensionEmpty() const {
    return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)};
  }

  // Merge another bounding box into this one, dimension by dimension.
  // If the dimension is invalid (NaN) on either side, it becomes invalid in
  // the result; otherwise the result covers both ranges.
  void merge(const BoundingBox& other) {
    for (std::size_t i = 0; i < min.size(); ++i) {
      const bool anyInvalid = std::isnan(min[i]) || std::isnan(max[i]) ||
                              std::isnan(other.min[i]) || std::isnan(other.max[i]);
      if (anyInvalid) {
        min[i] = std::numeric_limits<double>::quiet_NaN();
        max[i] = std::numeric_limits<double>::quiet_NaN();
        continue;
      }
      min[i] = std::min(min[i], other.min[i]);
      max[i] = std::max(max[i], other.max[i]);
    }
  }

  // Convert the bounding box to a string representation.
  std::string toString() const;

  XYZM min;  // Minimum values for each dimension
  XYZM max;  // Maximum values for each dimension

 private:
  // Widen dimension `slot` so that it includes `value`.
  // A dimension that has been invalidated (NaN bound) is left as is. A NaN
  // `value` does not change the bound either, because the comparisons made by
  // std::min / std::max against NaN are false and the current bound is kept.
  void extendDimension(std::size_t slot, double value) {
    if (std::isnan(min[slot]) || std::isnan(max[slot])) {
      return;
    }
    min[slot] = std::min(min[slot], value);
    max[slot] = std::max(max[slot], value);
  }

  // Internal update function for XY, XYZ, or XYZM coordinates, whose
  // components map one-to-one onto the leading slots.
  template <typename Coord>
  void updateInternal(const Coord& coord) {
    for (std::size_t i = 0; i < coord.size(); ++i) {
      extendDimension(i, coord[i]);
    }
  }
};

// Equality: both the min and the max arrays must match element by element.
// The comparison is plain floating-point ==, so a box holding NaN (for
// example after invalidate()) never compares equal, not even to itself.
inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {
  if (!(lhs.min == rhs.min)) {
    return false;
  }
  return lhs.max == rhs.max;
}
// Inequality: the exact negation of operator== for BoundingBox.
inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
  const bool equal = (lhs == rhs);
  return !equal;
}
// Stream insertion: writes obj.toString() to `os` and returns the stream so
// that insertions can be chained.
inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {
  return os << obj.toString();
}

} // namespace orc::geospatial

#endif // ORC_GEOSPATIAL_HH
30 changes: 28 additions & 2 deletions c++/include/orc/Statistics.hh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH

#include "orc/Geospatial.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
#include "orc/orc-config.hh"

#include <sstream>

namespace orc {

/**
Expand Down Expand Up @@ -367,6 +366,33 @@ namespace orc {
virtual int32_t getMaximumNanos() const = 0;
};

/**
* Statistics for Geometry and Geography columns.
*
* Abstract interface: every accessor below is pure virtual, so the behavior
* is supplied by an implementing class.
*/
class GeospatialColumnStatistics : public ColumnStatistics {
public:
virtual ~GeospatialColumnStatistics();

/**
* Get the bounding box.
* @return const reference to the bounding box of these statistics
*/
virtual const geospatial::BoundingBox& getBoundingBox() const = 0;

/**
* Get the geospatial types.
* @return a sorted vector of unique geometry type IDs
*/
virtual std::vector<int32_t> getGeospatialTypes() const = 0;

/**
* Update the statistics with a new value.
* NOTE(review): the value is presumably a WKB-encoded geometry and the
* length presumably a byte count -- confirm against the implementation.
* @param value pointer to the new value
* @param length length of the value
*/
virtual void update(const char* value, size_t length) = 0;
};

class Statistics {
public:
virtual ~Statistics();
Expand Down
24 changes: 23 additions & 1 deletion c++/include/orc/Type.hh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@

namespace orc {

// Geospatial helpers that are part of the type description.
namespace geospatial {

  // Edge interpolation algorithm of a geography type.
  // Kept as a plain (unscoped) enum: the enumerators are referred to as
  // geospatial::SPHERICAL etc. elsewhere in this header.
  enum EdgeInterpolationAlgorithm {
    SPHERICAL = 0,
    VINCENTY = 1,
    THOMAS = 2,
    ANDOYER = 3,
    KARNEY = 4
  };

  // Conversions between an algorithm and its string form. Only declared here;
  // the accepted spellings and the handling of unknown input are determined by
  // the definitions, which are not part of this header.
  EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo);
  std::string AlgoToString(EdgeInterpolationAlgorithm algo);

}  // namespace geospatial

enum TypeKind {
BOOLEAN = 0,
BYTE = 1,
Expand All @@ -44,7 +56,9 @@ namespace orc {
DATE = 15,
VARCHAR = 16,
CHAR = 17,
TIMESTAMP_INSTANT = 18
TIMESTAMP_INSTANT = 18,
GEOMETRY = 19,
GEOGRAPHY = 20
};

class Type {
Expand All @@ -59,6 +73,10 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
// for geospatial types only
virtual const std::string& getCrs() const = 0;
// for geography type only
virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0;
virtual Type& setAttribute(const std::string& key, const std::string& value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
Expand Down Expand Up @@ -115,6 +133,10 @@ namespace orc {
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value);
std::unique_ptr<Type> createUnionType();
// Creates a geometry type. `crs` is the coordinate reference system and
// defaults to "OGC:CRS84".
// NOTE(review): presumably the result has kind GEOMETRY and reports `crs`
// through Type::getCrs() -- confirm against the definition.
std::unique_ptr<Type> createGeometryType(const std::string& crs = "OGC:CRS84");
// Creates a geography type. `crs` is the coordinate reference system
// (default "OGC:CRS84") and `algo` the edge interpolation algorithm
// (default geospatial::SPHERICAL).
// NOTE(review): presumably the result has kind GEOGRAPHY and reports these
// through Type::getCrs() / Type::getAlgorithm() -- confirm against the
// definition.
std::unique_ptr<Type> createGeographyType(
const std::string& crs = "OGC:CRS84",
geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL);

} // namespace orc
#endif
1 change: 1 addition & 0 deletions c++/include/orc/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ install_headers(
'ColumnPrinter.hh',
'Common.hh',
'Exceptions.hh',
'Geospatial.hh',
'Int128.hh',
'MemoryPool.hh',
'OrcFile.hh',
Expand Down
1 change: 1 addition & 0 deletions c++/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ set(SOURCE_FILES
ConvertColumnReader.cc
CpuInfoUtil.cc
Exceptions.cc
Geospatial.cc
Int128.cc
LzoDecompressor.cc
MemoryPool.cc
Expand Down
2 changes: 2 additions & 0 deletions c++/src/ColumnPrinter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,8 @@ namespace orc {
break;

case BINARY:
case GEOMETRY:
case GEOGRAPHY:
result = std::make_unique<BinaryColumnPrinter>(buffer, param);
break;

Expand Down
2 changes: 2 additions & 0 deletions c++/src/ColumnReader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,8 @@ namespace orc {
case CHAR:
case STRING:
case VARCHAR:
case GEOMETRY:
case GEOGRAPHY:
switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
case proto::ColumnEncoding_Kind_DICTIONARY_V2:
Expand Down
Loading
Loading