From 891e1dc8dc4783a6f3cda586b9baa40cdc1851d8 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 17:59:55 +0800 Subject: [PATCH 01/16] [C++] Support and types --- c++/include/orc/Geospatial.hh | 255 +++++++++++++++++++++++ c++/include/orc/Statistics.hh | 28 +++ c++/include/orc/Type.hh | 22 +- c++/src/CMakeLists.txt | 1 + c++/src/ColumnPrinter.cc | 2 + c++/src/ColumnReader.cc | 2 + c++/src/ColumnWriter.cc | 61 ++++++ c++/src/Geospatial.cc | 287 ++++++++++++++++++++++++++ c++/src/Reader.cc | 2 + c++/src/SchemaEvolution.cc | 7 +- c++/src/Statistics.cc | 47 +++++ c++/src/Statistics.hh | 123 +++++++++++ c++/src/TypeImpl.cc | 140 +++++++++++++ c++/src/TypeImpl.hh | 27 +++ c++/src/Writer.cc | 35 ++++ c++/test/CMakeLists.txt | 2 + c++/test/TestColumnStatistics.cc | 340 +++++++++++++++++++++++++++++++ c++/test/TestStatistics.cc | 236 +++++++++++++++++++++ c++/test/TestUtil.cc | 64 ++++++ c++/test/TestUtil.hh | 41 ++++ c++/test/TestWriter.cc | 138 ++++++++++++- tools/src/CSVFileImport.cc | 2 + 22 files changed, 1859 insertions(+), 3 deletions(-) create mode 100644 c++/include/orc/Geospatial.hh create mode 100644 c++/src/Geospatial.cc create mode 100644 c++/test/TestStatistics.cc create mode 100644 c++/test/TestUtil.cc create mode 100644 c++/test/TestUtil.hh diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh new file mode 100644 index 0000000000..f4661049c8 --- /dev/null +++ b/c++/include/orc/Geospatial.hh @@ -0,0 +1,255 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include + +namespace orc::geospatial { + +/// \brief Infinity, used to define bounds of empty bounding boxes +constexpr double kInf = std::numeric_limits::infinity(); + +/// \brief The maximum number of dimensions represented by a geospatial type +/// (i.e., X, Y, Z, and M) +inline constexpr int kMaxDimensions = 4; + +/// \brief Valid combinations of dimensions allowed by ISO well-known binary +/// +/// These values correspond to the 0, 1000, 2000, 3000 component of the WKB integer +/// geometry type (i.e., the value of geometry_type // 1000). +enum class Dimensions { + kXY = 0, + kXYZ = 1, + kXYM = 2, + kXYZM = 3, + kValueMin = 0, + kValueMax = 3 +}; + +/// \brief The supported set of geometry types allowed by ISO well-known binary +/// +/// These values correspond to the 1, 2, ..., 7 component of the WKB integer +/// geometry type (i.e., the value of geometry_type % 1000). 
+enum class GeometryType { + kPoint = 1, + kLinestring = 2, + kPolygon = 3, + kMultiPoint = 4, + kMultiLinestring = 5, + kMultiPolygon = 6, + kGeometryCollection = 7, + kValueMin = 1, + kValueMax = 7 +}; + +struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox() : min{kInf, kInf, kInf, kInf}, max{-kInf, -kInf, -kInf, -kInf} {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + /// \brief Update the X and Y bounds to ensure these bounds contain coord + void UpdateXY(const XY& coord) { + UpdateInternal(coord); + } + + /// \brief Update the X, Y, and Z bounds to ensure these bounds contain coord + void UpdateXYZ(const XYZ& coord) { + UpdateInternal(coord); + } + + /// \brief Update the X, Y, and M bounds to ensure these bounds contain coord + void UpdateXYM(const XYM& coord) { + std::array dimensions = {0, 1, 3}; + for (int i = 0; i < 3; i++) { + auto dimension = dimensions[i]; + if ((std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0)) { + min[dimension] = std::min(min[dimension], coord[i]); + max[dimension] = std::max(max[dimension], coord[i]); + } + } + } + + /// \brief Update the X, Y, Z, and M bounds to ensure these bounds contain coord + void UpdateXYZM(const XYZM& coord) { + UpdateInternal(coord); + } + + /// \brief Reset these bounds to an empty state such that they contain no coordinates + void Reset() { + for (int i = 0; i < kMaxDimensions; i++) { + min[i] = kInf; + max[i] = -kInf; + } + } + + void Invalidate() { + for (int i = 0; i < kMaxDimensions; i++) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } + } + + bool BoundEmpty(int dimension) const { + return std::isinf(min[dimension] - max[dimension]) != 0; + } + + bool BoundValid(int dimension) const { + return 
(std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0); + } + + const XYZM& LowerBound() const { + return min; + } + + const XYZM& UpperBound() const { + return max; + } + + std::array DimensionValid() const { + return {BoundValid(0), BoundValid(1), BoundValid(2), BoundValid(3)}; + } + + std::array DimensionEmpty() const { + return {BoundEmpty(0), BoundEmpty(1), BoundEmpty(2), BoundEmpty(3)}; + } + + /// \brief Update these bounds such they also contain other + void Merge(const BoundingBox& other) { + for (int i = 0; i < kMaxDimensions; i++) { + if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || + std::isnan(other.max[i])) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } else { + min[i] = std::min(min[i], other.min[i]); + max[i] = std::max(max[i], other.max[i]); + } + } + } + + std::string ToString() const; + + XYZM min; + XYZM max; + + private: + // This works for XY, XYZ, and XYZM + template + void UpdateInternal(Coord coord) { + for (size_t i = 0; i < coord.size(); i++) { + if (!std::isnan(min[i]) && !std::isnan(max[i])) { + min[i] = std::min(min[i], coord[i]); + max[i] = std::max(max[i], coord[i]); + } + } + } +}; + +inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { + return lhs.min == rhs.min && lhs.max == rhs.max; +} + +inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { + return !(lhs == rhs); +} + +inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { + os << obj.ToString(); + return os; +} + + +class WKBBuffer; + +/// \brief Accumulate a BoundingBox and geometry types based on zero or more well-known +/// binary blobs +/// +/// Note that this class is NOT appropriate for bounding a GEOGRAPHY, +/// whose bounds are not a function purely of the vertices. Geography bounding +/// is not yet implemented. 
+class WKBGeometryBounder { + public: + /// \brief Accumulate the bounds of a serialized well-known binary geometry + /// + /// Throws ParquetException for any parse errors encountered. Bounds for + /// any encountered coordinates are accumulated and the geometry type of + /// the geometry is added to the internal geometry type list. + void MergeGeometry(std::string_view bytesWkb); + + void MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + + /// \brief Accumulate the bounds of a previously-calculated BoundingBox + void MergeBox(const BoundingBox& box) { box_.Merge(box); } + + /// \brief Accumulate a previously-calculated list of geometry types + void MergeGeometryTypes(const std::vector geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } + + /// \brief Accumulate the bounds of a previously-calculated BoundingBox + void Merge(const WKBGeometryBounder& other) { + if (!IsValid() || !other.IsValid()) { + Invalidate(); + return; + } + box_.Merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + } + + /// \brief Retrieve the accumulated bounds + const BoundingBox& Bounds() const { return box_; } + + /// \brief Retrieve the accumulated geometry types + std::vector GeometryTypes() const; + + /// \brief Reset the internal bounds and geometry types list to an empty state + void Reset() { + isValid_ = true; + box_.Reset(); + geospatialTypes_.clear(); + } + + bool IsValid() const { return isValid_; } + + void Invalidate() { + isValid_ = false; + box_.Invalidate(); + geospatialTypes_.clear(); + } + + + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; + + void MergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + + void MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); +}; + +} // namespace orc::geospatial \ No newline at end of file diff --git a/c++/include/orc/Statistics.hh 
b/c++/include/orc/Statistics.hh index 4ba8c35f7d..11a553b9b3 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -22,6 +22,7 @@ #include "orc/Type.hh" #include "orc/Vector.hh" #include "orc/orc-config.hh" +#include "orc/Geospatial.hh" #include @@ -367,6 +368,33 @@ namespace orc { virtual int32_t getMaximumNanos() const = 0; }; + /** + * Statistics for Geometry and Geography + */ + class GeospatialColumnStatistics : public ColumnStatistics { + public: + virtual ~GeospatialColumnStatistics(); + + /** + * get bounding box + * @return bounding box + */ + virtual const geospatial::BoundingBox& getBoundingBox() const = 0; + + /** + * get geospatial types + * @return geospatial types + */ + virtual std::vector getGeospatialTypes() const = 0; + + /** + * update stats by a new value + * @param value new value to update + * @param length length of the value + */ + virtual void update(const char* value, size_t length) = 0; + }; + class Statistics { public: virtual ~Statistics(); diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 82e0e3cc86..91c714f713 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -25,6 +25,19 @@ namespace orc { + namespace geospatial { + enum EdgeInterpolationAlgorithm { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4 + }; + using EIAlgo = EdgeInterpolationAlgorithm; + std::string AlgotoString(EIAlgo algo); + EIAlgo AlgoFromString(const std::string& algo); + } // namespace geospatial + enum TypeKind { BOOLEAN = 0, BYTE = 1, @@ -44,7 +57,9 @@ namespace orc { DATE = 15, VARCHAR = 16, CHAR = 17, - TIMESTAMP_INSTANT = 18 + TIMESTAMP_INSTANT = 18, + GEOMETRY = 19, + GEOGRAPHY = 20 }; class Type { @@ -59,6 +74,8 @@ namespace orc { virtual uint64_t getMaximumLength() const = 0; virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; + virtual const std::string& getCRS() const = 0; + virtual geospatial::EIAlgo getEIAlgo() const = 0; virtual 
Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; virtual Type& removeAttribute(const std::string& key) = 0; @@ -115,6 +132,9 @@ namespace orc { std::unique_ptr createListType(std::unique_ptr elements); std::unique_ptr createMapType(std::unique_ptr key, std::unique_ptr value); std::unique_ptr createUnionType(); + std::unique_ptr createGeometryType(const std::string& crs = "OGC:CRS84"); + std::unique_ptr createGeographyType(const std::string& crs = "OGC:CRS84", + geospatial::EIAlgo algo = geospatial::SPHERICAL); } // namespace orc #endif diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index e378429f1e..09a0b148ed 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -171,6 +171,7 @@ set(SOURCE_FILES ConvertColumnReader.cc CpuInfoUtil.cc Exceptions.cc + Geospatial.cc Int128.cc LzoDecompressor.cc MemoryPool.cc diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc index 8b16ecbd09..6535c612ce 100644 --- a/c++/src/ColumnPrinter.cc +++ b/c++/src/ColumnPrinter.cc @@ -254,6 +254,8 @@ namespace orc { break; case BINARY: + case GEOMETRY: + case GEOGRAPHY: result = std::make_unique(buffer, param); break; diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index 0fd17de1b8..89ff0e0245 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -1747,6 +1747,8 @@ namespace orc { case CHAR: case STRING: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: switch (static_cast(stripe.getEncoding(type.getColumnId()).kind())) { case proto::ColumnEncoding_Kind_DICTIONARY: case proto::ColumnEncoding_Kind_DICTIONARY_V2: diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index d31b1c65d4..72274647ae 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -17,10 +17,13 @@ */ #include "orc/Int128.hh" +#include "orc/Statistics.hh" +#include "orc/Type.hh" #include "orc/Writer.hh" #include "ByteRLE.hh" #include 
"ColumnWriter.hh" +#include #include "RLE.hh" #include "Statistics.hh" #include "Timezone.hh" @@ -2871,6 +2874,61 @@ namespace orc { } } + class GeospatialColumnWriter : public BinaryColumnWriter { + public: + GeospatialColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) + : BinaryColumnWriter(type, factory, options), + isGeometry_(type.getKind() == TypeKind::GEOMETRY) {} + + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, + uint64_t numValues, const char* incomingMask) override { + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const StringVectorBatch& strBatch = dynamic_cast(rowBatch); + auto data = &strBatch.data[offset]; + auto length = &strBatch.length[offset]; + const char* notNull = strBatch.hasNulls ? strBatch.notNull.data() + offset : nullptr; + + bool hasNull = false; + GeospatialColumnStatisticsImpl* geoStats = nullptr; + if (isGeometry_) { + geoStats = dynamic_cast(colIndexStatistics.get()); + } + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + uint64_t len = static_cast(length[i]); + directDataStream->write(data[i], len); + + // update stats + if (geoStats) { + ++count; + geoStats->update(data[i], len); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + } else if (!hasNull) { + hasNull = true; + if (geoStats) { + geoStats->setHasNull(hasNull); + } + } + } + + directLengthEncoder->add(length, numValues, notNull); + + if (geoStats) { + geoStats->increase(count); + } + } + + private: + bool isGeometry_; + }; + std::unique_ptr buildWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) { switch (static_cast(type.getKind())) { @@ -2941,6 +2999,9 @@ namespace orc { return std::make_unique(type, factory, options); case UNION: return std::make_unique(type, factory, options); + case GEOMETRY: + case GEOGRAPHY: + return std::make_unique(type, factory, options); 
default: throw NotImplementedYet( "Type is not supported yet for creating " diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc new file mode 100644 index 0000000000..e4b46acbda --- /dev/null +++ b/c++/src/Geospatial.cc @@ -0,0 +1,287 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Geospatial.hh" + +#include +#include +#include +#include "orc/Exceptions.hh" + +namespace orc::geospatial { + +template +inline std::enable_if_t, T> SafeLoadAs( + const uint8_t* unaligned) { + std::remove_const_t ret; + std::memcpy(&ret, unaligned, sizeof(T)); + return ret; +} + +template +inline std::enable_if_t && + std::is_trivially_copyable_v && sizeof(T) == sizeof(U), + U> +SafeCopy(T value) { + std::remove_const_t ret; + std::memcpy(&ret, static_cast(&value), sizeof(T)); + return ret; +} + +static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; +} + +#if defined(_MSC_VER) +# include // IWYU pragma: keep +# define ORC_BYTE_SWAP64 _byteswap_uint64 +# define ORC_BYTE_SWAP32 _byteswap_ulong +#else +# define ORC_BYTE_SWAP64 __builtin_bswap64 +# define ORC_BYTE_SWAP32 __builtin_bswap32 +#endif + +// Swap the byte order (i.e. 
endianness) +static inline uint32_t ByteSwap(uint32_t value) { + return static_cast(ORC_BYTE_SWAP32(value)); +} +static inline double ByteSwap(double value) { + const uint64_t swapped = ORC_BYTE_SWAP64(SafeCopy(value)); + return SafeCopy(swapped); +} + +std::string BoundingBox::ToString() const { + std::stringstream ss; + ss << "BoundingBox" << std::endl; + ss << " x: [" << min[0] << ", " << max[0] << "]" << std::endl; + ss << " y: [" << min[1] << ", " << max[1] << "]" << std::endl; + ss << " z: [" << min[2] << ", " << max[2] << "]" << std::endl; + ss << " m: [" << min[3] << ", " << max[3] << "]" << std::endl; + + return ss.str(); +} + +/// \brief Object to keep track of the low-level consumption of a well-known binary +/// geometry +/// +/// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte +/// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), +/// followed by geometry-specific data. Coordinate sequences are represented by a +/// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates +/// multiplied by the number of dimensions). +class WKBBuffer { + public: + WKBBuffer() : data_(nullptr), size_(0) {} + WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + + uint8_t ReadUInt8() { return ReadChecked(); } + + uint32_t ReadUInt32(bool swap) { + auto value = ReadChecked(); + return swap ? 
ByteSwap(value) : value; + } + + template + void ReadCoords(uint32_t nCoords, bool swap, Visit&& visit) { + size_t total_bytes = nCoords * sizeof(Coord); + if (size_ < total_bytes) { + } + + if (swap) { + Coord coord; + for (uint32_t i = 0; i < nCoords; i++) { + coord = ReadUnchecked(); + for (auto& c : coord) { + c = ByteSwap(c); + } + + std::forward(visit)(coord); + } + } else { + for (uint32_t i = 0; i < nCoords; i++) { + std::forward(visit)(ReadUnchecked()); + } + } + } + + size_t size() const { return size_; } + + private: + const uint8_t* data_; + size_t size_; + + template + T ReadChecked() { + if (size_ < sizeof(T)) { + std::stringstream ss; + ss << "Can't read" << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; + throw ParseError(ss.str()); + } + + return ReadUnchecked(); + } + + template + T ReadUnchecked() { + T out = SafeLoadAs(data_); + data_ += sizeof(T); + size_ -= sizeof(T); + return out; + } +}; + +using GeometryTypeAndDimensions = std::pair; + +namespace { + +std::optional ParseGeometryType(uint32_t wkbGeometryType) { + // The number 1000 can be used because WKB geometry types are constructed + // on purpose such that this relationship is true (e.g., LINESTRING ZM maps + // to 3002). 
+ uint32_t geometryTypeComponent = wkbGeometryType % 1000; + uint32_t dimensionsComponent = wkbGeometryType / 1000; + + auto minGeometryTypeValue = static_cast(GeometryType::kValueMin); + auto maxGeometryTypeValue = static_cast(GeometryType::kValueMax); + auto minDimensionValue = static_cast(Dimensions::kValueMin); + auto maxDimensionValue = static_cast(Dimensions::kValueMax); + + if (geometryTypeComponent < minGeometryTypeValue || + geometryTypeComponent > maxGeometryTypeValue || + dimensionsComponent < minDimensionValue || + dimensionsComponent > maxDimensionValue) { + return std::nullopt; + } + + return std::make_optional(GeometryTypeAndDimensions{static_cast(geometryTypeComponent), + static_cast(dimensionsComponent)}); +} + +} // namespace + +std::vector WKBGeometryBounder::GeometryTypes() const { + std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); + std::sort(out.begin(), out.end()); + return out; +} + +void WKBGeometryBounder::MergeGeometry(std::string_view bytesWkb) { + if (!isValid_) { return; } + MergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); +} + +void WKBGeometryBounder::MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { + if (!isValid_) { return; } + WKBBuffer src{bytesWkb, static_cast(bytesSize)}; + try { + MergeGeometryInternal(&src, /*record_wkb_type=*/true); + } catch (const ParseError&) { + Invalidate(); + return; + } + if (src.size() != 0) { + // "Exepcted zero bytes after consuming WKB + Invalidate(); + } +} + +void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { + uint8_t endian = src->ReadUInt8(); + bool swap = endian != 0x00; + if(isLittleEndian()) { swap = endian != 0x01; } + + uint32_t wkbGeometryType = src->ReadUInt32(swap); + auto geometryTypeAndDimensions = ParseGeometryType(wkbGeometryType); + if (!geometryTypeAndDimensions.has_value()) { + Invalidate(); + return; + } + auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); + + // Keep track of 
geometry types encountered if at the top level + if (recordWkbType) { + geospatialTypes_.insert(static_cast(wkbGeometryType)); + } + + switch (geometry_type) { + case GeometryType::kPoint: + MergeSequence(src, dimensions, 1, swap); + break; + + case GeometryType::kLinestring: { + uint32_t nCoords = src->ReadUInt32(swap); + MergeSequence(src, dimensions, nCoords, swap); + break; + } + case GeometryType::kPolygon: { + uint32_t n_parts = src->ReadUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + uint32_t nCoords = src->ReadUInt32(swap); + MergeSequence(src, dimensions, nCoords, swap); + } + break; + } + + // These are all encoded the same in WKB, even though this encoding would + // allow for parts to be of a different geometry type or different dimensions. + // For the purposes of bounding, this does not cause us problems. We pass + // record_wkb_type = false because we do not want the child geometry to be + // added to the geometry_types list (e.g., for a MultiPoint, we only want + // the code for MultiPoint to be added, not the code for Point). 
+ case GeometryType::kMultiPoint: + case GeometryType::kMultiLinestring: + case GeometryType::kMultiPolygon: + case GeometryType::kGeometryCollection: { + uint32_t n_parts = src->ReadUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + MergeGeometryInternal(src, /*record_wkb_type*/ false); + } + break; + } + } +} + +void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions, + uint32_t nCoords, bool swap) { + switch (dimensions) { + case Dimensions::kXY: + src->ReadCoords( + nCoords, swap, [&](BoundingBox::XY coord) { box_.UpdateXY(coord); }); + break; + case Dimensions::kXYZ: + src->ReadCoords( + nCoords, swap, [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); }); + break; + case Dimensions::kXYM: + src->ReadCoords( + nCoords, swap, [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); }); + break; + case Dimensions::kXYZM: + src->ReadCoords( + nCoords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); }); + break; + default: + Invalidate(); + } +} + +} // namespace orc::geospatial \ No newline at end of file diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 17bf835203..349ae1b407 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -873,6 +873,8 @@ namespace orc { case proto::Type_Kind_CHAR: case proto::Type_Kind_STRING: case proto::Type_Kind_VARCHAR: + case proto::Type_Kind_GEOMETRY: + case proto::Type_Kind_GEOGRAPHY: return 4; default: return 0; diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc index 7cf3b5c512..442c43c228 100644 --- a/c++/src/SchemaEvolution.cc +++ b/c++/src/SchemaEvolution.cc @@ -18,6 +18,7 @@ #include "SchemaEvolution.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" namespace orc { @@ -113,7 +114,9 @@ namespace orc { case TIMESTAMP: case TIMESTAMP_INSTANT: case DATE: - case BINARY: { + case BINARY: + case GEOMETRY: + case GEOGRAPHY: { // Not support break; } @@ -235,6 +238,8 @@ namespace orc { case FLOAT: case DOUBLE: case BINARY: + case GEOMETRY: + case GEOGRAPHY: 
case TIMESTAMP: case LIST: case MAP: diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index c1a23cad16..0f5e0cfa2b 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -44,6 +44,8 @@ namespace orc { return new DateColumnStatisticsImpl(s, statContext); } else if (s.has_binary_statistics()) { return new BinaryColumnStatisticsImpl(s, statContext); + } else if (s.has_geospatial_statistics()) { + return new GeospatialColumnStatisticsImpl(s); } else { return new ColumnStatisticsImpl(s); } @@ -148,6 +150,10 @@ namespace orc { // PASS } + GeospatialColumnStatistics::~GeospatialColumnStatistics() { + // PASS + } + ColumnStatisticsImpl::~ColumnStatisticsImpl() { // PASS } @@ -188,6 +194,10 @@ namespace orc { // PASS } + GeospatialColumnStatisticsImpl::~GeospatialColumnStatisticsImpl() { + // PASS + } + ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); @@ -391,6 +401,40 @@ namespace orc { } } + GeospatialColumnStatisticsImpl::GeospatialColumnStatisticsImpl( + const proto::ColumnStatistics& pb) { + reset(); + if (!pb.has_geospatial_statistics()) { + bounder_.Invalidate(); + } else { + const proto::GeospatialStatistics& stats = pb.geospatial_statistics(); + geospatial::BoundingBox::XYZM min; + geospatial::BoundingBox::XYZM max; + for (int i = 0; i < geospatial::kMaxDimensions; i++) { + min[i] = max[i] = std::numeric_limits::quiet_NaN(); + } + if (stats.has_bbox()) { + const auto& protoBBox = stats.bbox(); + min[0] = protoBBox.xmin(); + min[1] = protoBBox.ymin(); + max[0] = protoBBox.xmax(); + max[1] = protoBBox.ymax(); + if (protoBBox.has_zmin() && protoBBox.has_zmax()) { + min[2] = protoBBox.zmin(); + max[2] = protoBBox.zmax(); + } + if (protoBBox.has_mmin() && protoBBox.has_mmax()) { + min[3] = protoBBox.mmin(); + max[3] = protoBBox.mmax(); + } + } + bounder_.MergeBox(geospatial::BoundingBox(min, max)); + 
std::vector types = {stats.geospatial_types().begin(), + stats.geospatial_types().end()}; + bounder_.MergeGeometryTypes(types); + } + } + std::unique_ptr createColumnStatistics(const Type& type) { switch (static_cast(type.getKind())) { case BOOLEAN: @@ -422,6 +466,9 @@ namespace orc { return std::make_unique(); case DECIMAL: return std::make_unique(); + case GEOGRAPHY: + case GEOMETRY: + return std::make_unique(); default: throw NotImplementedYet("Not supported type: " + type.toString()); } diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index b7ed5d1e56..0d116e54a3 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1683,6 +1683,129 @@ namespace orc { } }; + class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics, + public MutableColumnStatistics { + private: + geospatial::WKBGeometryBounder bounder_; + InternalCharStatistics stats_; + + public: + GeospatialColumnStatisticsImpl() { + reset(); + } + explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~GeospatialColumnStatisticsImpl(); + + uint64_t getNumberOfValues() const override { + return stats_.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + stats_.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); + } + + bool hasNull() const override { + return stats_.hasNull(); + } + + void setHasNull(bool hasNull) override { + stats_.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + + const GeospatialColumnStatisticsImpl& geoStats = + dynamic_cast(other); + stats_.merge(geoStats.stats_); + bounder_.Merge(geoStats.bounder_); + } + + void reset() override { + stats_.reset(); + bounder_.Reset(); + } + + void update(const char* value, size_t length) override { + bounder_.MergeGeometry(std::string_view(value, length)); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override 
{ + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); + + proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics(); + const auto& bbox = bounder_.Bounds(); + if (bbox.BoundValid(0) && bbox.BoundValid(1) && !bbox.BoundEmpty(0) && !bbox.BoundEmpty(1)) { + geoStats->mutable_bbox()->set_xmin(bbox.min[0]); + geoStats->mutable_bbox()->set_xmax(bbox.max[0]); + geoStats->mutable_bbox()->set_ymin(bbox.min[1]); + geoStats->mutable_bbox()->set_ymax(bbox.max[1]); + if (bbox.BoundValid(2) && !bbox.BoundEmpty(2)) { + geoStats->mutable_bbox()->set_zmin(bbox.min[2]); + geoStats->mutable_bbox()->set_zmax(bbox.max[2]); + } + if (bbox.BoundValid(3) && !bbox.BoundEmpty(3)) { + geoStats->mutable_bbox()->set_mmin(bbox.min[3]); + geoStats->mutable_bbox()->set_mmax(bbox.max[3]); + } + } + for (auto type : bounder_.GeometryTypes()) { + geoStats->add_geospatial_types(type); + } + } + + std::string toString() const override { + if (!bounder_.IsValid()) { + return " invalid"; + } + + std::stringstream ss; + ss << ""; + + std::string dim_label("xyzm"); + const auto& bbox = bounder_.Bounds(); + auto dim_valid = bbox.DimensionValid(); + auto dim_empty = bbox.DimensionEmpty(); + auto lower = bbox.LowerBound(); + auto upper = bbox.UpperBound(); + + for (int i = 0; i < 4; i++) { + ss << " " << dim_label[i] << ": "; + if (!dim_valid[i]) { + ss << "invalid"; + } else if (dim_empty[i]) { + ss << "empty"; + } else { + ss << "[" << lower[i] << ", " << upper[i] << "]"; + } + } + + std::vector maybe_geometry_types = bounder_.GeometryTypes(); + ss << " geometry_types: ["; + std::string sep(""); + for (int32_t geometry_type : maybe_geometry_types) { + ss << sep << geometry_type; + sep = ", "; + } + ss << "]"; + + return ss.str(); + } + + const geospatial::BoundingBox& getBoundingBox() const override { + return bounder_.Bounds(); + } + + std::vector getGeospatialTypes() const override { + return bounder_.GeometryTypes(); + } + + }; + 
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext); diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index cbc7b82796..b78d4665b0 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -19,8 +19,10 @@ #include "TypeImpl.hh" #include "Adaptor.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" #include +#include #include namespace orc { @@ -62,6 +64,35 @@ namespace orc { subtypeCount_ = 0; } + TypeImpl::TypeImpl(TypeKind kind, + const std::string& crs) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = geospatial::EIAlgo::SPHERICAL; +} + +TypeImpl::TypeImpl(TypeKind kind, + const std::string& crs, + geospatial::EIAlgo algo) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = algo; +} + uint64_t TypeImpl::assignIds(uint64_t root) const { columnId_ = static_cast(root); uint64_t current = root + 1; @@ -120,6 +151,14 @@ namespace orc { return scale_; } + const std::string& TypeImpl::getCRS() const { + return crs_; + } + + geospatial::EIAlgo TypeImpl::getEIAlgo() const { + return edgeInterpolationAlgorithm_; + } + Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) { attributes_[key] = value; return *this; @@ -189,6 +228,45 @@ namespace orc { return true; } + namespace geospatial { +std::string AlgotoString(EIAlgo algo) { + switch (algo) { + case EIAlgo::SPHERICAL: + return "speherial"; + case VINCENTY: + return "vincenty"; + case THOMAS: + return "thomas"; + case ANDOYER: + return "andoyer"; + case KARNEY: + return "karney"; + default: + throw InvalidArgument("Unknown algo"); + } +} + +EIAlgo AlgoFromString(const std::string &algo) { + if (algo == "speherial") { + return 
EIAlgo::SPHERICAL; + } + if (algo == "vincenty") { + return VINCENTY; + } + if (algo == "thomas") { + return THOMAS; + } + if (algo == "andoyer") { + return ANDOYER; + } + if (algo == "karney") { + return KARNEY; + } + throw InvalidArgument("Unknown algo: " + algo); +} + +} // namespace geospatial + std::string TypeImpl::toString() const { switch (static_cast(kind_)) { case BOOLEAN: @@ -271,6 +349,17 @@ namespace orc { result << "char(" << maxLength_ << ")"; return result.str(); } + case GEOMETRY: { + std::stringstream result; + result << "geometry(" << crs_ << ")"; + return result.str(); + } + case GEOGRAPHY: { + std::stringstream result; + result << "geography(" << crs_ << "," << geospatial::AlgotoString(edgeInterpolationAlgorithm_) + << ")"; + return result.str(); + } default: throw NotImplementedYet("Unknown type"); } @@ -322,6 +411,8 @@ namespace orc { case BINARY: case CHAR: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: return encoded ? std::make_unique(capacity, memoryPool) : std::make_unique(capacity, memoryPool); @@ -419,6 +510,14 @@ namespace orc { return std::make_unique(UNION); } + std::unique_ptr createGeometryType(const std::string& crs) { + return std::make_unique(GEOMETRY, crs); + } + + std::unique_ptr createGeographyType(const std::string& crs, geospatial::EIAlgo algo) { + return std::make_unique(GEOGRAPHY, crs, algo); + } + std::string printProtobufMessage(const google::protobuf::Message& message); std::unique_ptr convertType(const proto::Type& type, const proto::Footer& footer) { std::unique_ptr ret; @@ -443,6 +542,16 @@ namespace orc { ret = std::make_unique(static_cast(type.kind()), type.maximum_length()); break; + case proto::Type_Kind_GEOMETRY: + ret = std::make_unique(static_cast(type.kind()), type.crs()); + break; + + case proto::Type_Kind_GEOGRAPHY: + ret = std::make_unique(static_cast(type.kind()), + type.crs(), + static_cast(type.algorithm())); + break; + case proto::Type_Kind_DECIMAL: ret = std::make_unique(DECIMAL, type.precision(), 
type.scale()); break; @@ -523,6 +632,13 @@ namespace orc { case CHAR: result = std::make_unique(fileType->getKind(), fileType->getMaximumLength()); break; + case GEOMETRY: + result = std::make_unique(fileType->getKind(), fileType->getCRS()); + break; + case GEOGRAPHY: + result = std::make_unique(fileType->getKind(), fileType->getCRS(), + fileType->getEIAlgo()); + break; case LIST: result = std::make_unique(fileType->getKind()); @@ -710,6 +826,22 @@ namespace orc { return std::make_unique(DECIMAL, precision, scale); } + std::unique_ptr TypeImpl::parseGeographyType(const std::string& input, size_t start, + size_t end) { /* parses "(crs,algorithm": input[start] must be '(' and the text before `end` is split at the first ','; throws std::logic_error on malformed input */ + if (input[start] != '(') { + throw std::logic_error("Missing ( after geography."); + } + size_t pos = start + 1; + size_t sep = input.find(',', pos); + if (sep + 1 >= end || sep == std::string::npos) { /* also rejects a ',' found at or beyond `end`, and an empty algorithm */ + throw std::logic_error("Geography type must specify CRS and algorithm."); + } + std::string crs = input.substr(pos, sep - pos); + std::string algoStr = input.substr(sep + 1, end - sep - 1); + geospatial::EdgeInterpolationAlgorithm algo = geospatial::AlgoFromString(algoStr); + return std::make_unique(GEOGRAPHY, crs, algo); + } + void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) { if (input[pos] == '<' || input[pos] == '(') { std::ostringstream oss; @@ -780,6 +912,14 @@ namespace orc { uint64_t maxLength = static_cast(atoi(input.substr(start + 1, end - start + 1).c_str())); return std::make_unique(CHAR, maxLength); + } else if (category == "geometry") { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geometry."); + } + std::string crs = input.substr(start + 1, end - start - 1); /* CRS is the text in [start + 1, end), the same convention parseGeographyType uses; NOTE(review): assumes `end` indexes the closing ')' - confirm against parseType */ + return std::make_unique(GEOMETRY, crs); + } else if (category == "geography") { + return parseGeographyType(input, start, end); } else { throw std::logic_error("Unknown type " + category); } diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 647d5a5d2c..5d73c1f9fb 100644 --- a/c++/src/TypeImpl.hh +++
b/c++/src/TypeImpl.hh @@ -24,6 +24,7 @@ #include "Adaptor.hh" #include "wrap/orc-proto-wrapper.hh" +#include #include namespace orc { @@ -41,6 +42,9 @@ namespace orc { uint64_t precision_; uint64_t scale_; std::map attributes_; + std::string crs_; + geospatial::EdgeInterpolationAlgorithm edgeInterpolationAlgorithm_ = + geospatial::EdgeInterpolationAlgorithm::SPHERICAL; public: /** @@ -58,6 +62,16 @@ namespace orc { */ TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale); + /** + * Create a geometry type with the given CRS; the edge interpolation algorithm is set to SPHERICAL. + */ + TypeImpl(TypeKind kind, const std::string& crs); + + /** + * Create a geography type with the given CRS and edge interpolation algorithm. + */ + TypeImpl(TypeKind kind, const std::string& crs, geospatial::EIAlgo algo); + uint64_t getColumnId() const override; uint64_t getMaximumColumnId() const override; @@ -76,6 +90,10 @@ namespace orc { uint64_t getScale() const override; + const std::string& getCRS() const override; + + geospatial::EIAlgo getEIAlgo() const override; + Type& setAttribute(const std::string& key, const std::string& value) override; bool hasAttributeKey(const std::string& key) const override; @@ -176,6 +194,15 @@ namespace orc { static std::unique_ptr parseDecimalType(const std::string& input, size_t start, size_t end); + + /** + * Parse geography type from string + * @param input the input string of a geography type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr parseGeographyType(const std::string& input, size_t start, + size_t end); /** * Parse type for a category * @param category type name diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index 775e6d2452..ce5367adfb 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -24,6 +24,7 @@ #include "Utils.hh" #include +#include namespace orc { @@ -702,6 +703,40 @@ namespace orc { protoType.set_kind(proto::Type_Kind_CHAR); break; } + case GEOMETRY: { + protoType.set_kind(proto::Type_Kind_GEOMETRY); + protoType.set_crs(t.getCRS()); + break; + } + case
GEOGRAPHY: { /* serialize the CRS and the edge interpolation algorithm; each enumerator maps to the proto value of the same name */ + protoType.set_kind(proto::Type_Kind_GEOGRAPHY); + protoType.set_crs(t.getCRS()); + switch (t.getEIAlgo()) { + case geospatial::EIAlgo::SPHERICAL: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL); + break; + } + case orc::geospatial::EIAlgo::VINCENTY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); + break; + } + case orc::geospatial::EIAlgo::THOMAS: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_THOMAS); /* was VINCENTY, which made a THOMAS column read back as VINCENTY */ + break; + } + case orc::geospatial::EIAlgo::ANDOYER: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER); + break; + } + case orc::geospatial::EIAlgo::KARNEY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY); + break; + } + default: + throw std::invalid_argument("Unknown Algorithm."); + } + break; + } default: throw std::logic_error("Unknown type."); } diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index f7328abb32..3261fedde6 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -56,12 +56,14 @@ add_executable (orc-test TestRleEncoder.cc TestRLEV2Util.cc TestSargsApplier.cc + TestStatistics.cc TestSearchArgument.cc TestSchemaEvolution.cc TestStripeIndexStatistics.cc TestTimestampStatistics.cc TestTimezone.cc TestType.cc + TestUtil.cc TestWriter.cc TestCache.cc ${SIMD_TEST_SRCS} diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc index 5cf2d9e41b..4a942c0557 100644 --- a/c++/test/TestColumnStatistics.cc +++ b/c++/test/TestColumnStatistics.cc @@ -20,6 +20,7 @@ #include "orc/OrcFile.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" +#include "TestUtil.hh" #include @@ -531,4 +532,343 @@ namespace orc { collectionStats->merge(*other); EXPECT_FALSE(collectionStats->hasTotalChildren()); } + + TEST(ColumnStatistics, TestGeospatialDefaults) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); +
EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + auto bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::kMaxDimensions; i++) { + EXPECT_TRUE(bbox.BoundEmpty(i)); + EXPECT_TRUE(bbox.BoundValid(i)); + } + EXPECT_EQ(" x: empty y: empty z: empty m: empty geometry_types: []", + geoStats->toString()); + } + + TEST(ColumnStatistics, TestGeospatialUpdate) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + const auto& bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::kMaxDimensions; i++) { + EXPECT_TRUE(bbox.BoundEmpty(i)); + EXPECT_TRUE(bbox.BoundValid(i)); + } + EXPECT_EQ(geoStats->getGeospatialTypes().size(), 0); + + geospatial::BoundingBox::XYZM expectedMin; + geospatial::BoundingBox::XYZM expectedMax; + std::array expectedEmpty; + std::array expectedValid; + std::vector expectedTypes; + for (int i = 0; i < geospatial::kMaxDimensions; i++) { + expectedMin[i] = geospatial::kInf; + expectedMax[i] = -geospatial::kInf; + expectedEmpty[i] = true; + expectedValid[i] = true; + } + + auto Verify = [&]() { + EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().DimensionEmpty()); + EXPECT_EQ(expectedValid, geoStats->getBoundingBox().DimensionValid()); + EXPECT_EQ(expectedTypes, geoStats->getGeospatialTypes()); + for (int i = 0; i < geospatial::kMaxDimensions; i++) { + if (geoStats->getBoundingBox().BoundValid(i)) { + EXPECT_EQ(expectedMin[i], geoStats->getBoundingBox().LowerBound()[i]); + EXPECT_EQ(expectedMax[i], geoStats->getBoundingBox().UpperBound()[i]); + } else { + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().LowerBound()[i])); + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().UpperBound()[i])); + } + } + }; + + // Update a xy point + std::string xy0 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy0.c_str(), xy0.size()); + expectedMin[0] = expectedMax[0] = 10; + expectedMin[1] = expectedMax[1] = 11; + expectedEmpty[0] = expectedEmpty[1] = 
false; + expectedTypes.push_back(1); + Verify(); + + // Update a xyz point. + std::string xyz0 = MakeWKBPoint({11, 12, 13}, true, false); + geoStats->update(xyz0.c_str(), xyz0.size()); + expectedMax[0] = 11; + expectedMax[1] = 12; + expectedMin[2] = expectedMax[2] = 13; + expectedEmpty[2] = false; + expectedTypes.push_back(1001); + Verify(); + + // Update a xym point. + std::string xym0 = MakeWKBPoint({9, 10, 0, 11}, false, true); + geoStats->update(xym0.c_str(), xym0.size()); + expectedMin[0] = 9; + expectedMin[1] = 10; + expectedMin[3] = expectedMax[3] = 11; + expectedEmpty[3] = false; + expectedTypes.push_back(2001); + Verify(); + + // Update a xymz point. + std::string xymz0 = MakeWKBPoint({8, 9, 10, 12}, true, true); + geoStats->update(xymz0.c_str(), xymz0.size()); + expectedMin[0] = 8; + expectedMin[1] = 9; + expectedMin[2] = 10; + expectedMax[3] = 12; + expectedTypes.push_back(3001); + Verify(); + + // Update NaN to every dimension. + std::string xyzm1 = MakeWKBPoint( + {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}, + true, false); + geoStats->update(xyzm1.c_str(), xyzm1.size()); + Verify(); + + // Update a invalid WKB + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + expectedValid[0] = expectedValid[1] = expectedValid[2] = expectedValid[3] = false; + expectedTypes.clear(); + Verify(); + + // Update a xy point again + std::string xy1 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy1.c_str(), xy1.size()); + Verify(); + } + + TEST(ColumnStatistics, TestGeospatialToProto) { + // Test Empty + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + proto::ColumnStatistics pbStats; + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + + // Update a 
xy point + std::string xy = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy.c_str(), xy.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox0 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox0.has_xmin()); + EXPECT_TRUE(bbox0.has_xmax()); + EXPECT_TRUE(bbox0.has_ymin()); + EXPECT_TRUE(bbox0.has_ymax()); + EXPECT_FALSE(bbox0.has_zmin()); + EXPECT_FALSE(bbox0.has_zmax()); + EXPECT_FALSE(bbox0.has_mmin()); + EXPECT_FALSE(bbox0.has_mmax()); + EXPECT_EQ(10, bbox0.xmin()); + EXPECT_EQ(10, bbox0.xmax()); + EXPECT_EQ(11, bbox0.ymin()); + EXPECT_EQ(11, bbox0.ymax()); + + // Update a xyzm point. + std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + geoStats->update(xyzm.c_str(), xyzm.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(2, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_EQ(3001, pbStats.geospatial_statistics().geospatial_types(1)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox1 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox1.has_xmin()); + EXPECT_TRUE(bbox1.has_xmax()); + EXPECT_TRUE(bbox1.has_ymin()); + EXPECT_TRUE(bbox1.has_ymax()); + EXPECT_TRUE(bbox1.has_zmin()); + EXPECT_TRUE(bbox1.has_zmax()); + EXPECT_TRUE(bbox1.has_mmin()); + EXPECT_TRUE(bbox1.has_mmax()); + EXPECT_EQ(-10, bbox1.xmin()); + EXPECT_EQ(10, bbox1.xmax()); + EXPECT_EQ(-11, bbox1.ymin()); + EXPECT_EQ(11, bbox1.ymax()); + EXPECT_EQ(-12, bbox1.zmin()); + EXPECT_EQ(-12, bbox1.zmax()); + EXPECT_EQ(-13, bbox1.mmin()); + EXPECT_EQ(-13, bbox1.mmax()); + + // Update a invalid 
point + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + } + + TEST(ColumnStatistics, TestGeospatialMerge) { + std::unique_ptr invalidStats(new GeospatialColumnStatisticsImpl()); + invalidStats->update("0", 0); + + std::unique_ptr emptyStats(new GeospatialColumnStatisticsImpl()); + + std::unique_ptr xyStats(new GeospatialColumnStatisticsImpl()); + std::string xy = MakeWKBPoint({10, 11}, false, false); + xyStats->update(xy.c_str(), xy.size()); + + std::unique_ptr xyzStats(new GeospatialColumnStatisticsImpl()); + std::string xyz = MakeWKBPoint({12, 13, 14}, true, false); + xyzStats->update(xyz.c_str(), xyz.size()); + + std::unique_ptr xyzmStats(new GeospatialColumnStatisticsImpl()); + std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + xyzmStats->update(xyzm.c_str(), xyzm.size()); + + // invalid merge invalid + invalidStats->merge(*invalidStats); + std::array expectedValid = {false, false, false, false}; + EXPECT_EQ(invalidStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(invalidStats->getGeospatialTypes().size(), 0); + + // Empty merge empty + emptyStats->merge(*emptyStats); + expectedValid = {true, true, true, true}; + std::array expectedEmpty = {true, true, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + + // Empty merge xy + emptyStats->merge(*xyStats); + expectedEmpty = {false, false, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, 
emptyStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(10, emptyStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(11, emptyStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + + // Empty merge xyz + emptyStats->merge(*xyzStats); + expectedEmpty = {false, false, false, true}; + EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(14, emptyStats->getBoundingBox().LowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().UpperBound()[2]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + + // Empty merge xyzm + emptyStats->merge(*xyzmStats); + expectedEmpty = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); + EXPECT_EQ(-10, emptyStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(-11, emptyStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(-12, emptyStats->getBoundingBox().LowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().UpperBound()[2]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().LowerBound()[3]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().UpperBound()[3]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 3); + 
EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(emptyStats->getGeospatialTypes()[2], 3001); + + // Empty merge invalid + emptyStats->merge(*invalidStats); + expectedValid = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + } + + TEST(ColumnStatistics, TestGeospatialFromProto) { + proto::ColumnStatistics pbStats; + // No geostats + + std::unique_ptr emptyStats0( + new GeospatialColumnStatisticsImpl(pbStats)); + std::array expectedValid = {false, false, false, false}; + EXPECT_TRUE(emptyStats0->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats0->getBoundingBox().DimensionValid(), expectedValid); + + // Add empty geostats + pbStats.mutable_geospatial_statistics(); + std::unique_ptr emptyStats1( + new GeospatialColumnStatisticsImpl(pbStats)); + EXPECT_TRUE(emptyStats1->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats1->getBoundingBox().DimensionValid(), expectedValid); + + // Set xy bounds + auto* geoProtoStas = pbStats.mutable_geospatial_statistics(); + geoProtoStas->mutable_bbox()->set_xmin(0); + geoProtoStas->mutable_bbox()->set_xmax(1); + geoProtoStas->mutable_bbox()->set_ymin(0); + geoProtoStas->mutable_bbox()->set_ymax(1); + geoProtoStas->mutable_geospatial_types()->Add(2); + std::unique_ptr xyStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, false, false}; + EXPECT_EQ(xyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(xyStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(0, xyStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(1, xyStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(0, xyStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(1, xyStats->getBoundingBox().UpperBound()[1]); + + // Set xyz bounds + geoProtoStas->mutable_bbox()->set_zmin(0); + 
geoProtoStas->mutable_bbox()->set_zmax(1); + geoProtoStas->mutable_geospatial_types()->Add(1003); + std::unique_ptr xyzStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, false}; + EXPECT_EQ(xyzStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[2]); + EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[2]); + + // Set xyzm bounds + geoProtoStas->mutable_bbox()->set_mmin(0); + geoProtoStas->mutable_bbox()->set_mmax(1); + geoProtoStas->mutable_geospatial_types()->Add(3003); + std::unique_ptr xyzmStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, true}; + EXPECT_EQ(xyzmStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[2], 3003); + EXPECT_EQ(xyzmStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[0]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[0]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[1]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[2]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[2]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[3]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[3]); + } + } // namespace orc diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc new file mode 100644 
index 0000000000..4d90957094 --- /dev/null +++ b/c++/test/TestStatistics.cc @@ -0,0 +1,236 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Statistics.hh" +#include "orc/OrcFile.hh" + +#include "MemoryInputStream.hh" +#include "MemoryOutputStream.hh" +#include "TestUtil.hh" + +#include "wrap/gtest-wrapper.h" + +#include +#include +#include + +namespace orc { + +#define ENSURE_DYNAMIC_CAST_NOT_NULL(PTR) \ + if (PTR == NULL) throw std::logic_error("dynamic_cast returns null"); + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + static std::unique_ptr createWriter( + uint64_t stripeSize, + const Type &type, + MemoryPool* memoryPool, + OutputStream *stream) { + WriterOptions options; + options.setStripeSize(stripeSize); + options.setCompressionBlockSize(256); + options.setMemoryBlockSize(256); + options.setCompression(CompressionKind_ZLIB); + options.setMemoryPool(memoryPool); + options.setRowIndexStride(10); + return createWriter(type, stream, options); + } + + static std::unique_ptr createReader( + MemoryPool* memoryPool, + MemoryOutputStream& memStream) { + + std::unique_ptr inStream( + new MemoryInputStream (memStream.getData(), memStream.getLength())); + ReaderOptions options; + 
options.setMemoryPool(*memoryPool); + return createReader(std::move(inStream), options); + } + + TEST(Statistics, geometryStatsWithNull) { + std::unique_ptr const type(Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); + + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create str values + std::vector wkbs; + std::array mins = {geospatial::kInf, geospatial::kInf, geospatial::kInf, + geospatial::kInf}; + std::array maxs = {-geospatial::kInf, -geospatial::kInf, -geospatial::kInf, + -geospatial::kInf}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = 
std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + structBatch->notNull[batchCount - 1] = '\0'; + strBatch->hasNulls = true; + strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[2], 3001); + std::array expectValid = {true, true, true, true}; + std::array expectEmpty = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().DimensionValid(), expectValid); + EXPECT_EQ(geoFileStats->getBoundingBox().DimensionEmpty(), expectEmpty); + EXPECT_EQ(geoFileStats->getBoundingBox().LowerBound(), mins); + EXPECT_EQ(geoFileStats->getBoundingBox().UpperBound(), maxs); + } + + TEST(Statistics, geographyStatsWithNull) { + std::unique_ptr const type( + Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); 
+ + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create str values + std::vector wkbs; + std::array mins = {geospatial::kInf, geospatial::kInf, geospatial::kInf, + geospatial::kInf}; + std::array maxs = {-geospatial::kInf, -geospatial::kInf, -geospatial::kInf, + -geospatial::kInf}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + 
structBatch->notNull[batchCount - 1] = '\0'; + strBatch->hasNulls = true; + strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 0); + std::array expectValid = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().DimensionValid(), expectValid); + } +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc new file mode 100644 index 0000000000..7ed4808102 --- /dev/null +++ b/c++/test/TestUtil.cc @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TestUtil.hh" +#include + +namespace orc { +uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { + auto wkbGeomType = static_cast(geometryType); + + if (hasZ) { + wkbGeomType += 1000; + } + + if (hasM) { + wkbGeomType += 2000; + } + + return wkbGeomType; +} + +std::string MakeWKBPoint(const std::vector &xyzm, bool hasZ, bool hasM) { + // 1:endianness + 4:type + 8:x + 8:y + int numBytes = kWkbPointXYSize + (hasZ ? sizeof(double) : 0) + (hasM ? sizeof(double) : 0); + std::string wkb(numBytes, 0); + char *ptr = wkb.data(); + + ptr[0] = kWkbNativeEndianness; + uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::kPoint, hasZ, hasM); + std::memcpy(&ptr[1], &geom_type, 4); + std::memcpy(&ptr[5], &xyzm[0], 8); + std::memcpy(&ptr[13], &xyzm[1], 8); + ptr += 21; + + if (hasZ) { + std::memcpy(ptr, &xyzm[2], 8); + ptr += 8; + } + + if (hasM) { + std::memcpy(ptr, &xyzm[3], 8); + ptr += 8; + } + + assert(static_cast(ptr - wkb.data()) == wkb.length()); + return wkb; +} + +} \ No newline at end of file diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh new file mode 100644 index 0000000000..1731048ebe --- /dev/null +++ b/c++/test/TestUtil.hh @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "orc/Geospatial.hh" + +namespace orc { + +/// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, +/// uint32_t geometry type, 2 * double coordinates) +static constexpr int kWkbPointXYSize = 21; + +static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; +} + +static uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00; + +uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM); +std::string MakeWKBPoint(const std::vector &xyzm, bool hasZ, bool hasM); + +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 975462e30c..67f3fa6428 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -16,12 +16,13 @@ * limitations under the License. */ -#include "orc/ColumnPrinter.hh" +#include #include "orc/OrcFile.hh" #include "MemoryInputStream.hh" #include "MemoryOutputStream.hh" #include "Reader.hh" +#include "TestUtil.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" @@ -2400,6 +2401,141 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } + TEST_P(WriterTest, writeGeometryAndGeographyColumn) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* pool = getDefaultPool(); + std::unique_ptr type(Type::buildTypeFromString("struct")); + uint64_t stripeSize = 1024; // 1K + uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; + std::unique_ptr writer = + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); + + EXPECT_EQ("struct", type->toString()); + + uint64_t batchCount = 100, batchSize = 1000; + std::unique_ptr batch = + writer->createRowBatch(batchSize); + StructVectorBatch* structBatch = + dynamic_cast(batch.get()); + StringVectorBatch* geometryBatch = + dynamic_cast(structBatch->fields[0]); + StringVectorBatch* geographyBatch = + dynamic_cast(structBatch->fields[1]); + + char buffer[8000000]; + char* buf = buffer; + + // write 100 * 1000 rows, every 100 rows are in one row group + // every 2 consecutive rows has one null value. + uint64_t rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + structBatch->hasNulls = false; + structBatch->numElements = batchSize; + + geometryBatch->hasNulls = true; + geometryBatch->numElements = batchSize; + geographyBatch->hasNulls = true; + geographyBatch->numElements = batchSize; + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + geometryBatch->notNull[j] = 0; + geographyBatch->notNull[j] = 0; + } else { + geometryBatch->notNull[j] = 1; + geographyBatch->notNull[j] = 1; + + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + strncpy(buf, wkb.c_str(), wkb.size()); + + geometryBatch->data[j] = buf; + geometryBatch->length[j] = static_cast(wkb.size()); + geographyBatch->data[j] = buf; + geographyBatch->length[j] = static_cast(wkb.size()); + + buf += wkb.size(); + } + ++rowCount; + } + + writer->add(*batch); + } + writer->close(); + + std::unique_ptr inStream( + new MemoryInputStream (memStream.getData(), memStream.getLength())); + std::unique_ptr reader = createReader(pool, std::move(inStream)); + EXPECT_EQ(batchCount * batchSize, reader->getNumberOfRows()); + EXPECT_TRUE(reader->getNumberOfStripes() > 1); + + EXPECT_EQ("struct", reader->getType().toString()); + // test sequential reader + std::unique_ptr seqReader = createRowReader(reader.get()); + rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + seqReader->next(*batch); + + 
EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(batchSize, structBatch->numElements); + + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(batchSize, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(batchSize, geographyBatch->numElements); + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + EXPECT_TRUE(geometryBatch->notNull[j] == 0); + EXPECT_TRUE(geographyBatch->notNull[j] == 0); + } else { + EXPECT_TRUE(geometryBatch->notNull[j] != 0); + EXPECT_TRUE(geographyBatch->notNull[j] != 0); + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[j]); + EXPECT_TRUE(strncmp(geometryBatch->data[j], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[j]); + EXPECT_TRUE(strncmp(geographyBatch->data[j], wkb.c_str(), wkb.size()) == 0); + } + ++rowCount; + } + } + EXPECT_FALSE(seqReader->next(*batch)); + + // test seek reader + std::unique_ptr seekReader = createRowReader(reader.get()); + batch = seekReader->createRowBatch(2); + structBatch = dynamic_cast(batch.get()); + geometryBatch = dynamic_cast(structBatch->fields[0]); + geographyBatch = dynamic_cast(structBatch->fields[1]); + + for (uint64_t row = rowCount - 2; row >= 100; row -= 100) { + seekReader->seekToRow(row); + seekReader->next(*batch); + + EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(2, structBatch->numElements); + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(2, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(2, geographyBatch->numElements); + + EXPECT_TRUE(geometryBatch->notNull[0] == 0); + EXPECT_TRUE(geometryBatch->notNull[1] != 0); + EXPECT_TRUE(geographyBatch->notNull[0] == 0); + EXPECT_TRUE(geographyBatch->notNull[1] != 0); + + std::string wkb = MakeWKBPoint({(row + 1) * 1.0, (row + 1) * 1.0}, false, false); + + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[1]); + 
EXPECT_TRUE(strncmp(geometryBatch->data[1], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[1]); + EXPECT_TRUE(strncmp(geographyBatch->data[1], wkb.c_str(), wkb.size()) == 0); + } + } + + std::vector testParams = {{FileVersion::v_0_11(), true}, {FileVersion::v_0_11(), false}, {FileVersion::v_0_12(), false}, diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index ae17b3348a..c99f5b65a4 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -395,6 +395,8 @@ int main(int argc, char* argv[]) { case orc::CHAR: case orc::VARCHAR: case orc::BINARY: + case orc::GEOMETRY: + case orc::GEOGRAPHY: bufferList.emplace_back(*orc::getDefaultPool(), 1 * 1024 * 1024); fillStringValues(data, structBatch->fields[i], numValues, i, bufferList.back()); break; From e5076331a6a56e2bac4de87fab7bdf2d9b300014 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 18:14:57 +0800 Subject: [PATCH 02/16] fix building --- c++/include/orc/Geospatial.hh | 3 +++ c++/src/Geospatial.cc | 2 ++ 2 files changed, 5 insertions(+) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index f4661049c8..47a87c503e 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -21,6 +21,9 @@ #include #include #include +#include +#include +#include namespace orc::geospatial { diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc index e4b46acbda..a271c66600 100644 --- a/c++/src/Geospatial.cc +++ b/c++/src/Geospatial.cc @@ -21,6 +21,8 @@ #include #include #include +#include + #include "orc/Exceptions.hh" namespace orc::geospatial { From 95d6f5eb0eb22af4400b394ed9ca0469265b4662 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 18:22:14 +0800 Subject: [PATCH 03/16] cstdint --- c++/include/orc/Geospatial.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index 47a87c503e..ebaa87648c 100644 --- 
a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -24,6 +24,7 @@ #include #include #include +#include namespace orc::geospatial { From 4bf2ac78a838a95c8705f48a5fdab370d9226dea Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 18:36:53 +0800 Subject: [PATCH 04/16] fix SEH --- c++/test/TestUtil.cc | 1 + c++/test/TestWriter.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc index 7ed4808102..80039405c8 100644 --- a/c++/test/TestUtil.cc +++ b/c++/test/TestUtil.cc @@ -18,6 +18,7 @@ #include "TestUtil.hh" #include +#include namespace orc { uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 67f3fa6428..06b3626331 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -29,6 +29,7 @@ #include #include +#include #include #ifdef __clang__ @@ -2424,8 +2425,8 @@ namespace orc { StringVectorBatch* geographyBatch = dynamic_cast(structBatch->fields[1]); - char buffer[8000000]; - char* buf = buffer; + std::unique_ptr buffer(new char[8000000]); + char* buf = buffer.get(); // write 100 * 1000 rows, every 100 rows are in one row group // every 2 consecutive rows has one null value. 
From 4f0e74242ec7d05c1a096b8c4b3e93f69214af76 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 18:52:09 +0800 Subject: [PATCH 05/16] fix linter --- c++/include/orc/Geospatial.hh | 369 +++++++++++++++---------------- c++/src/ColumnWriter.cc | 9 +- c++/src/Statistics.hh | 92 ++++---- c++/src/TypeImpl.cc | 141 ++++++------ c++/test/TestColumnStatistics.cc | 8 +- c++/test/TestStatistics.cc | 17 +- c++/test/TestUtil.cc | 24 +- c++/test/TestUtil.hh | 20 +- c++/test/TestWriter.cc | 26 +-- 9 files changed, 347 insertions(+), 359 deletions(-) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index ebaa87648c..067d45011b 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -19,241 +19,238 @@ #pragma once #include #include -#include -#include +#include +#include #include #include -#include -#include +#include +#include namespace orc::geospatial { -/// \brief Infinity, used to define bounds of empty bounding boxes -constexpr double kInf = std::numeric_limits::infinity(); - -/// \brief The maximum number of dimensions represented by a geospatial type -/// (i.e., X, Y, Z, and M) -inline constexpr int kMaxDimensions = 4; - -/// \brief Valid combinations of dimensions allowed by ISO well-known binary -/// -/// These values correspond to the 0, 1000, 2000, 3000 component of the WKB integer -/// geometry type (i.e., the value of geometry_type // 1000). -enum class Dimensions { - kXY = 0, - kXYZ = 1, - kXYM = 2, - kXYZM = 3, - kValueMin = 0, - kValueMax = 3 -}; - -/// \brief The supported set of geometry types allowed by ISO well-known binary -/// -/// These values correspond to the 1, 2, ..., 7 component of the WKB integer -/// geometry type (i.e., the value of geometry_type % 1000). 
-enum class GeometryType { - kPoint = 1, - kLinestring = 2, - kPolygon = 3, - kMultiPoint = 4, - kMultiLinestring = 5, - kMultiPolygon = 6, - kGeometryCollection = 7, - kValueMin = 1, - kValueMax = 7 -}; - -struct BoundingBox { - using XY = std::array; - using XYZ = std::array; - using XYM = std::array; - using XYZM = std::array; - - BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} - BoundingBox() : min{kInf, kInf, kInf, kInf}, max{-kInf, -kInf, -kInf, -kInf} {} - BoundingBox(const BoundingBox& other) = default; - BoundingBox& operator=(const BoundingBox&) = default; - - /// \brief Update the X and Y bounds to ensure these bounds contain coord - void UpdateXY(const XY& coord) { - UpdateInternal(coord); - } + /// \brief Infinity, used to define bounds of empty bounding boxes + constexpr double kInf = std::numeric_limits::infinity(); - /// \brief Update the X, Y, and Z bounds to ensure these bounds contain coord - void UpdateXYZ(const XYZ& coord) { - UpdateInternal(coord); - } + /// \brief The maximum number of dimensions represented by a geospatial type + /// (i.e., X, Y, Z, and M) + inline constexpr int kMaxDimensions = 4; + + /// \brief Valid combinations of dimensions allowed by ISO well-known binary + /// + /// These values correspond to the 0, 1000, 2000, 3000 component of the WKB integer + /// geometry type (i.e., the value of geometry_type // 1000). 
+ enum class Dimensions { kXY = 0, kXYZ = 1, kXYM = 2, kXYZM = 3, kValueMin = 0, kValueMax = 3 }; - /// \brief Update the X, Y, and M bounds to ensure these bounds contain coord - void UpdateXYM(const XYM& coord) { - std::array dimensions = {0, 1, 3}; - for (int i = 0; i < 3; i++) { - auto dimension = dimensions[i]; - if ((std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0)) { - min[dimension] = std::min(min[dimension], coord[i]); - max[dimension] = std::max(max[dimension], coord[i]); + /// \brief The supported set of geometry types allowed by ISO well-known binary + /// + /// These values correspond to the 1, 2, ..., 7 component of the WKB integer + /// geometry type (i.e., the value of geometry_type % 1000). + enum class GeometryType { + kPoint = 1, + kLinestring = 2, + kPolygon = 3, + kMultiPoint = 4, + kMultiLinestring = 5, + kMultiPolygon = 6, + kGeometryCollection = 7, + kValueMin = 1, + kValueMax = 7 + }; + + struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox() : min{kInf, kInf, kInf, kInf}, max{-kInf, -kInf, -kInf, -kInf} {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + /// \brief Update the X and Y bounds to ensure these bounds contain coord + void UpdateXY(const XY& coord) { + UpdateInternal(coord); + } + + /// \brief Update the X, Y, and Z bounds to ensure these bounds contain coord + void UpdateXYZ(const XYZ& coord) { + UpdateInternal(coord); + } + + /// \brief Update the X, Y, and M bounds to ensure these bounds contain coord + void UpdateXYM(const XYM& coord) { + std::array dimensions = {0, 1, 3}; + for (int i = 0; i < 3; i++) { + auto dimension = dimensions[i]; + if ((std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0)) { + min[dimension] = std::min(min[dimension], coord[i]); + 
max[dimension] = std::max(max[dimension], coord[i]); + } } } - } - /// \brief Update the X, Y, Z, and M bounds to ensure these bounds contain coord - void UpdateXYZM(const XYZM& coord) { - UpdateInternal(coord); - } + /// \brief Update the X, Y, Z, and M bounds to ensure these bounds contain coord + void UpdateXYZM(const XYZM& coord) { + UpdateInternal(coord); + } - /// \brief Reset these bounds to an empty state such that they contain no coordinates - void Reset() { - for (int i = 0; i < kMaxDimensions; i++) { - min[i] = kInf; - max[i] = -kInf; + /// \brief Reset these bounds to an empty state such that they contain no coordinates + void Reset() { + for (int i = 0; i < kMaxDimensions; i++) { + min[i] = kInf; + max[i] = -kInf; + } } - } - void Invalidate() { - for (int i = 0; i < kMaxDimensions; i++) { - min[i] = std::numeric_limits::quiet_NaN(); - max[i] = std::numeric_limits::quiet_NaN(); + void Invalidate() { + for (int i = 0; i < kMaxDimensions; i++) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } } - } - bool BoundEmpty(int dimension) const { - return std::isinf(min[dimension] - max[dimension]) != 0; - } + bool BoundEmpty(int dimension) const { + return std::isinf(min[dimension] - max[dimension]) != 0; + } - bool BoundValid(int dimension) const { - return (std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0); - } + bool BoundValid(int dimension) const { + return (std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0); + } - const XYZM& LowerBound() const { - return min; - } + const XYZM& LowerBound() const { + return min; + } - const XYZM& UpperBound() const { - return max; - } + const XYZM& UpperBound() const { + return max; + } - std::array DimensionValid() const { - return {BoundValid(0), BoundValid(1), BoundValid(2), BoundValid(3)}; - } + std::array DimensionValid() const { + return {BoundValid(0), BoundValid(1), BoundValid(2), BoundValid(3)}; + } - std::array DimensionEmpty() 
const { - return {BoundEmpty(0), BoundEmpty(1), BoundEmpty(2), BoundEmpty(3)}; - } + std::array DimensionEmpty() const { + return {BoundEmpty(0), BoundEmpty(1), BoundEmpty(2), BoundEmpty(3)}; + } - /// \brief Update these bounds such they also contain other - void Merge(const BoundingBox& other) { - for (int i = 0; i < kMaxDimensions; i++) { - if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || - std::isnan(other.max[i])) { - min[i] = std::numeric_limits::quiet_NaN(); - max[i] = std::numeric_limits::quiet_NaN(); - } else { - min[i] = std::min(min[i], other.min[i]); - max[i] = std::max(max[i], other.max[i]); + /// \brief Update these bounds such they also contain other + void Merge(const BoundingBox& other) { + for (int i = 0; i < kMaxDimensions; i++) { + if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || + std::isnan(other.max[i])) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } else { + min[i] = std::min(min[i], other.min[i]); + max[i] = std::max(max[i], other.max[i]); + } } } - } - std::string ToString() const; + std::string ToString() const; - XYZM min; - XYZM max; + XYZM min; + XYZM max; - private: - // This works for XY, XYZ, and XYZM - template - void UpdateInternal(Coord coord) { - for (size_t i = 0; i < coord.size(); i++) { - if (!std::isnan(min[i]) && !std::isnan(max[i])) { - min[i] = std::min(min[i], coord[i]); - max[i] = std::max(max[i], coord[i]); + private: + // This works for XY, XYZ, and XYZM + template + void UpdateInternal(Coord coord) { + for (size_t i = 0; i < coord.size(); i++) { + if (!std::isnan(min[i]) && !std::isnan(max[i])) { + min[i] = std::min(min[i], coord[i]); + max[i] = std::max(max[i], coord[i]); + } } } - } -}; - -inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { - return lhs.min == rhs.min && lhs.max == rhs.max; -} + }; -inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { - return !(lhs == 
rhs); -} + inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { + return lhs.min == rhs.min && lhs.max == rhs.max; + } -inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { - os << obj.ToString(); - return os; -} + inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { + return !(lhs == rhs); + } + inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { + os << obj.ToString(); + return os; + } -class WKBBuffer; + class WKBBuffer; -/// \brief Accumulate a BoundingBox and geometry types based on zero or more well-known -/// binary blobs -/// -/// Note that this class is NOT appropriate for bounding a GEOGRAPHY, -/// whose bounds are not a function purely of the vertices. Geography bounding -/// is not yet implemented. -class WKBGeometryBounder { - public: - /// \brief Accumulate the bounds of a serialized well-known binary geometry + /// \brief Accumulate a BoundingBox and geometry types based on zero or more well-known + /// binary blobs /// - /// Throws ParquetException for any parse errors encountered. Bounds for - /// any encountered coordinates are accumulated and the geometry type of - /// the geometry is added to the internal geometry type list. - void MergeGeometry(std::string_view bytesWkb); - - void MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + /// Note that this class is NOT appropriate for bounding a GEOGRAPHY, + /// whose bounds are not a function purely of the vertices. Geography bounding + /// is not yet implemented. + class WKBGeometryBounder { + public: + /// \brief Accumulate the bounds of a serialized well-known binary geometry + /// + /// Throws ParquetException for any parse errors encountered. Bounds for + /// any encountered coordinates are accumulated and the geometry type of + /// the geometry is added to the internal geometry type list. 
+ void MergeGeometry(std::string_view bytesWkb); + + void MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); /// \brief Accumulate the bounds of a previously-calculated BoundingBox - void MergeBox(const BoundingBox& box) { box_.Merge(box); } + void MergeBox(const BoundingBox& box) { + box_.Merge(box); + } - /// \brief Accumulate a previously-calculated list of geometry types - void MergeGeometryTypes(const std::vector geospatialTypes) { - geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); - } + /// \brief Accumulate a previously-calculated list of geometry types + void MergeGeometryTypes(const std::vector geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } - /// \brief Accumulate the bounds of a previously-calculated BoundingBox - void Merge(const WKBGeometryBounder& other) { - if (!IsValid() || !other.IsValid()) { - Invalidate(); - return; + /// \brief Accumulate the bounds of a previously-calculated BoundingBox + void Merge(const WKBGeometryBounder& other) { + if (!IsValid() || !other.IsValid()) { + Invalidate(); + return; + } + box_.Merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); } - box_.Merge(other.box_); - geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); - } - /// \brief Retrieve the accumulated bounds - const BoundingBox& Bounds() const { return box_; } + /// \brief Retrieve the accumulated bounds + const BoundingBox& Bounds() const { + return box_; + } - /// \brief Retrieve the accumulated geometry types - std::vector GeometryTypes() const; + /// \brief Retrieve the accumulated geometry types + std::vector GeometryTypes() const; - /// \brief Reset the internal bounds and geometry types list to an empty state - void Reset() { - isValid_ = true; - box_.Reset(); - geospatialTypes_.clear(); - } + /// \brief Reset the internal bounds and geometry types list to an empty state + void Reset() { 
+ isValid_ = true; + box_.Reset(); + geospatialTypes_.clear(); + } - bool IsValid() const { return isValid_; } + bool IsValid() const { + return isValid_; + } - void Invalidate() { + void Invalidate() { isValid_ = false; box_.Invalidate(); geospatialTypes_.clear(); - } - + } - private: - BoundingBox box_; - std::unordered_set geospatialTypes_; - bool isValid_ = true; + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; - void MergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void MergeGeometryInternal(WKBBuffer* src, bool recordWkbType); - void MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); -}; + void MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); + }; } // namespace orc::geospatial \ No newline at end of file diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index 72274647ae..149879b1f8 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -21,9 +21,9 @@ #include "orc/Type.hh" #include "orc/Writer.hh" +#include #include "ByteRLE.hh" #include "ColumnWriter.hh" -#include #include "RLE.hh" #include "Statistics.hh" #include "Timezone.hh" @@ -2876,12 +2876,13 @@ namespace orc { class GeospatialColumnWriter : public BinaryColumnWriter { public: - GeospatialColumnWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) + GeospatialColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) : BinaryColumnWriter(type, factory, options), isGeometry_(type.getKind() == TypeKind::GEOMETRY) {} - virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, - uint64_t numValues, const char* incomingMask) override { + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, + const char* incomingMask) override { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); const StringVectorBatch& strBatch = dynamic_cast(rowBatch); diff 
--git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index 0d116e54a3..b3c496019b 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1717,7 +1717,6 @@ namespace orc { } void merge(const MutableColumnStatistics& other) override { - const GeospatialColumnStatisticsImpl& geoStats = dynamic_cast(other); stats_.merge(geoStats.stats_); @@ -1745,65 +1744,64 @@ namespace orc { geoStats->mutable_bbox()->set_ymin(bbox.min[1]); geoStats->mutable_bbox()->set_ymax(bbox.max[1]); if (bbox.BoundValid(2) && !bbox.BoundEmpty(2)) { - geoStats->mutable_bbox()->set_zmin(bbox.min[2]); - geoStats->mutable_bbox()->set_zmax(bbox.max[2]); + geoStats->mutable_bbox()->set_zmin(bbox.min[2]); + geoStats->mutable_bbox()->set_zmax(bbox.max[2]); } if (bbox.BoundValid(3) && !bbox.BoundEmpty(3)) { - geoStats->mutable_bbox()->set_mmin(bbox.min[3]); - geoStats->mutable_bbox()->set_mmax(bbox.max[3]); + geoStats->mutable_bbox()->set_mmin(bbox.min[3]); + geoStats->mutable_bbox()->set_mmax(bbox.max[3]); } + } + for (auto type : bounder_.GeometryTypes()) { + geoStats->add_geospatial_types(type); + } } - for (auto type : bounder_.GeometryTypes()) { - geoStats->add_geospatial_types(type); - } - } - std::string toString() const override { - if (!bounder_.IsValid()) { - return " invalid"; - } + std::string toString() const override { + if (!bounder_.IsValid()) { + return " invalid"; + } - std::stringstream ss; - ss << ""; + std::stringstream ss; + ss << ""; - std::string dim_label("xyzm"); - const auto& bbox = bounder_.Bounds(); - auto dim_valid = bbox.DimensionValid(); - auto dim_empty = bbox.DimensionEmpty(); - auto lower = bbox.LowerBound(); - auto upper = bbox.UpperBound(); + std::string dim_label("xyzm"); + const auto& bbox = bounder_.Bounds(); + auto dim_valid = bbox.DimensionValid(); + auto dim_empty = bbox.DimensionEmpty(); + auto lower = bbox.LowerBound(); + auto upper = bbox.UpperBound(); + + for (int i = 0; i < 4; i++) { + ss << " " << dim_label[i] << ": "; + if (!dim_valid[i]) { + 
ss << "invalid"; + } else if (dim_empty[i]) { + ss << "empty"; + } else { + ss << "[" << lower[i] << ", " << upper[i] << "]"; + } + } - for (int i = 0; i < 4; i++) { - ss << " " << dim_label[i] << ": "; - if (!dim_valid[i]) { - ss << "invalid"; - } else if (dim_empty[i]) { - ss << "empty"; - } else { - ss << "[" << lower[i] << ", " << upper[i] << "]"; + std::vector maybe_geometry_types = bounder_.GeometryTypes(); + ss << " geometry_types: ["; + std::string sep(""); + for (int32_t geometry_type : maybe_geometry_types) { + ss << sep << geometry_type; + sep = ", "; } - } + ss << "]"; - std::vector maybe_geometry_types = bounder_.GeometryTypes(); - ss << " geometry_types: ["; - std::string sep(""); - for (int32_t geometry_type : maybe_geometry_types) { - ss << sep << geometry_type; - sep = ", "; + return ss.str(); } - ss << "]"; - return ss.str(); - } - - const geospatial::BoundingBox& getBoundingBox() const override { - return bounder_.Bounds(); - } - - std::vector getGeospatialTypes() const override { - return bounder_.GeometryTypes(); - } + const geospatial::BoundingBox& getBoundingBox() const override { + return bounder_.Bounds(); + } + std::vector getGeospatialTypes() const override { + return bounder_.GeometryTypes(); + } }; ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index b78d4665b0..97ae371896 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -64,34 +64,31 @@ namespace orc { subtypeCount_ = 0; } - TypeImpl::TypeImpl(TypeKind kind, - const std::string& crs) { - parent_ = nullptr; - columnId_ = -1; - maximumColumnId_ = -1; - kind_ = kind; - maxLength_ = 0; - precision_ = 0; - scale_ = 0; - subtypeCount_ = 0; - crs_ = crs; - edgeInterpolationAlgorithm_ = geospatial::EIAlgo::SPHERICAL; -} - -TypeImpl::TypeImpl(TypeKind kind, - const std::string& crs, - geospatial::EIAlgo algo) { - parent_ = nullptr; - columnId_ = -1; - maximumColumnId_ = -1; - kind_ = kind; - 
maxLength_ = 0; - precision_ = 0; - scale_ = 0; - subtypeCount_ = 0; - crs_ = crs; - edgeInterpolationAlgorithm_ = algo; -} + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = geospatial::EIAlgo::SPHERICAL; + } + + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs, geospatial::EIAlgo algo) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = algo; + } uint64_t TypeImpl::assignIds(uint64_t root) const { columnId_ = static_cast(root); @@ -229,43 +226,43 @@ TypeImpl::TypeImpl(TypeKind kind, } namespace geospatial { -std::string AlgotoString(EIAlgo algo) { - switch (algo) { - case EIAlgo::SPHERICAL: - return "speherial"; - case VINCENTY: - return "vincenty"; - case THOMAS: - return "thomas"; - case ANDOYER: - return "andoyer"; - case KARNEY: - return "karney"; - default: - throw InvalidArgument("Unknown algo"); - } -} + std::string AlgotoString(EIAlgo algo) { + switch (algo) { + case EIAlgo::SPHERICAL: + return "speherial"; + case VINCENTY: + return "vincenty"; + case THOMAS: + return "thomas"; + case ANDOYER: + return "andoyer"; + case KARNEY: + return "karney"; + default: + throw InvalidArgument("Unknown algo"); + } + } -EIAlgo AlgoFromString(const std::string &algo) { - if (algo == "speherial") { - return EIAlgo::SPHERICAL; - } - if (algo == "vincenty") { - return VINCENTY; - } - if (algo == "thomas") { - return THOMAS; - } - if (algo == "andoyer") { - return ANDOYER; - } - if (algo == "karney") { - return KARNEY; - } - throw InvalidArgument("Unknown algo: " + algo); -} + EIAlgo AlgoFromString(const std::string& algo) { + if (algo == "speherial") { + return EIAlgo::SPHERICAL; + } + if (algo == "vincenty") { + 
return VINCENTY; + } + if (algo == "thomas") { + return THOMAS; + } + if (algo == "andoyer") { + return ANDOYER; + } + if (algo == "karney") { + return KARNEY; + } + throw InvalidArgument("Unknown algo: " + algo); + } -} // namespace geospatial + } // namespace geospatial std::string TypeImpl::toString() const { switch (static_cast(kind_)) { @@ -356,8 +353,8 @@ EIAlgo AlgoFromString(const std::string &algo) { } case GEOGRAPHY: { std::stringstream result; - result << "geography(" << crs_ << "," << geospatial::AlgotoString(edgeInterpolationAlgorithm_) - << ")"; + result << "geography(" << crs_ << "," + << geospatial::AlgotoString(edgeInterpolationAlgorithm_) << ")"; return result.str(); } default: @@ -542,14 +539,14 @@ EIAlgo AlgoFromString(const std::string &algo) { ret = std::make_unique(static_cast(type.kind()), type.maximum_length()); break; - case proto::Type_Kind_GEOMETRY: - ret = std::make_unique(static_cast(type.kind()), type.crs()); + case proto::Type_Kind_GEOMETRY: + ret = std::make_unique(static_cast(type.kind()), type.crs()); break; - case proto::Type_Kind_GEOGRAPHY: - ret = std::make_unique(static_cast(type.kind()), - type.crs(), - static_cast(type.algorithm())); + case proto::Type_Kind_GEOGRAPHY: + ret = std::make_unique( + static_cast(type.kind()), type.crs(), + static_cast(type.algorithm())); break; case proto::Type_Kind_DECIMAL: @@ -827,7 +824,7 @@ EIAlgo AlgoFromString(const std::string &algo) { } std::unique_ptr TypeImpl::parseGeographyType(const std::string& input, size_t start, - size_t end) { + size_t end) { if (input[start] != '(') { throw std::logic_error("Missing ( after geometry."); } diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc index 4a942c0557..a9ab6a0593 100644 --- a/c++/test/TestColumnStatistics.cc +++ b/c++/test/TestColumnStatistics.cc @@ -17,10 +17,10 @@ */ #include "Statistics.hh" +#include "TestUtil.hh" #include "orc/OrcFile.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" -#include 
"TestUtil.hh" #include @@ -713,10 +713,12 @@ namespace orc { } TEST(ColumnStatistics, TestGeospatialMerge) { - std::unique_ptr invalidStats(new GeospatialColumnStatisticsImpl()); + std::unique_ptr invalidStats( + new GeospatialColumnStatisticsImpl()); invalidStats->update("0", 0); - std::unique_ptr emptyStats(new GeospatialColumnStatisticsImpl()); + std::unique_ptr emptyStats( + new GeospatialColumnStatisticsImpl()); std::unique_ptr xyStats(new GeospatialColumnStatisticsImpl()); std::string xy = MakeWKBPoint({10, 11}, false, false); diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc index 4d90957094..6e8e3b5122 100644 --- a/c++/test/TestStatistics.cc +++ b/c++/test/TestStatistics.cc @@ -34,13 +34,10 @@ namespace orc { #define ENSURE_DYNAMIC_CAST_NOT_NULL(PTR) \ if (PTR == NULL) throw std::logic_error("dynamic_cast returns null"); - const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M - static std::unique_ptr createWriter( - uint64_t stripeSize, - const Type &type, - MemoryPool* memoryPool, - OutputStream *stream) { + static std::unique_ptr createWriter(uint64_t stripeSize, const Type& type, + MemoryPool* memoryPool, OutputStream* stream) { WriterOptions options; options.setStripeSize(stripeSize); options.setCompressionBlockSize(256); @@ -51,12 +48,10 @@ namespace orc { return createWriter(type, stream, options); } - static std::unique_ptr createReader( - MemoryPool* memoryPool, - MemoryOutputStream& memStream) { - + static std::unique_ptr createReader(MemoryPool* memoryPool, + MemoryOutputStream& memStream) { std::unique_ptr inStream( - new MemoryInputStream (memStream.getData(), memStream.getLength())); + new MemoryInputStream(memStream.getData(), memStream.getLength())); ReaderOptions options; options.setMemoryPool(*memoryPool); return createReader(std::move(inStream), options); diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc index 80039405c8..a457f68535 100644 --- 
a/c++/test/TestUtil.cc +++ b/c++/test/TestUtil.cc @@ -21,25 +21,25 @@ #include namespace orc { -uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { auto wkbGeomType = static_cast(geometryType); if (hasZ) { - wkbGeomType += 1000; + wkbGeomType += 1000; } if (hasM) { - wkbGeomType += 2000; + wkbGeomType += 2000; } return wkbGeomType; -} + } -std::string MakeWKBPoint(const std::vector &xyzm, bool hasZ, bool hasM) { + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM) { // 1:endianness + 4:type + 8:x + 8:y int numBytes = kWkbPointXYSize + (hasZ ? sizeof(double) : 0) + (hasM ? sizeof(double) : 0); std::string wkb(numBytes, 0); - char *ptr = wkb.data(); + char* ptr = wkb.data(); ptr[0] = kWkbNativeEndianness; uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::kPoint, hasZ, hasM); @@ -49,17 +49,17 @@ std::string MakeWKBPoint(const std::vector &xyzm, bool hasZ, bool hasM) ptr += 21; if (hasZ) { - std::memcpy(ptr, &xyzm[2], 8); - ptr += 8; + std::memcpy(ptr, &xyzm[2], 8); + ptr += 8; } if (hasM) { - std::memcpy(ptr, &xyzm[3], 8); - ptr += 8; + std::memcpy(ptr, &xyzm[3], 8); + ptr += 8; } assert(static_cast(ptr - wkb.data()) == wkb.length()); return wkb; -} + } -} \ No newline at end of file +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh index 1731048ebe..2132aaea83 100644 --- a/c++/test/TestUtil.hh +++ b/c++/test/TestUtil.hh @@ -21,21 +21,21 @@ namespace orc { -/// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, -/// uint32_t geometry type, 2 * double coordinates) -static constexpr int kWkbPointXYSize = 21; + /// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, + /// uint32_t geometry type, 2 * double coordinates) + static constexpr int kWkbPointXYSize = 21; -static bool isLittleEndian() { + 
static bool isLittleEndian() { static union { - uint32_t i; - char c[4]; + uint32_t i; + char c[4]; } num = {0x01020304}; return num.c[0] == 4; -} + } -static uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00; + static uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00; -uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM); -std::string MakeWKBPoint(const std::vector &xyzm, bool hasZ, bool hasM); + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM); + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM); } // namespace orc \ No newline at end of file diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 06b3626331..11ba0c9dea 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -2405,25 +2405,23 @@ namespace orc { TEST_P(WriterTest, writeGeometryAndGeographyColumn) { MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); MemoryPool* pool = getDefaultPool(); - std::unique_ptr type(Type::buildTypeFromString("struct")); + std::unique_ptr type(Type::buildTypeFromString( + "struct")); uint64_t stripeSize = 1024; // 1K uint64_t compressionBlockSize = 1024; // 1k uint64_t memoryBlockSize = 64; std::unique_ptr writer = - createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); - EXPECT_EQ("struct", type->toString()); + EXPECT_EQ("struct", + type->toString()); uint64_t batchCount = 100, batchSize = 1000; - std::unique_ptr batch = - writer->createRowBatch(batchSize); - StructVectorBatch* structBatch = - dynamic_cast(batch.get()); - StringVectorBatch* geometryBatch = - dynamic_cast(structBatch->fields[0]); - StringVectorBatch* geographyBatch = - dynamic_cast(structBatch->fields[1]); + std::unique_ptr batch = writer->createRowBatch(batchSize); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + StringVectorBatch* geometryBatch = dynamic_cast(structBatch->fields[0]); + StringVectorBatch* geographyBatch = dynamic_cast(structBatch->fields[1]); std::unique_ptr buffer(new char[8000000]); char* buf = buffer.get(); @@ -2466,12 +2464,13 @@ namespace orc { writer->close(); std::unique_ptr inStream( - new MemoryInputStream (memStream.getData(), memStream.getLength())); + new MemoryInputStream(memStream.getData(), memStream.getLength())); std::unique_ptr reader = createReader(pool, std::move(inStream)); EXPECT_EQ(batchCount * batchSize, reader->getNumberOfRows()); EXPECT_TRUE(reader->getNumberOfStripes() > 1); - EXPECT_EQ("struct", reader->getType().toString()); + EXPECT_EQ("struct", + reader->getType().toString()); // test sequential reader std::unique_ptr seqReader = createRowReader(reader.get()); rowCount = 0; @@ -2536,7 +2535,6 @@ namespace orc { } } - std::vector testParams = {{FileVersion::v_0_11(), true}, {FileVersion::v_0_11(), false}, {FileVersion::v_0_12(), false}, From 176ea6b33a03c1847a904bde552ad5a2e94216b5 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 19:03:07 +0800 Subject: [PATCH 06/16] fix linter for Geospatial.cc --- c++/src/Geospatial.cc | 449 +++++++++++++++++++++--------------------- 1 file changed, 229 insertions(+), 220 deletions(-) diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc index a271c66600..930385a38b 100644 --- a/c++/src/Geospatial.cc +++ b/c++/src/Geospatial.cc @@ -19,271 
+19,280 @@ #include "orc/Geospatial.hh" #include -#include #include #include +#include #include "orc/Exceptions.hh" namespace orc::geospatial { -template -inline std::enable_if_t, T> SafeLoadAs( - const uint8_t* unaligned) { - std::remove_const_t ret; - std::memcpy(&ret, unaligned, sizeof(T)); - return ret; -} - -template -inline std::enable_if_t && - std::is_trivially_copyable_v && sizeof(T) == sizeof(U), - U> -SafeCopy(T value) { - std::remove_const_t ret; - std::memcpy(&ret, static_cast(&value), sizeof(T)); - return ret; -} - -static bool isLittleEndian() { - static union { - uint32_t i; - char c[4]; - } num = {0x01020304}; - return num.c[0] == 4; -} + template + inline std::enable_if_t, T> SafeLoadAs(const uint8_t* unaligned) { + std::remove_const_t ret; + std::memcpy(&ret, unaligned, sizeof(T)); + return ret; + } + + template + inline std::enable_if_t && std::is_trivially_copyable_v && + sizeof(T) == sizeof(U), + U> + SafeCopy(T value) { + std::remove_const_t ret; + std::memcpy(&ret, static_cast(&value), sizeof(T)); + return ret; + } + + static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; + } #if defined(_MSC_VER) -# include // IWYU pragma: keep -# define ORC_BYTE_SWAP64 _byteswap_uint64 -# define ORC_BYTE_SWAP32 _byteswap_ulong +#include // IWYU pragma: keep +#define ORC_BYTE_SWAP64 _byteswap_uint64 +#define ORC_BYTE_SWAP32 _byteswap_ulong #else -# define ORC_BYTE_SWAP64 __builtin_bswap64 -# define ORC_BYTE_SWAP32 __builtin_bswap32 +#define ORC_BYTE_SWAP64 __builtin_bswap64 +#define ORC_BYTE_SWAP32 __builtin_bswap32 #endif -// Swap the byte order (i.e. 
endianness) -static inline uint32_t ByteSwap(uint32_t value) { - return static_cast(ORC_BYTE_SWAP32(value)); -} -static inline double ByteSwap(double value) { - const uint64_t swapped = ORC_BYTE_SWAP64(SafeCopy(value)); - return SafeCopy(swapped); -} - -std::string BoundingBox::ToString() const { - std::stringstream ss; - ss << "BoundingBox" << std::endl; - ss << " x: [" << min[0] << ", " << max[0] << "]" << std::endl; - ss << " y: [" << min[1] << ", " << max[1] << "]" << std::endl; - ss << " z: [" << min[2] << ", " << max[2] << "]" << std::endl; - ss << " m: [" << min[3] << ", " << max[3] << "]" << std::endl; - - return ss.str(); -} - -/// \brief Object to keep track of the low-level consumption of a well-known binary -/// geometry -/// -/// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte -/// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), -/// followed by geometry-specific data. Coordinate sequences are represented by a -/// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates -/// multiplied by the number of dimensions). -class WKBBuffer { - public: - WKBBuffer() : data_(nullptr), size_(0) {} - WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} - - uint8_t ReadUInt8() { return ReadChecked(); } - - uint32_t ReadUInt32(bool swap) { - auto value = ReadChecked(); - return swap ? ByteSwap(value) : value; + // Swap the byte order (i.e. 
endianness) + static inline uint32_t ByteSwap(uint32_t value) { + return static_cast(ORC_BYTE_SWAP32(value)); + } + static inline double ByteSwap(double value) { + const uint64_t swapped = ORC_BYTE_SWAP64(SafeCopy(value)); + return SafeCopy(swapped); } - template - void ReadCoords(uint32_t nCoords, bool swap, Visit&& visit) { - size_t total_bytes = nCoords * sizeof(Coord); - if (size_ < total_bytes) { + std::string BoundingBox::ToString() const { + std::stringstream ss; + ss << "BoundingBox" << std::endl; + ss << " x: [" << min[0] << ", " << max[0] << "]" << std::endl; + ss << " y: [" << min[1] << ", " << max[1] << "]" << std::endl; + ss << " z: [" << min[2] << ", " << max[2] << "]" << std::endl; + ss << " m: [" << min[3] << ", " << max[3] << "]" << std::endl; + + return ss.str(); + } + + /// \brief Object to keep track of the low-level consumption of a well-known binary + /// geometry + /// + /// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte + /// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), + /// followed by geometry-specific data. Coordinate sequences are represented by a + /// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates + /// multiplied by the number of dimensions). + class WKBBuffer { + public: + WKBBuffer() : data_(nullptr), size_(0) {} + WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + + uint8_t ReadUInt8() { + return ReadChecked(); } - if (swap) { - Coord coord; - for (uint32_t i = 0; i < nCoords; i++) { - coord = ReadUnchecked(); - for (auto& c : coord) { - c = ByteSwap(c); - } + uint32_t ReadUInt32(bool swap) { + auto value = ReadChecked(); + return swap ? 
ByteSwap(value) : value; + } - std::forward(visit)(coord); + template + void ReadCoords(uint32_t nCoords, bool swap, Visit&& visit) { + size_t total_bytes = nCoords * sizeof(Coord); + if (size_ < total_bytes) { } - } else { - for (uint32_t i = 0; i < nCoords; i++) { - std::forward(visit)(ReadUnchecked()); + + if (swap) { + Coord coord; + for (uint32_t i = 0; i < nCoords; i++) { + coord = ReadUnchecked(); + for (auto& c : coord) { + c = ByteSwap(c); + } + + std::forward(visit)(coord); + } + } else { + for (uint32_t i = 0; i < nCoords; i++) { + std::forward(visit)(ReadUnchecked()); + } } } - } - size_t size() const { return size_; } + size_t size() const { + return size_; + } - private: - const uint8_t* data_; - size_t size_; + private: + const uint8_t* data_; + size_t size_; - template - T ReadChecked() { - if (size_ < sizeof(T)) { - std::stringstream ss; - ss << "Can't read" << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; - throw ParseError(ss.str()); + template + T ReadChecked() { + if (size_ < sizeof(T)) { + std::stringstream ss; + ss << "Can't read" << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; + throw ParseError(ss.str()); + } + + return ReadUnchecked(); } - return ReadUnchecked(); - } + template + T ReadUnchecked() { + T out = SafeLoadAs(data_); + data_ += sizeof(T); + size_ -= sizeof(T); + return out; + } + }; - template - T ReadUnchecked() { - T out = SafeLoadAs(data_); - data_ += sizeof(T); - size_ -= sizeof(T); - return out; - } -}; + using GeometryTypeAndDimensions = std::pair; -using GeometryTypeAndDimensions = std::pair; + namespace { -namespace { + std::optional ParseGeometryType(uint32_t wkbGeometryType) { + // The number 1000 can be used because WKB geometry types are constructed + // on purpose such that this relationship is true (e.g., LINESTRING ZM maps + // to 3002). 
+ uint32_t geometryTypeComponent = wkbGeometryType % 1000; + uint32_t dimensionsComponent = wkbGeometryType / 1000; -std::optional ParseGeometryType(uint32_t wkbGeometryType) { - // The number 1000 can be used because WKB geometry types are constructed - // on purpose such that this relationship is true (e.g., LINESTRING ZM maps - // to 3002). - uint32_t geometryTypeComponent = wkbGeometryType % 1000; - uint32_t dimensionsComponent = wkbGeometryType / 1000; + auto minGeometryTypeValue = static_cast(GeometryType::kValueMin); + auto maxGeometryTypeValue = static_cast(GeometryType::kValueMax); + auto minDimensionValue = static_cast(Dimensions::kValueMin); + auto maxDimensionValue = static_cast(Dimensions::kValueMax); - auto minGeometryTypeValue = static_cast(GeometryType::kValueMin); - auto maxGeometryTypeValue = static_cast(GeometryType::kValueMax); - auto minDimensionValue = static_cast(Dimensions::kValueMin); - auto maxDimensionValue = static_cast(Dimensions::kValueMax); + if (geometryTypeComponent < minGeometryTypeValue || + geometryTypeComponent > maxGeometryTypeValue || dimensionsComponent < minDimensionValue || + dimensionsComponent > maxDimensionValue) { + return std::nullopt; + } - if (geometryTypeComponent < minGeometryTypeValue || - geometryTypeComponent > maxGeometryTypeValue || - dimensionsComponent < minDimensionValue || - dimensionsComponent > maxDimensionValue) { - return std::nullopt; - } + return std::make_optional( + GeometryTypeAndDimensions{static_cast(geometryTypeComponent), + static_cast(dimensionsComponent)}); + } - return std::make_optional(GeometryTypeAndDimensions{static_cast(geometryTypeComponent), - static_cast(dimensionsComponent)}); -} - -} // namespace - -std::vector WKBGeometryBounder::GeometryTypes() const { - std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); - std::sort(out.begin(), out.end()); - return out; -} - -void WKBGeometryBounder::MergeGeometry(std::string_view bytesWkb) { - if (!isValid_) { return; } - 
MergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); -} - -void WKBGeometryBounder::MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { - if (!isValid_) { return; } - WKBBuffer src{bytesWkb, static_cast(bytesSize)}; - try { - MergeGeometryInternal(&src, /*record_wkb_type=*/true); - } catch (const ParseError&) { - Invalidate(); - return; - } - if (src.size() != 0) { - // "Exepcted zero bytes after consuming WKB - Invalidate(); + } // namespace + + std::vector WKBGeometryBounder::GeometryTypes() const { + std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); + std::sort(out.begin(), out.end()); + return out; } -} - -void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { - uint8_t endian = src->ReadUInt8(); - bool swap = endian != 0x00; - if(isLittleEndian()) { swap = endian != 0x01; } - - uint32_t wkbGeometryType = src->ReadUInt32(swap); - auto geometryTypeAndDimensions = ParseGeometryType(wkbGeometryType); - if (!geometryTypeAndDimensions.has_value()) { - Invalidate(); - return; + + void WKBGeometryBounder::MergeGeometry(std::string_view bytesWkb) { + if (!isValid_) { + return; + } + MergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); } - auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); - // Keep track of geometry types encountered if at the top level - if (recordWkbType) { - geospatialTypes_.insert(static_cast(wkbGeometryType)); + void WKBGeometryBounder::MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { + if (!isValid_) { + return; + } + WKBBuffer src{bytesWkb, static_cast(bytesSize)}; + try { + MergeGeometryInternal(&src, /*record_wkb_type=*/true); + } catch (const ParseError&) { + Invalidate(); + return; + } + if (src.size() != 0) { + // "Exepcted zero bytes after consuming WKB + Invalidate(); + } } - switch (geometry_type) { - case GeometryType::kPoint: - MergeSequence(src, dimensions, 1, swap); - break; + void 
WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { + uint8_t endian = src->ReadUInt8(); + bool swap = endian != 0x00; + if (isLittleEndian()) { + swap = endian != 0x01; + } + + uint32_t wkbGeometryType = src->ReadUInt32(swap); + auto geometryTypeAndDimensions = ParseGeometryType(wkbGeometryType); + if (!geometryTypeAndDimensions.has_value()) { + Invalidate(); + return; + } + auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); - case GeometryType::kLinestring: { - uint32_t nCoords = src->ReadUInt32(swap); - MergeSequence(src, dimensions, nCoords, swap); - break; + // Keep track of geometry types encountered if at the top level + if (recordWkbType) { + geospatialTypes_.insert(static_cast(wkbGeometryType)); } - case GeometryType::kPolygon: { - uint32_t n_parts = src->ReadUInt32(swap); - for (uint32_t i = 0; i < n_parts; i++) { + + switch (geometry_type) { + case GeometryType::kPoint: + MergeSequence(src, dimensions, 1, swap); + break; + + case GeometryType::kLinestring: { uint32_t nCoords = src->ReadUInt32(swap); MergeSequence(src, dimensions, nCoords, swap); + break; + } + case GeometryType::kPolygon: { + uint32_t n_parts = src->ReadUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + uint32_t nCoords = src->ReadUInt32(swap); + MergeSequence(src, dimensions, nCoords, swap); + } + break; } - break; - } - // These are all encoded the same in WKB, even though this encoding would - // allow for parts to be of a different geometry type or different dimensions. - // For the purposes of bounding, this does not cause us problems. We pass - // record_wkb_type = false because we do not want the child geometry to be - // added to the geometry_types list (e.g., for a MultiPoint, we only want - // the code for MultiPoint to be added, not the code for Point). 
- case GeometryType::kMultiPoint: - case GeometryType::kMultiLinestring: - case GeometryType::kMultiPolygon: - case GeometryType::kGeometryCollection: { - uint32_t n_parts = src->ReadUInt32(swap); - for (uint32_t i = 0; i < n_parts; i++) { - MergeGeometryInternal(src, /*record_wkb_type*/ false); + // These are all encoded the same in WKB, even though this encoding would + // allow for parts to be of a different geometry type or different dimensions. + // For the purposes of bounding, this does not cause us problems. We pass + // record_wkb_type = false because we do not want the child geometry to be + // added to the geometry_types list (e.g., for a MultiPoint, we only want + // the code for MultiPoint to be added, not the code for Point). + case GeometryType::kMultiPoint: + case GeometryType::kMultiLinestring: + case GeometryType::kMultiPolygon: + case GeometryType::kGeometryCollection: { + uint32_t n_parts = src->ReadUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + MergeGeometryInternal(src, /*record_wkb_type*/ false); + } + break; } - break; } } -} - -void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions, - uint32_t nCoords, bool swap) { - switch (dimensions) { - case Dimensions::kXY: - src->ReadCoords( - nCoords, swap, [&](BoundingBox::XY coord) { box_.UpdateXY(coord); }); - break; - case Dimensions::kXYZ: - src->ReadCoords( - nCoords, swap, [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); }); - break; - case Dimensions::kXYM: - src->ReadCoords( - nCoords, swap, [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); }); - break; - case Dimensions::kXYZM: - src->ReadCoords( - nCoords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); }); - break; - default: - Invalidate(); + + void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, + bool swap) { + switch (dimensions) { + case Dimensions::kXY: + src->ReadCoords(nCoords, swap, + [&](BoundingBox::XY coord) { box_.UpdateXY(coord); 
}); + break; + case Dimensions::kXYZ: + src->ReadCoords(nCoords, swap, + [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); }); + break; + case Dimensions::kXYM: + src->ReadCoords(nCoords, swap, + [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); }); + break; + case Dimensions::kXYZM: + src->ReadCoords( + nCoords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); }); + break; + default: + Invalidate(); + } } -} -} // namespace orc::geospatial \ No newline at end of file +} // namespace orc::geospatial \ No newline at end of file From 728f07ef2a4bce202018d30e191f36dfc29da0e0 Mon Sep 17 00:00:00 2001 From: ffacs Date: Mon, 16 Jun 2025 19:05:04 +0800 Subject: [PATCH 07/16] fix linter for TypeImpl.hh --- c++/src/TypeImpl.hh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 5d73c1f9fb..b9f380703e 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -194,7 +194,6 @@ namespace orc { static std::unique_ptr parseDecimalType(const std::string& input, size_t start, size_t end); - /** * Parse geography type from string * @param input the input string of a decimal type @@ -202,7 +201,7 @@ namespace orc { * @param end end position of the input string */ static std::unique_ptr parseGeographyType(const std::string& input, size_t start, - size_t end); + size_t end); /** * Parse type for a category * @param category type name From e0421f97685ac269a4c21b574f9668a7c0f82fa9 Mon Sep 17 00:00:00 2001 From: ffacs Date: Wed, 2 Jul 2025 00:18:29 +0800 Subject: [PATCH 08/16] Refactor coding style and address comments --- c++/include/orc/Geospatial.hh | 400 +++++++++++++++---------------- c++/include/orc/Statistics.hh | 4 +- c++/src/Geospatial.cc | 90 +++---- c++/src/Statistics.cc | 8 +- c++/src/Statistics.hh | 34 +-- c++/src/TypeImpl.cc | 6 +- c++/test/TestColumnStatistics.cc | 140 +++++------ c++/test/TestStatistics.cc | 27 +-- c++/test/TestUtil.cc | 2 +- tools/src/CSVFileImport.cc | 4 +- 10 files 
changed, 360 insertions(+), 355 deletions(-) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index 067d45011b..9c601ee943 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -16,7 +16,20 @@ * limitations under the License. */ -#pragma once +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. + */ + +#ifndef ORC_GEOSPATIAL_HH +#define ORC_GEOSPATIAL_HH + #include #include #include @@ -26,231 +39,218 @@ #include #include -namespace orc::geospatial { - - /// \brief Infinity, used to define bounds of empty bounding boxes - constexpr double kInf = std::numeric_limits::infinity(); - - /// \brief The maximum number of dimensions represented by a geospatial type - /// (i.e., X, Y, Z, and M) - inline constexpr int kMaxDimensions = 4; - - /// \brief Valid combinations of dimensions allowed by ISO well-known binary - /// - /// These values correspond to the 0, 1000, 2000, 3000 component of the WKB integer - /// geometry type (i.e., the value of geometry_type // 1000). - enum class Dimensions { kXY = 0, kXYZ = 1, kXYM = 2, kXYZM = 3, kValueMin = 0, kValueMax = 3 }; - - /// \brief The supported set of geometry types allowed by ISO well-known binary - /// - /// These values correspond to the 1, 2, ..., 7 component of the WKB integer - /// geometry type (i.e., the value of geometry_type % 1000). 
- enum class GeometryType { - kPoint = 1, - kLinestring = 2, - kPolygon = 3, - kMultiPoint = 4, - kMultiLinestring = 5, - kMultiPolygon = 6, - kGeometryCollection = 7, - kValueMin = 1, - kValueMax = 7 - }; - - struct BoundingBox { - using XY = std::array; - using XYZ = std::array; - using XYM = std::array; - using XYZM = std::array; - - BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} - BoundingBox() : min{kInf, kInf, kInf, kInf}, max{-kInf, -kInf, -kInf, -kInf} {} - BoundingBox(const BoundingBox& other) = default; - BoundingBox& operator=(const BoundingBox&) = default; - - /// \brief Update the X and Y bounds to ensure these bounds contain coord - void UpdateXY(const XY& coord) { - UpdateInternal(coord); - } - - /// \brief Update the X, Y, and Z bounds to ensure these bounds contain coord - void UpdateXYZ(const XYZ& coord) { - UpdateInternal(coord); - } - - /// \brief Update the X, Y, and M bounds to ensure these bounds contain coord - void UpdateXYM(const XYM& coord) { - std::array dimensions = {0, 1, 3}; - for (int i = 0; i < 3; i++) { - auto dimension = dimensions[i]; - if ((std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0)) { - min[dimension] = std::min(min[dimension], coord[i]); - max[dimension] = std::max(max[dimension], coord[i]); +namespace orc { + namespace geospatial { + + constexpr double INF = std::numeric_limits::infinity(); + // The maximum number of dimensions supported (X, Y, Z, M) + inline constexpr int MAX_DIMENSIONS = 4; + + // Supported combinations of geometry dimensions + enum class Dimensions { + XY = 0, // X and Y only + XYZ = 1, // X, Y, and Z + XYM = 2, // X, Y, and M + XYZM = 3, // X, Y, Z, and M + VALUE_MIN = 0, + VALUE_MAX = 3 + }; + + // Supported geometry types according to ISO WKB + enum class GeometryType { + POINT = 1, + LINESTRING = 2, + POLYGON = 3, + MULTIPOINT = 4, + MULTILINESTRING = 5, + MULTIPOLYGON = 6, + GEOMETRYCOLLECTION = 7, + VALUE_MIN = 1, + VALUE_MAX = 7 + }; + + // 
BoundingBox represents the minimum bounding rectangle (or box) for a geometry. + // It supports up to 4 dimensions (X, Y, Z, M). + struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + // Default constructor: initializes to an empty bounding box. + BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {} + // Constructor with explicit min/max values. + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + // Update the bounding box to include a 2D coordinate. + void updateXY(const XY& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYZ). + void updateXYZ(const XYZ& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYM). + void updateXYM(const XYM& coord) { + std::array dims = {0, 1, 3}; + for (int i = 0; i < 3; ++i) { + auto dim = dims[i]; + if (!std::isnan(min[dim]) && !std::isnan(max[dim])) { + min[dim] = std::min(min[dim], coord[i]); + max[dim] = std::max(max[dim], coord[i]); + } } } - } - - /// \brief Update the X, Y, Z, and M bounds to ensure these bounds contain coord - void UpdateXYZM(const XYZM& coord) { - UpdateInternal(coord); - } - - /// \brief Reset these bounds to an empty state such that they contain no coordinates - void Reset() { - for (int i = 0; i < kMaxDimensions; i++) { - min[i] = kInf; - max[i] = -kInf; + // Update the bounding box to include a 4D coordinate (XYZM). + void updateXYZM(const XYZM& coord) { + updateInternal(coord); } - } - void Invalidate() { - for (int i = 0; i < kMaxDimensions; i++) { - min[i] = std::numeric_limits::quiet_NaN(); - max[i] = std::numeric_limits::quiet_NaN(); + // Reset the bounding box to its initial empty state. 
+ void reset() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + min[i] = INF; + max[i] = -INF; + } } - } - bool BoundEmpty(int dimension) const { - return std::isinf(min[dimension] - max[dimension]) != 0; - } - - bool BoundValid(int dimension) const { - return (std::isnan(min[dimension]) == 0) && (std::isnan(max[dimension]) == 0); - } - - const XYZM& LowerBound() const { - return min; - } - - const XYZM& UpperBound() const { - return max; - } - - std::array DimensionValid() const { - return {BoundValid(0), BoundValid(1), BoundValid(2), BoundValid(3)}; - } - - std::array DimensionEmpty() const { - return {BoundEmpty(0), BoundEmpty(1), BoundEmpty(2), BoundEmpty(3)}; - } - - /// \brief Update these bounds such they also contain other - void Merge(const BoundingBox& other) { - for (int i = 0; i < kMaxDimensions; i++) { - if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || - std::isnan(other.max[i])) { + // Invalidate the bounding box (set all values to NaN). + void invalidate() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { min[i] = std::numeric_limits::quiet_NaN(); max[i] = std::numeric_limits::quiet_NaN(); - } else { - min[i] = std::min(min[i], other.min[i]); - max[i] = std::max(max[i], other.max[i]); } } - } - - std::string ToString() const; - XYZM min; - XYZM max; - - private: - // This works for XY, XYZ, and XYZM - template - void UpdateInternal(Coord coord) { - for (size_t i = 0; i < coord.size(); i++) { - if (!std::isnan(min[i]) && !std::isnan(max[i])) { - min[i] = std::min(min[i], coord[i]); - max[i] = std::max(max[i], coord[i]); - } + // Check if the bound for a given dimension is empty. 
+ bool boundEmpty(int dim) const { + return std::isinf(min[dim] - max[dim]); } - } - }; - - inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { - return lhs.min == rhs.min && lhs.max == rhs.max; - } - inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { - return !(lhs == rhs); - } - - inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { - os << obj.ToString(); - return os; - } + // Check if the bound for a given dimension is valid (not NaN). + bool boundValid(int dim) const { + return !std::isnan(min[dim]) && !std::isnan(max[dim]); + } - class WKBBuffer; + // Get the lower bound (min values). + const XYZM& lowerBound() const { + return min; + } + // Get the upper bound (max values). + const XYZM& upperBound() const { + return max; + } - /// \brief Accumulate a BoundingBox and geometry types based on zero or more well-known - /// binary blobs - /// - /// Note that this class is NOT appropriate for bounding a GEOGRAPHY, - /// whose bounds are not a function purely of the vertices. Geography bounding - /// is not yet implemented. - class WKBGeometryBounder { - public: - /// \brief Accumulate the bounds of a serialized well-known binary geometry - /// - /// Throws ParquetException for any parse errors encountered. Bounds for - /// any encountered coordinates are accumulated and the geometry type of - /// the geometry is added to the internal geometry type list. - void MergeGeometry(std::string_view bytesWkb); + // Get validity for each dimension. + std::array dimensionValid() const { + return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)}; + } + // Get emptiness for each dimension. + std::array dimensionEmpty() const { + return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)}; + } - void MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + // Merge another bounding box into this one. 
+ void merge(const BoundingBox& other) { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || + std::isnan(other.max[i])) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } else { + min[i] = std::min(min[i], other.min[i]); + max[i] = std::max(max[i], other.max[i]); + } + } + } - /// \brief Accumulate the bounds of a previously-calculated BoundingBox - void MergeBox(const BoundingBox& box) { - box_.Merge(box); - } + // Convert the bounding box to a string representation. + std::string toString() const; + + XYZM min; // Minimum values for each dimension + XYZM max; // Maximum values for each dimension + + private: + // Internal update function for XY, XYZ, or XYZM coordinates. + template + void updateInternal(const Coord& coord) { + for (size_t i = 0; i < coord.size(); ++i) { + if (!std::isnan(min[i]) && !std::isnan(max[i])) { + min[i] = std::min(min[i], coord[i]); + max[i] = std::max(max[i], coord[i]); + } + } + } + }; - /// \brief Accumulate a previously-calculated list of geometry types - void MergeGeometryTypes(const std::vector geospatialTypes) { - geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { + return lhs.min == rhs.min && lhs.max == rhs.max; } - - /// \brief Accumulate the bounds of a previously-calculated BoundingBox - void Merge(const WKBGeometryBounder& other) { - if (!IsValid() || !other.IsValid()) { - Invalidate(); - return; - } - box_.Merge(other.box_); - geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { + return !(lhs == rhs); } - - /// \brief Retrieve the accumulated bounds - const BoundingBox& Bounds() const { - return box_; + inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { + os << obj.toString(); + 
return os; } - /// \brief Retrieve the accumulated geometry types - std::vector GeometryTypes() const; + class WKBBuffer; - /// \brief Reset the internal bounds and geometry types list to an empty state - void Reset() { - isValid_ = true; - box_.Reset(); - geospatialTypes_.clear(); - } + class WKBGeometryBounder { + public: + void mergeGeometry(std::string_view bytesWkb); + void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); - bool IsValid() const { - return isValid_; - } + void mergeBox(const BoundingBox& box) { + box_.merge(box); + } + void mergeGeometryTypes(const std::vector& geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } + void merge(const WKBGeometryBounder& other) { + if (!isValid() || !other.isValid()) { + invalidate(); + return; + } + box_.merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + } - void Invalidate() { - isValid_ = false; - box_.Invalidate(); - geospatialTypes_.clear(); - } + // Get the bounding box for the merged geometries. + const BoundingBox& bounds() const { + return box_; + } + + // Get the set of geometry types encountered during merging. + // Returns a sorted vector of geometry type IDs. 
+ std::vector geometryTypes() const; + + void reset() { + isValid_ = true; + box_.reset(); + geospatialTypes_.clear(); + } + bool isValid() const { + return isValid_; + } + void invalidate() { + isValid_ = false; + box_.invalidate(); + geospatialTypes_.clear(); + } - private: - BoundingBox box_; - std::unordered_set geospatialTypes_; - bool isValid_ = true; + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; - void MergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); + }; - void MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); - }; + } // namespace geospatial +} // namespace orc -} // namespace orc::geospatial \ No newline at end of file +#endif // ORC_GEOSPATIAL_HH diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index 11a553b9b3..93b4824dd5 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -19,12 +19,10 @@ #ifndef ORC_STATISTICS_HH #define ORC_STATISTICS_HH +#include "orc/Geospatial.hh" #include "orc/Type.hh" #include "orc/Vector.hh" #include "orc/orc-config.hh" -#include "orc/Geospatial.hh" - -#include namespace orc { diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc index 930385a38b..0b97c9391d 100644 --- a/c++/src/Geospatial.cc +++ b/c++/src/Geospatial.cc @@ -16,6 +16,17 @@ * limitations under the License. */ +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. 
+ */ + #include "orc/Geospatial.hh" #include @@ -70,14 +81,11 @@ namespace orc::geospatial { return SafeCopy(swapped); } - std::string BoundingBox::ToString() const { + std::string BoundingBox::toString() const { std::stringstream ss; - ss << "BoundingBox" << std::endl; - ss << " x: [" << min[0] << ", " << max[0] << "]" << std::endl; - ss << " y: [" << min[1] << ", " << max[1] << "]" << std::endl; - ss << " z: [" << min[2] << ", " << max[2] << "]" << std::endl; - ss << " m: [" << min[3] << ", " << max[3] << "]" << std::endl; - + ss << "BoundingBox{xMin=" << min[0] << ", xMax=" << max[0] << ", yMin=" << min[1] + << ", yMax=" << max[1] << ", zMin=" << min[2] << ", zMax=" << max[2] << ", mMin=" << min[3] + << ", mMax=" << max[3] << "}"; return ss.str(); } @@ -165,10 +173,10 @@ namespace orc::geospatial { uint32_t geometryTypeComponent = wkbGeometryType % 1000; uint32_t dimensionsComponent = wkbGeometryType / 1000; - auto minGeometryTypeValue = static_cast(GeometryType::kValueMin); - auto maxGeometryTypeValue = static_cast(GeometryType::kValueMax); - auto minDimensionValue = static_cast(Dimensions::kValueMin); - auto maxDimensionValue = static_cast(Dimensions::kValueMax); + auto minGeometryTypeValue = static_cast(GeometryType::VALUE_MIN); + auto maxGeometryTypeValue = static_cast(GeometryType::VALUE_MAX); + auto minDimensionValue = static_cast(Dimensions::VALUE_MIN); + auto maxDimensionValue = static_cast(Dimensions::VALUE_MAX); if (geometryTypeComponent < minGeometryTypeValue || geometryTypeComponent > maxGeometryTypeValue || dimensionsComponent < minDimensionValue || @@ -183,37 +191,37 @@ namespace orc::geospatial { } // namespace - std::vector WKBGeometryBounder::GeometryTypes() const { + std::vector WKBGeometryBounder::geometryTypes() const { std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); std::sort(out.begin(), out.end()); return out; } - void WKBGeometryBounder::MergeGeometry(std::string_view bytesWkb) { + void 
WKBGeometryBounder::mergeGeometry(std::string_view bytesWkb) { if (!isValid_) { return; } - MergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); + mergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); } - void WKBGeometryBounder::MergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { + void WKBGeometryBounder::mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { if (!isValid_) { return; } WKBBuffer src{bytesWkb, static_cast(bytesSize)}; try { - MergeGeometryInternal(&src, /*record_wkb_type=*/true); + mergeGeometryInternal(&src, /*record_wkb_type=*/true); } catch (const ParseError&) { - Invalidate(); + invalidate(); return; } if (src.size() != 0) { // "Exepcted zero bytes after consuming WKB - Invalidate(); + invalidate(); } } - void WKBGeometryBounder::MergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { + void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { uint8_t endian = src->ReadUInt8(); bool swap = endian != 0x00; if (isLittleEndian()) { @@ -223,7 +231,7 @@ namespace orc::geospatial { uint32_t wkbGeometryType = src->ReadUInt32(swap); auto geometryTypeAndDimensions = ParseGeometryType(wkbGeometryType); if (!geometryTypeAndDimensions.has_value()) { - Invalidate(); + invalidate(); return; } auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); @@ -234,20 +242,20 @@ namespace orc::geospatial { } switch (geometry_type) { - case GeometryType::kPoint: - MergeSequence(src, dimensions, 1, swap); + case GeometryType::POINT: + mergeSequence(src, dimensions, 1, swap); break; - case GeometryType::kLinestring: { + case GeometryType::LINESTRING: { uint32_t nCoords = src->ReadUInt32(swap); - MergeSequence(src, dimensions, nCoords, swap); + mergeSequence(src, dimensions, nCoords, swap); break; } - case GeometryType::kPolygon: { + case GeometryType::POLYGON: { uint32_t n_parts = src->ReadUInt32(swap); for (uint32_t i = 0; i < n_parts; i++) { uint32_t nCoords = src->ReadUInt32(swap); - 
MergeSequence(src, dimensions, nCoords, swap); + mergeSequence(src, dimensions, nCoords, swap); } break; } @@ -258,40 +266,40 @@ namespace orc::geospatial { // record_wkb_type = false because we do not want the child geometry to be // added to the geometry_types list (e.g., for a MultiPoint, we only want // the code for MultiPoint to be added, not the code for Point). - case GeometryType::kMultiPoint: - case GeometryType::kMultiLinestring: - case GeometryType::kMultiPolygon: - case GeometryType::kGeometryCollection: { + case GeometryType::MULTIPOINT: + case GeometryType::MULTILINESTRING: + case GeometryType::MULTIPOLYGON: + case GeometryType::GEOMETRYCOLLECTION: { uint32_t n_parts = src->ReadUInt32(swap); for (uint32_t i = 0; i < n_parts; i++) { - MergeGeometryInternal(src, /*record_wkb_type*/ false); + mergeGeometryInternal(src, /*record_wkb_type*/ false); } break; } } } - void WKBGeometryBounder::MergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, + void WKBGeometryBounder::mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap) { switch (dimensions) { - case Dimensions::kXY: + case Dimensions::XY: src->ReadCoords(nCoords, swap, - [&](BoundingBox::XY coord) { box_.UpdateXY(coord); }); + [&](BoundingBox::XY coord) { box_.updateXY(coord); }); break; - case Dimensions::kXYZ: + case Dimensions::XYZ: src->ReadCoords(nCoords, swap, - [&](BoundingBox::XYZ coord) { box_.UpdateXYZ(coord); }); + [&](BoundingBox::XYZ coord) { box_.updateXYZ(coord); }); break; - case Dimensions::kXYM: + case Dimensions::XYM: src->ReadCoords(nCoords, swap, - [&](BoundingBox::XYM coord) { box_.UpdateXYM(coord); }); + [&](BoundingBox::XYM coord) { box_.updateXYM(coord); }); break; - case Dimensions::kXYZM: + case Dimensions::XYZM: src->ReadCoords( - nCoords, swap, [&](BoundingBox::XYZM coord) { box_.UpdateXYZM(coord); }); + nCoords, swap, [&](BoundingBox::XYZM coord) { box_.updateXYZM(coord); }); break; default: - Invalidate(); + invalidate(); } } 
diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index 0f5e0cfa2b..a86247f107 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -405,12 +405,12 @@ namespace orc { const proto::ColumnStatistics& pb) { reset(); if (!pb.has_geospatial_statistics()) { - bounder_.Invalidate(); + bounder_.invalidate(); } else { const proto::GeospatialStatistics& stats = pb.geospatial_statistics(); geospatial::BoundingBox::XYZM min; geospatial::BoundingBox::XYZM max; - for (int i = 0; i < geospatial::kMaxDimensions; i++) { + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { min[i] = max[i] = std::numeric_limits::quiet_NaN(); } if (stats.has_bbox()) { @@ -428,10 +428,10 @@ namespace orc { max[3] = protoBBox.mmax(); } } - bounder_.MergeBox(geospatial::BoundingBox(min, max)); + bounder_.mergeBox(geospatial::BoundingBox(min, max)); std::vector types = {stats.geospatial_types().begin(), stats.geospatial_types().end()}; - bounder_.MergeGeometryTypes(types); + bounder_.mergeGeometryTypes(types); } } diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index b3c496019b..ab958b6371 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -1720,16 +1720,16 @@ namespace orc { const GeospatialColumnStatisticsImpl& geoStats = dynamic_cast(other); stats_.merge(geoStats.stats_); - bounder_.Merge(geoStats.bounder_); + bounder_.merge(geoStats.bounder_); } void reset() override { stats_.reset(); - bounder_.Reset(); + bounder_.reset(); } void update(const char* value, size_t length) override { - bounder_.MergeGeometry(std::string_view(value, length)); + bounder_.mergeGeometry(std::string_view(value, length)); } void toProtoBuf(proto::ColumnStatistics& pbStats) const override { @@ -1737,28 +1737,28 @@ namespace orc { pbStats.set_number_of_values(stats_.getNumberOfValues()); proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics(); - const auto& bbox = bounder_.Bounds(); - if (bbox.BoundValid(0) && bbox.BoundValid(1) && !bbox.BoundEmpty(0) 
&& !bbox.BoundEmpty(1)) { + const auto& bbox = bounder_.bounds(); + if (bbox.boundValid(0) && bbox.boundValid(1) && !bbox.boundEmpty(0) && !bbox.boundEmpty(1)) { geoStats->mutable_bbox()->set_xmin(bbox.min[0]); geoStats->mutable_bbox()->set_xmax(bbox.max[0]); geoStats->mutable_bbox()->set_ymin(bbox.min[1]); geoStats->mutable_bbox()->set_ymax(bbox.max[1]); - if (bbox.BoundValid(2) && !bbox.BoundEmpty(2)) { + if (bbox.boundValid(2) && !bbox.boundEmpty(2)) { geoStats->mutable_bbox()->set_zmin(bbox.min[2]); geoStats->mutable_bbox()->set_zmax(bbox.max[2]); } - if (bbox.BoundValid(3) && !bbox.BoundEmpty(3)) { + if (bbox.boundValid(3) && !bbox.boundEmpty(3)) { geoStats->mutable_bbox()->set_mmin(bbox.min[3]); geoStats->mutable_bbox()->set_mmax(bbox.max[3]); } } - for (auto type : bounder_.GeometryTypes()) { + for (auto type : bounder_.geometryTypes()) { geoStats->add_geospatial_types(type); } } std::string toString() const override { - if (!bounder_.IsValid()) { + if (!bounder_.isValid()) { return " invalid"; } @@ -1766,11 +1766,11 @@ namespace orc { ss << ""; std::string dim_label("xyzm"); - const auto& bbox = bounder_.Bounds(); - auto dim_valid = bbox.DimensionValid(); - auto dim_empty = bbox.DimensionEmpty(); - auto lower = bbox.LowerBound(); - auto upper = bbox.UpperBound(); + const auto& bbox = bounder_.bounds(); + auto dim_valid = bbox.dimensionValid(); + auto dim_empty = bbox.dimensionEmpty(); + auto lower = bbox.lowerBound(); + auto upper = bbox.upperBound(); for (int i = 0; i < 4; i++) { ss << " " << dim_label[i] << ": "; @@ -1783,7 +1783,7 @@ namespace orc { } } - std::vector maybe_geometry_types = bounder_.GeometryTypes(); + std::vector maybe_geometry_types = bounder_.geometryTypes(); ss << " geometry_types: ["; std::string sep(""); for (int32_t geometry_type : maybe_geometry_types) { @@ -1796,11 +1796,11 @@ namespace orc { } const geospatial::BoundingBox& getBoundingBox() const override { - return bounder_.Bounds(); + return bounder_.bounds(); } std::vector 
getGeospatialTypes() const override { - return bounder_.GeometryTypes(); + return bounder_.geometryTypes(); } }; diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index 97ae371896..6118d639a8 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -826,12 +826,12 @@ namespace orc { std::unique_ptr TypeImpl::parseGeographyType(const std::string& input, size_t start, size_t end) { if (input[start] != '(') { - throw std::logic_error("Missing ( after geometry."); + throw std::logic_error("Missing ( after geography."); } size_t pos = start + 1; size_t sep = input.find(',', pos); if (sep + 1 >= end || sep == std::string::npos) { - throw std::logic_error("Decimal type must specify CRS."); + throw std::logic_error("Geography type must specify CRS."); } std::string crs = input.substr(pos, sep - pos); std::string algoStr = input.substr(sep + 1, end - sep - 1); @@ -911,7 +911,7 @@ namespace orc { return std::make_unique(CHAR, maxLength); } else if (category == "geometry") { if (input[start] != '(') { - throw std::logic_error("Missing ( after varchar."); + throw std::logic_error("Missing ( after geometry."); } std::string crs = input.substr(start + 1, end - start + 1); return std::make_unique(GEOMETRY, crs); diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc index a9ab6a0593..642a8019de 100644 --- a/c++/test/TestColumnStatistics.cc +++ b/c++/test/TestColumnStatistics.cc @@ -537,9 +537,9 @@ namespace orc { std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); auto bbox = geoStats->getBoundingBox(); - for (int i = 0; i < geospatial::kMaxDimensions; i++) { - EXPECT_TRUE(bbox.BoundEmpty(i)); - EXPECT_TRUE(bbox.BoundValid(i)); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); } EXPECT_EQ(" x: empty y: empty z: empty m: empty geometry_types: []", geoStats->toString()); @@ -549,35 +549,35 @@ namespace orc 
{ std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); const auto& bbox = geoStats->getBoundingBox(); - for (int i = 0; i < geospatial::kMaxDimensions; i++) { - EXPECT_TRUE(bbox.BoundEmpty(i)); - EXPECT_TRUE(bbox.BoundValid(i)); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); } EXPECT_EQ(geoStats->getGeospatialTypes().size(), 0); geospatial::BoundingBox::XYZM expectedMin; geospatial::BoundingBox::XYZM expectedMax; - std::array expectedEmpty; - std::array expectedValid; + std::array expectedEmpty; + std::array expectedValid; std::vector expectedTypes; - for (int i = 0; i < geospatial::kMaxDimensions; i++) { - expectedMin[i] = geospatial::kInf; - expectedMax[i] = -geospatial::kInf; + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + expectedMin[i] = geospatial::INF; + expectedMax[i] = -geospatial::INF; expectedEmpty[i] = true; expectedValid[i] = true; } auto Verify = [&]() { - EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().DimensionEmpty()); - EXPECT_EQ(expectedValid, geoStats->getBoundingBox().DimensionValid()); + EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().dimensionEmpty()); + EXPECT_EQ(expectedValid, geoStats->getBoundingBox().dimensionValid()); EXPECT_EQ(expectedTypes, geoStats->getGeospatialTypes()); - for (int i = 0; i < geospatial::kMaxDimensions; i++) { - if (geoStats->getBoundingBox().BoundValid(i)) { - EXPECT_EQ(expectedMin[i], geoStats->getBoundingBox().LowerBound()[i]); - EXPECT_EQ(expectedMax[i], geoStats->getBoundingBox().UpperBound()[i]); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + if (geoStats->getBoundingBox().boundValid(i)) { + EXPECT_EQ(expectedMin[i], geoStats->getBoundingBox().lowerBound()[i]); + EXPECT_EQ(expectedMax[i], geoStats->getBoundingBox().upperBound()[i]); } else { - EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().LowerBound()[i])); - 
EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().UpperBound()[i])); + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().lowerBound()[i])); + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().upperBound()[i])); } } }; @@ -735,40 +735,40 @@ namespace orc { // invalid merge invalid invalidStats->merge(*invalidStats); std::array expectedValid = {false, false, false, false}; - EXPECT_EQ(invalidStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(invalidStats->getBoundingBox().dimensionValid(), expectedValid); EXPECT_EQ(invalidStats->getGeospatialTypes().size(), 0); // Empty merge empty emptyStats->merge(*emptyStats); expectedValid = {true, true, true, true}; std::array expectedEmpty = {true, true, true, true}; - EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); // Empty merge xy emptyStats->merge(*xyStats); expectedEmpty = {false, false, true, true}; - EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); - EXPECT_EQ(10, emptyStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(10, emptyStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(11, emptyStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(11, emptyStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(10, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(11, emptyStats->getBoundingBox().upperBound()[1]); 
EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 1); EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); // Empty merge xyz emptyStats->merge(*xyzStats); expectedEmpty = {false, false, false, true}; - EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); - EXPECT_EQ(10, emptyStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(12, emptyStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(11, emptyStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(13, emptyStats->getBoundingBox().UpperBound()[1]); - EXPECT_EQ(14, emptyStats->getBoundingBox().LowerBound()[2]); - EXPECT_EQ(14, emptyStats->getBoundingBox().UpperBound()[2]); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(14, emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 2); EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); @@ -776,16 +776,16 @@ namespace orc { // Empty merge xyzm emptyStats->merge(*xyzmStats); expectedEmpty = {false, false, false, false}; - EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(emptyStats->getBoundingBox().DimensionEmpty(), expectedEmpty); - EXPECT_EQ(-10, emptyStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(12, emptyStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(-11, emptyStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(13, emptyStats->getBoundingBox().UpperBound()[1]); - EXPECT_EQ(-12, 
emptyStats->getBoundingBox().LowerBound()[2]); - EXPECT_EQ(14, emptyStats->getBoundingBox().UpperBound()[2]); - EXPECT_EQ(-13, emptyStats->getBoundingBox().LowerBound()[3]); - EXPECT_EQ(-13, emptyStats->getBoundingBox().UpperBound()[3]); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(-10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(-11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(-12, emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().upperBound()[3]); EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 3); EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); @@ -794,7 +794,7 @@ namespace orc { // Empty merge invalid emptyStats->merge(*invalidStats); expectedValid = {false, false, false, false}; - EXPECT_EQ(emptyStats->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); } @@ -806,14 +806,14 @@ namespace orc { new GeospatialColumnStatisticsImpl(pbStats)); std::array expectedValid = {false, false, false, false}; EXPECT_TRUE(emptyStats0->getGeospatialTypes().empty()); - EXPECT_EQ(emptyStats0->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats0->getBoundingBox().dimensionValid(), expectedValid); // Add empty geostats pbStats.mutable_geospatial_statistics(); std::unique_ptr emptyStats1( new GeospatialColumnStatisticsImpl(pbStats)); EXPECT_TRUE(emptyStats1->getGeospatialTypes().empty()); - 
EXPECT_EQ(emptyStats1->getBoundingBox().DimensionValid(), expectedValid); + EXPECT_EQ(emptyStats1->getBoundingBox().dimensionValid(), expectedValid); // Set xy bounds auto* geoProtoStas = pbStats.mutable_geospatial_statistics(); @@ -827,11 +827,11 @@ namespace orc { expectedValid = {true, true, false, false}; EXPECT_EQ(xyStats->getGeospatialTypes().size(), 1); EXPECT_EQ(xyStats->getGeospatialTypes()[0], 2); - EXPECT_EQ(xyStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(0, xyStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(1, xyStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(0, xyStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(1, xyStats->getBoundingBox().UpperBound()[1]); + EXPECT_EQ(xyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[1]); // Set xyz bounds geoProtoStas->mutable_bbox()->set_zmin(0); @@ -843,13 +843,13 @@ namespace orc { EXPECT_EQ(xyzStats->getGeospatialTypes().size(), 2); EXPECT_EQ(xyzStats->getGeospatialTypes()[0], 2); EXPECT_EQ(xyzStats->getGeospatialTypes()[1], 1003); - EXPECT_EQ(xyzStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[1]); - EXPECT_EQ(0, xyzStats->getBoundingBox().LowerBound()[2]); - EXPECT_EQ(1, xyzStats->getBoundingBox().UpperBound()[2]); + EXPECT_EQ(xyzStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, 
xyzStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[2]); // Set xyzm bounds geoProtoStas->mutable_bbox()->set_mmin(0); @@ -862,15 +862,15 @@ namespace orc { EXPECT_EQ(xyzmStats->getGeospatialTypes()[0], 2); EXPECT_EQ(xyzmStats->getGeospatialTypes()[1], 1003); EXPECT_EQ(xyzmStats->getGeospatialTypes()[2], 3003); - EXPECT_EQ(xyzmStats->getBoundingBox().DimensionValid(), expectedValid); - EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[0]); - EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[0]); - EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[1]); - EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[1]); - EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[2]); - EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[2]); - EXPECT_EQ(0, xyzmStats->getBoundingBox().LowerBound()[3]); - EXPECT_EQ(1, xyzmStats->getBoundingBox().UpperBound()[3]); + EXPECT_EQ(xyzmStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[3]); } } // namespace orc diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc index 6e8e3b5122..61c5e08cb6 100644 --- a/c++/test/TestStatistics.cc +++ b/c++/test/TestStatistics.cc @@ -16,7 +16,6 @@ * limitations under the License. 
*/ -#include "Statistics.hh" #include "orc/OrcFile.hh" #include "MemoryInputStream.hh" @@ -76,10 +75,10 @@ namespace orc { // create str values std::vector wkbs; - std::array mins = {geospatial::kInf, geospatial::kInf, geospatial::kInf, - geospatial::kInf}; - std::array maxs = {-geospatial::kInf, -geospatial::kInf, -geospatial::kInf, - -geospatial::kInf}; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; for (uint64_t i = 1; i < batchCount - 1; ++i) { if (i % 3 == 0) { wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); @@ -140,10 +139,10 @@ namespace orc { EXPECT_EQ(geoFileStats->getGeospatialTypes()[2], 3001); std::array expectValid = {true, true, true, true}; std::array expectEmpty = {false, false, false, false}; - EXPECT_EQ(geoFileStats->getBoundingBox().DimensionValid(), expectValid); - EXPECT_EQ(geoFileStats->getBoundingBox().DimensionEmpty(), expectEmpty); - EXPECT_EQ(geoFileStats->getBoundingBox().LowerBound(), mins); - EXPECT_EQ(geoFileStats->getBoundingBox().UpperBound(), maxs); + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionEmpty(), expectEmpty); + EXPECT_EQ(geoFileStats->getBoundingBox().lowerBound(), mins); + EXPECT_EQ(geoFileStats->getBoundingBox().upperBound(), maxs); } TEST(Statistics, geographyStatsWithNull) { @@ -166,10 +165,10 @@ namespace orc { // create str values std::vector wkbs; - std::array mins = {geospatial::kInf, geospatial::kInf, geospatial::kInf, - geospatial::kInf}; - std::array maxs = {-geospatial::kInf, -geospatial::kInf, -geospatial::kInf, - -geospatial::kInf}; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; for (uint64_t i = 1; i < batchCount - 1; ++i) { if (i % 3 == 
0) { wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); @@ -226,6 +225,6 @@ namespace orc { ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 0); std::array expectValid = {false, false, false, false}; - EXPECT_EQ(geoFileStats->getBoundingBox().DimensionValid(), expectValid); + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); } } // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc index a457f68535..a76880340c 100644 --- a/c++/test/TestUtil.cc +++ b/c++/test/TestUtil.cc @@ -42,7 +42,7 @@ namespace orc { char* ptr = wkb.data(); ptr[0] = kWkbNativeEndianness; - uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::kPoint, hasZ, hasM); + uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::POINT, hasZ, hasM); std::memcpy(&ptr[1], &geom_type, 4); std::memcpy(&ptr[5], &xyzm[0], 8); std::memcpy(&ptr[13], &xyzm[1], 8); diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index c99f5b65a4..31a6f52a2d 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -395,8 +395,6 @@ int main(int argc, char* argv[]) { case orc::CHAR: case orc::VARCHAR: case orc::BINARY: - case orc::GEOMETRY: - case orc::GEOGRAPHY: bufferList.emplace_back(*orc::getDefaultPool(), 1 * 1024 * 1024); fillStringValues(data, structBatch->fields[i], numValues, i, bufferList.back()); break; @@ -422,6 +420,8 @@ int main(int argc, char* argv[]) { case orc::LIST: case orc::MAP: case orc::UNION: + case orc::GEOMETRY: + case orc::GEOGRAPHY: throw std::runtime_error(subType->toString() + " is not supported yet."); } } From 4b15527d8d1e9609994833add2d6b747aeff6c6c Mon Sep 17 00:00:00 2001 From: ffacs Date: Wed, 2 Jul 2025 00:34:25 +0800 Subject: [PATCH 09/16] Fix meson build --- c++/include/orc/meson.build | 1 + c++/src/meson.build | 1 + 2 files changed, 2 insertions(+) diff --git a/c++/include/orc/meson.build 
b/c++/include/orc/meson.build index 2e9e181991..e2524051f0 100644 --- a/c++/include/orc/meson.build +++ b/c++/include/orc/meson.build @@ -34,6 +34,7 @@ install_headers( 'ColumnPrinter.hh', 'Common.hh', 'Exceptions.hh', + 'Geospatial.hh', 'Int128.hh', 'MemoryPool.hh', 'OrcFile.hh', diff --git a/c++/src/meson.build b/c++/src/meson.build index 3d77d3242e..0794dec843 100644 --- a/c++/src/meson.build +++ b/c++/src/meson.build @@ -151,6 +151,7 @@ source_files += files( 'ConvertColumnReader.cc', 'CpuInfoUtil.cc', 'Exceptions.cc', + 'Geospatial.cc', 'Int128.cc', 'LzoDecompressor.cc', 'MemoryPool.cc', From 4c7c05d467d577e44a2d68142515ee25b91a8565 Mon Sep 17 00:00:00 2001 From: ffacs Date: Wed, 2 Jul 2025 00:42:47 +0800 Subject: [PATCH 10/16] fix meson build again --- c++/test/meson.build | 2 ++ 1 file changed, 2 insertions(+) diff --git a/c++/test/meson.build b/c++/test/meson.build index ba84bf7fa5..a8d30a6b94 100644 --- a/c++/test/meson.build +++ b/c++/test/meson.build @@ -50,10 +50,12 @@ test_sources = [ 'TestSargsApplier.cc', 'TestSearchArgument.cc', 'TestSchemaEvolution.cc', + 'TestStatistics.cc', 'TestStripeIndexStatistics.cc', 'TestTimestampStatistics.cc', 'TestTimezone.cc', 'TestType.cc', + 'TestUtil.cc', 'TestWriter.cc', 'TestCache.cc', ] From 7b32319b243570034f35fc910459326b2ad6b858 Mon Sep 17 00:00:00 2001 From: ffacs Date: Thu, 3 Jul 2025 13:55:15 +0800 Subject: [PATCH 11/16] fix linter --- c++/include/orc/Geospatial.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index 9c601ee943..17af3bbf03 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -222,7 +222,7 @@ namespace orc { const BoundingBox& bounds() const { return box_; } - + // Get the set of geometry types encountered during merging. // Returns a sorted vector of geometry type IDs. 
std::vector geometryTypes() const; From f4d7c27d02aff812d7d62391a1f8c5ebe949ce4a Mon Sep 17 00:00:00 2001 From: ffacs Date: Thu, 3 Jul 2025 14:07:12 +0800 Subject: [PATCH 12/16] Split Geospatial.hh --- c++/include/orc/Geospatial.hh | 58 ------------------------ c++/src/Geospatial.cc | 5 ++- c++/src/Geospatial.hh | 85 +++++++++++++++++++++++++++++++++++ c++/src/Statistics.hh | 1 + 4 files changed, 89 insertions(+), 60 deletions(-) create mode 100644 c++/src/Geospatial.hh diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index 17af3bbf03..b03a8664bb 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -32,12 +32,8 @@ #include #include -#include #include #include -#include -#include -#include namespace orc { namespace geospatial { @@ -196,60 +192,6 @@ namespace orc { return os; } - class WKBBuffer; - - class WKBGeometryBounder { - public: - void mergeGeometry(std::string_view bytesWkb); - void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); - - void mergeBox(const BoundingBox& box) { - box_.merge(box); - } - void mergeGeometryTypes(const std::vector& geospatialTypes) { - geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); - } - void merge(const WKBGeometryBounder& other) { - if (!isValid() || !other.isValid()) { - invalidate(); - return; - } - box_.merge(other.box_); - geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); - } - - // Get the bounding box for the merged geometries. - const BoundingBox& bounds() const { - return box_; - } - - // Get the set of geometry types encountered during merging. - // Returns a sorted vector of geometry type IDs. 
- std::vector geometryTypes() const; - - void reset() { - isValid_ = true; - box_.reset(); - geospatialTypes_.clear(); - } - bool isValid() const { - return isValid_; - } - void invalidate() { - isValid_ = false; - box_.invalidate(); - geospatialTypes_.clear(); - } - - private: - BoundingBox box_; - std::unordered_set geospatialTypes_; - bool isValid_ = true; - - void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType); - void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); - }; - } // namespace geospatial } // namespace orc diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc index 0b97c9391d..a730384346 100644 --- a/c++/src/Geospatial.cc +++ b/c++/src/Geospatial.cc @@ -28,14 +28,15 @@ */ #include "orc/Geospatial.hh" +#include "orc/Exceptions.hh" + +#include "Geospatial.hh" #include #include #include #include -#include "orc/Exceptions.hh" - namespace orc::geospatial { template diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh new file mode 100644 index 0000000000..d12aee004c --- /dev/null +++ b/c++/src/Geospatial.hh @@ -0,0 +1,85 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_GEOSPATIAL_IMPL_HH +#define ORC_GEOSPATIAL_IMPL_HH + +#include "orc/Geospatial.hh" + +#include + +namespace orc { + namespace geospatial { + class WKBBuffer; + + class WKBGeometryBounder { + public: + void mergeGeometry(std::string_view bytesWkb); + void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + + void mergeBox(const BoundingBox& box) { + box_.merge(box); + } + void mergeGeometryTypes(const std::vector& geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } + void merge(const WKBGeometryBounder& other) { + if (!isValid() || !other.isValid()) { + invalidate(); + return; + } + box_.merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + } + + // Get the bounding box for the merged geometries. + const BoundingBox& bounds() const { + return box_; + } + + // Get the set of geometry types encountered during merging. + // Returns a sorted vector of geometry type IDs. 
+ std::vector geometryTypes() const; + + void reset() { + isValid_ = true; + box_.reset(); + geospatialTypes_.clear(); + } + bool isValid() const { + return isValid_; + } + void invalidate() { + isValid_ = false; + box_.invalidate(); + geospatialTypes_.clear(); + } + + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; + + void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); + }; + } // namespace geospatial +} // namespace orc + +#endif \ No newline at end of file diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index ab958b6371..94b1e5d2b2 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -24,6 +24,7 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "Geospatial.hh" #include "Timezone.hh" #include "TypeImpl.hh" From e8eb551b87d736ab1ab05a1e5bc07b3ff547c070 Mon Sep 17 00:00:00 2001 From: ffacs Date: Thu, 3 Jul 2025 14:16:38 +0800 Subject: [PATCH 13/16] Fix build --- c++/src/Geospatial.hh | 1 + c++/test/TestUtil.hh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh index d12aee004c..f8ad5d78f3 100644 --- a/c++/src/Geospatial.hh +++ b/c++/src/Geospatial.hh @@ -23,6 +23,7 @@ #include "orc/Geospatial.hh" #include +#include namespace orc { namespace geospatial { diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh index 2132aaea83..0a502f1865 100644 --- a/c++/test/TestUtil.hh +++ b/c++/test/TestUtil.hh @@ -19,6 +19,8 @@ #pragma once #include "orc/Geospatial.hh" +#include + namespace orc { /// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, From 630020be3bb2aadad77aa77c7bfe7ffc90f36884 Mon Sep 17 00:00:00 2001 From: ffacs Date: Thu, 3 Jul 2025 14:27:34 +0800 Subject: [PATCH 14/16] include cstdint --- c++/test/TestUtil.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/c++/test/TestUtil.hh 
b/c++/test/TestUtil.hh index 0a502f1865..104fbc0397 100644 --- a/c++/test/TestUtil.hh +++ b/c++/test/TestUtil.hh @@ -19,6 +19,7 @@ #pragma once #include "orc/Geospatial.hh" +#include #include namespace orc { From e14391d6913bf6b2d8d71546543fb680c23ee114 Mon Sep 17 00:00:00 2001 From: ffacs Date: Sun, 6 Jul 2025 16:48:32 +0800 Subject: [PATCH 15/16] Address comments --- c++/include/orc/Geospatial.hh | 282 +++++++++++++++++----------------- c++/include/orc/Statistics.hh | 8 +- c++/include/orc/Type.hh | 14 +- c++/src/ColumnWriter.cc | 11 +- c++/src/Geospatial.cc | 68 ++++---- c++/src/Geospatial.hh | 2 +- c++/src/TypeImpl.cc | 20 +-- c++/src/TypeImpl.hh | 4 +- c++/src/Writer.cc | 12 +- 9 files changed, 213 insertions(+), 208 deletions(-) diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh index b03a8664bb..d3b9e28285 100644 --- a/c++/include/orc/Geospatial.hh +++ b/c++/include/orc/Geospatial.hh @@ -35,164 +35,162 @@ #include #include -namespace orc { - namespace geospatial { - - constexpr double INF = std::numeric_limits::infinity(); - // The maximum number of dimensions supported (X, Y, Z, M) - inline constexpr int MAX_DIMENSIONS = 4; - - // Supported combinations of geometry dimensions - enum class Dimensions { - XY = 0, // X and Y only - XYZ = 1, // X, Y, and Z - XYM = 2, // X, Y, and M - XYZM = 3, // X, Y, Z, and M - VALUE_MIN = 0, - VALUE_MAX = 3 - }; - - // Supported geometry types according to ISO WKB - enum class GeometryType { - POINT = 1, - LINESTRING = 2, - POLYGON = 3, - MULTIPOINT = 4, - MULTILINESTRING = 5, - MULTIPOLYGON = 6, - GEOMETRYCOLLECTION = 7, - VALUE_MIN = 1, - VALUE_MAX = 7 - }; - - // BoundingBox represents the minimum bounding rectangle (or box) for a geometry. - // It supports up to 4 dimensions (X, Y, Z, M). - struct BoundingBox { - using XY = std::array; - using XYZ = std::array; - using XYM = std::array; - using XYZM = std::array; - - // Default constructor: initializes to an empty bounding box. 
- BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {} - // Constructor with explicit min/max values. - BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} - BoundingBox(const BoundingBox& other) = default; - BoundingBox& operator=(const BoundingBox&) = default; - - // Update the bounding box to include a 2D coordinate. - void updateXY(const XY& coord) { - updateInternal(coord); - } - // Update the bounding box to include a 3D coordinate (XYZ). - void updateXYZ(const XYZ& coord) { - updateInternal(coord); - } - // Update the bounding box to include a 3D coordinate (XYM). - void updateXYM(const XYM& coord) { - std::array dims = {0, 1, 3}; - for (int i = 0; i < 3; ++i) { - auto dim = dims[i]; - if (!std::isnan(min[dim]) && !std::isnan(max[dim])) { - min[dim] = std::min(min[dim], coord[i]); - max[dim] = std::max(max[dim], coord[i]); - } +namespace orc::geospatial { + + constexpr double INF = std::numeric_limits::infinity(); + // The maximum number of dimensions supported (X, Y, Z, M) + inline constexpr int MAX_DIMENSIONS = 4; + + // Supported combinations of geometry dimensions + enum class Dimensions { + XY = 0, // X and Y only + XYZ = 1, // X, Y, and Z + XYM = 2, // X, Y, and M + XYZM = 3, // X, Y, Z, and M + VALUE_MIN = 0, + VALUE_MAX = 3 + }; + + // Supported geometry types according to ISO WKB + enum class GeometryType { + POINT = 1, + LINESTRING = 2, + POLYGON = 3, + MULTIPOINT = 4, + MULTILINESTRING = 5, + MULTIPOLYGON = 6, + GEOMETRYCOLLECTION = 7, + VALUE_MIN = 1, + VALUE_MAX = 7 + }; + + // BoundingBox represents the minimum bounding rectangle (or box) for a geometry. + // It supports up to 4 dimensions (X, Y, Z, M). + struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + // Default constructor: initializes to an empty bounding box. 
+ BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {} + // Constructor with explicit min/max values. + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + // Update the bounding box to include a 2D coordinate. + void updateXY(const XY& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYZ). + void updateXYZ(const XYZ& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYM). + void updateXYM(const XYM& coord) { + std::array dims = {0, 1, 3}; + for (int i = 0; i < 3; ++i) { + auto dim = dims[i]; + if (!std::isnan(min[dim]) && !std::isnan(max[dim])) { + min[dim] = std::min(min[dim], coord[i]); + max[dim] = std::max(max[dim], coord[i]); } } - // Update the bounding box to include a 4D coordinate (XYZM). - void updateXYZM(const XYZM& coord) { - updateInternal(coord); - } + } + // Update the bounding box to include a 4D coordinate (XYZM). + void updateXYZM(const XYZM& coord) { + updateInternal(coord); + } - // Reset the bounding box to its initial empty state. - void reset() { - for (int i = 0; i < MAX_DIMENSIONS; ++i) { - min[i] = INF; - max[i] = -INF; - } + // Reset the bounding box to its initial empty state. + void reset() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + min[i] = INF; + max[i] = -INF; } + } - // Invalidate the bounding box (set all values to NaN). - void invalidate() { - for (int i = 0; i < MAX_DIMENSIONS; ++i) { - min[i] = std::numeric_limits::quiet_NaN(); - max[i] = std::numeric_limits::quiet_NaN(); - } + // Invalidate the bounding box (set all values to NaN). + void invalidate() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); } + } - // Check if the bound for a given dimension is empty. 
- bool boundEmpty(int dim) const { - return std::isinf(min[dim] - max[dim]); - } + // Check if the bound for a given dimension is empty. + bool boundEmpty(int dim) const { + return std::isinf(min[dim] - max[dim]); + } - // Check if the bound for a given dimension is valid (not NaN). - bool boundValid(int dim) const { - return !std::isnan(min[dim]) && !std::isnan(max[dim]); - } + // Check if the bound for a given dimension is valid (not NaN). + bool boundValid(int dim) const { + return !std::isnan(min[dim]) && !std::isnan(max[dim]); + } - // Get the lower bound (min values). - const XYZM& lowerBound() const { - return min; - } - // Get the upper bound (max values). - const XYZM& upperBound() const { - return max; - } + // Get the lower bound (min values). + const XYZM& lowerBound() const { + return min; + } + // Get the upper bound (max values). + const XYZM& upperBound() const { + return max; + } - // Get validity for each dimension. - std::array dimensionValid() const { - return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)}; - } - // Get emptiness for each dimension. - std::array dimensionEmpty() const { - return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)}; - } + // Get validity for each dimension. + std::array dimensionValid() const { + return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)}; + } + // Get emptiness for each dimension. + std::array dimensionEmpty() const { + return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)}; + } - // Merge another bounding box into this one. - void merge(const BoundingBox& other) { - for (int i = 0; i < MAX_DIMENSIONS; ++i) { - if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || - std::isnan(other.max[i])) { - min[i] = std::numeric_limits::quiet_NaN(); - max[i] = std::numeric_limits::quiet_NaN(); - } else { - min[i] = std::min(min[i], other.min[i]); - max[i] = std::max(max[i], other.max[i]); - } + // Merge another bounding box into this one. 
+ void merge(const BoundingBox& other) { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || + std::isnan(other.max[i])) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } else { + min[i] = std::min(min[i], other.min[i]); + max[i] = std::max(max[i], other.max[i]); } } + } + + // Convert the bounding box to a string representation. + std::string toString() const; + + XYZM min; // Minimum values for each dimension + XYZM max; // Maximum values for each dimension - // Convert the bounding box to a string representation. - std::string toString() const; - - XYZM min; // Minimum values for each dimension - XYZM max; // Maximum values for each dimension - - private: - // Internal update function for XY, XYZ, or XYZM coordinates. - template - void updateInternal(const Coord& coord) { - for (size_t i = 0; i < coord.size(); ++i) { - if (!std::isnan(min[i]) && !std::isnan(max[i])) { - min[i] = std::min(min[i], coord[i]); - max[i] = std::max(max[i], coord[i]); - } + private: + // Internal update function for XY, XYZ, or XYZM coordinates. 
+ template + void updateInternal(const Coord& coord) { + for (size_t i = 0; i < coord.size(); ++i) { + if (!std::isnan(min[i]) && !std::isnan(max[i])) { + min[i] = std::min(min[i], coord[i]); + max[i] = std::max(max[i], coord[i]); } } - }; - - inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { - return lhs.min == rhs.min && lhs.max == rhs.max; - } - inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { - return !(lhs == rhs); } - inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { - os << obj.toString(); - return os; - } - - } // namespace geospatial -} // namespace orc + }; + + inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { + return lhs.min == rhs.min && lhs.max == rhs.max; + } + inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { + return !(lhs == rhs); + } + inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { + os << obj.toString(); + return os; + } + +} // namespace orc::geospatial #endif // ORC_GEOSPATIAL_HH diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index 93b4824dd5..58169abe59 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -374,19 +374,19 @@ namespace orc { virtual ~GeospatialColumnStatistics(); /** - * get bounding box + * Get bounding box * @return bounding box */ virtual const geospatial::BoundingBox& getBoundingBox() const = 0; /** - * get geospatial types - * @return geospatial types + * Get geospatial types + * @return a sorted vector of geometry type IDs that elements is unique */ virtual std::vector getGeospatialTypes() const = 0; /** - * update stats by a new value + * Update stats by a new value * @param value new value to update * @param length length of the value */ diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 91c714f713..238a968fb0 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -33,9 +33,8 @@ 
namespace orc { ANDOYER = 3, KARNEY = 4 }; - using EIAlgo = EdgeInterpolationAlgorithm; - std::string AlgotoString(EIAlgo algo); - EIAlgo AlgoFromString(const std::string& algo); + std::string AlgotoString(EdgeInterpolationAlgorithm algo); + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo); } // namespace geospatial enum TypeKind { @@ -74,8 +73,10 @@ namespace orc { virtual uint64_t getMaximumLength() const = 0; virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; + // for geospatial types only virtual const std::string& getCRS() const = 0; - virtual geospatial::EIAlgo getEIAlgo() const = 0; + // for geography types only + virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0; virtual Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; virtual Type& removeAttribute(const std::string& key) = 0; @@ -133,8 +134,9 @@ namespace orc { std::unique_ptr createMapType(std::unique_ptr key, std::unique_ptr value); std::unique_ptr createUnionType(); std::unique_ptr createGeometryType(const std::string& crs = "OGC:CRS84"); - std::unique_ptr createGeographyType(const std::string& crs = "OGC:CRS84", - geospatial::EIAlgo algo = geospatial::SPHERICAL); + std::unique_ptr createGeographyType( + const std::string& crs = "OGC:CRS84", + geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL); } // namespace orc #endif diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index 149879b1f8..c99890b88f 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -2885,10 +2885,13 @@ namespace orc { const char* incomingMask) override { ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const StringVectorBatch& strBatch = dynamic_cast(rowBatch); - auto data = &strBatch.data[offset]; - auto length = &strBatch.length[offset]; - const char* notNull = strBatch.hasNulls ? 
strBatch.notNull.data() + offset : nullptr; + const StringVectorBatch* strBatch = dynamic_cast(&rowBatch); + if (strBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + auto data = &strBatch->data[offset]; + auto length = &strBatch->length[offset]; + const char* notNull = strBatch->hasNulls ? strBatch->notNull.data() + offset : nullptr; bool hasNull = false; GeospatialColumnStatisticsImpl* geoStats = nullptr; diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc index a730384346..6d7d268703 100644 --- a/c++/src/Geospatial.cc +++ b/c++/src/Geospatial.cc @@ -40,7 +40,7 @@ namespace orc::geospatial { template - inline std::enable_if_t, T> SafeLoadAs(const uint8_t* unaligned) { + inline std::enable_if_t, T> safeLoadAs(const uint8_t* unaligned) { std::remove_const_t ret; std::memcpy(&ret, unaligned, sizeof(T)); return ret; @@ -50,7 +50,7 @@ namespace orc::geospatial { inline std::enable_if_t && std::is_trivially_copyable_v && sizeof(T) == sizeof(U), U> - SafeCopy(T value) { + safeCopy(T value) { std::remove_const_t ret; std::memcpy(&ret, static_cast(&value), sizeof(T)); return ret; @@ -66,20 +66,20 @@ namespace orc::geospatial { #if defined(_MSC_VER) #include // IWYU pragma: keep -#define ORC_BYTE_SWAP64 _byteswap_uint64 -#define ORC_BYTE_SWAP32 _byteswap_ulong +#define ORC_BYTE_SWAP64 _byteSwap_uint64 +#define ORC_BYTE_SWAP32 _byteSwap_ulong #else #define ORC_BYTE_SWAP64 __builtin_bswap64 #define ORC_BYTE_SWAP32 __builtin_bswap32 #endif // Swap the byte order (i.e. 
endianness) - static inline uint32_t ByteSwap(uint32_t value) { + static inline uint32_t byteSwap(uint32_t value) { return static_cast(ORC_BYTE_SWAP32(value)); } - static inline double ByteSwap(double value) { - const uint64_t swapped = ORC_BYTE_SWAP64(SafeCopy(value)); - return SafeCopy(swapped); + static inline double byteSwap(double value) { + const uint64_t swapped = ORC_BYTE_SWAP64(safeCopy(value)); + return safeCopy(swapped); } std::string BoundingBox::toString() const { @@ -103,17 +103,17 @@ namespace orc::geospatial { WKBBuffer() : data_(nullptr), size_(0) {} WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} - uint8_t ReadUInt8() { - return ReadChecked(); + uint8_t readUInt8() { + return readChecked(); } - uint32_t ReadUInt32(bool swap) { - auto value = ReadChecked(); - return swap ? ByteSwap(value) : value; + uint32_t readUInt32(bool swap) { + auto value = readChecked(); + return swap ? byteSwap(value) : value; } template - void ReadCoords(uint32_t nCoords, bool swap, Visit&& visit) { + void readCoords(uint32_t nCoords, bool swap, Visit&& visit) { size_t total_bytes = nCoords * sizeof(Coord); if (size_ < total_bytes) { } @@ -121,16 +121,16 @@ namespace orc::geospatial { if (swap) { Coord coord; for (uint32_t i = 0; i < nCoords; i++) { - coord = ReadUnchecked(); + coord = readUnchecked(); for (auto& c : coord) { - c = ByteSwap(c); + c = byteSwap(c); } std::forward(visit)(coord); } } else { for (uint32_t i = 0; i < nCoords; i++) { - std::forward(visit)(ReadUnchecked()); + std::forward(visit)(readUnchecked()); } } } @@ -144,19 +144,19 @@ namespace orc::geospatial { size_t size_; template - T ReadChecked() { + T readChecked() { if (size_ < sizeof(T)) { std::stringstream ss; ss << "Can't read" << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; throw ParseError(ss.str()); } - return ReadUnchecked(); + return readUnchecked(); } template - T ReadUnchecked() { - T out = SafeLoadAs(data_); + T readUnchecked() { + T out = 
safeLoadAs(data_); data_ += sizeof(T); size_ -= sizeof(T); return out; @@ -167,7 +167,7 @@ namespace orc::geospatial { namespace { - std::optional ParseGeometryType(uint32_t wkbGeometryType) { + std::optional parseGeometryType(uint32_t wkbGeometryType) { // The number 1000 can be used because WKB geometry types are constructed // on purpose such that this relationship is true (e.g., LINESTRING ZM maps // to 3002). @@ -223,14 +223,14 @@ namespace orc::geospatial { } void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { - uint8_t endian = src->ReadUInt8(); + uint8_t endian = src->readUInt8(); bool swap = endian != 0x00; if (isLittleEndian()) { swap = endian != 0x01; } - uint32_t wkbGeometryType = src->ReadUInt32(swap); - auto geometryTypeAndDimensions = ParseGeometryType(wkbGeometryType); + uint32_t wkbGeometryType = src->readUInt32(swap); + auto geometryTypeAndDimensions = parseGeometryType(wkbGeometryType); if (!geometryTypeAndDimensions.has_value()) { invalidate(); return; @@ -248,14 +248,14 @@ namespace orc::geospatial { break; case GeometryType::LINESTRING: { - uint32_t nCoords = src->ReadUInt32(swap); + uint32_t nCoords = src->readUInt32(swap); mergeSequence(src, dimensions, nCoords, swap); break; } case GeometryType::POLYGON: { - uint32_t n_parts = src->ReadUInt32(swap); + uint32_t n_parts = src->readUInt32(swap); for (uint32_t i = 0; i < n_parts; i++) { - uint32_t nCoords = src->ReadUInt32(swap); + uint32_t nCoords = src->readUInt32(swap); mergeSequence(src, dimensions, nCoords, swap); } break; @@ -271,7 +271,7 @@ namespace orc::geospatial { case GeometryType::MULTILINESTRING: case GeometryType::MULTIPOLYGON: case GeometryType::GEOMETRYCOLLECTION: { - uint32_t n_parts = src->ReadUInt32(swap); + uint32_t n_parts = src->readUInt32(swap); for (uint32_t i = 0; i < n_parts; i++) { mergeGeometryInternal(src, /*record_wkb_type*/ false); } @@ -284,19 +284,19 @@ namespace orc::geospatial { bool swap) { switch (dimensions) { case 
Dimensions::XY: - src->ReadCoords(nCoords, swap, + src->readCoords(nCoords, swap, [&](BoundingBox::XY coord) { box_.updateXY(coord); }); break; case Dimensions::XYZ: - src->ReadCoords(nCoords, swap, + src->readCoords(nCoords, swap, [&](BoundingBox::XYZ coord) { box_.updateXYZ(coord); }); break; case Dimensions::XYM: - src->ReadCoords(nCoords, swap, + src->readCoords(nCoords, swap, [&](BoundingBox::XYM coord) { box_.updateXYM(coord); }); break; case Dimensions::XYZM: - src->ReadCoords( + src->readCoords( nCoords, swap, [&](BoundingBox::XYZM coord) { box_.updateXYZM(coord); }); break; default: @@ -304,4 +304,4 @@ namespace orc::geospatial { } } -} // namespace orc::geospatial \ No newline at end of file +} // namespace orc::geospatial diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh index f8ad5d78f3..aebb72747a 100644 --- a/c++/src/Geospatial.hh +++ b/c++/src/Geospatial.hh @@ -83,4 +83,4 @@ namespace orc { } // namespace geospatial } // namespace orc -#endif \ No newline at end of file +#endif diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index 6118d639a8..661c98e074 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -74,10 +74,11 @@ namespace orc { scale_ = 0; subtypeCount_ = 0; crs_ = crs; - edgeInterpolationAlgorithm_ = geospatial::EIAlgo::SPHERICAL; + edgeInterpolationAlgorithm_ = geospatial::EdgeInterpolationAlgorithm::SPHERICAL; } - TypeImpl::TypeImpl(TypeKind kind, const std::string& crs, geospatial::EIAlgo algo) { + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { parent_ = nullptr; columnId_ = -1; maximumColumnId_ = -1; @@ -152,7 +153,7 @@ namespace orc { return crs_; } - geospatial::EIAlgo TypeImpl::getEIAlgo() const { + geospatial::EdgeInterpolationAlgorithm TypeImpl::getAlgorithm() const { return edgeInterpolationAlgorithm_; } @@ -226,9 +227,9 @@ namespace orc { } namespace geospatial { - std::string AlgotoString(EIAlgo algo) { + std::string 
AlgotoString(EdgeInterpolationAlgorithm algo) { switch (algo) { - case EIAlgo::SPHERICAL: + case EdgeInterpolationAlgorithm::SPHERICAL: return "speherial"; case VINCENTY: return "vincenty"; @@ -243,9 +244,9 @@ namespace orc { } } - EIAlgo AlgoFromString(const std::string& algo) { + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo) { if (algo == "speherial") { - return EIAlgo::SPHERICAL; + return EdgeInterpolationAlgorithm::SPHERICAL; } if (algo == "vincenty") { return VINCENTY; @@ -511,7 +512,8 @@ namespace orc { return std::make_unique(GEOMETRY, crs); } - std::unique_ptr createGeographyType(const std::string& crs, geospatial::EIAlgo algo) { + std::unique_ptr createGeographyType(const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { return std::make_unique(GEOGRAPHY, crs, algo); } @@ -634,7 +636,7 @@ namespace orc { break; case GEOGRAPHY: result = std::make_unique(fileType->getKind(), fileType->getCRS(), - fileType->getEIAlgo()); + fileType->getAlgorithm()); break; case LIST: diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index b9f380703e..4d61c00684 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -70,7 +70,7 @@ namespace orc { /** * Create geography type. 
*/ - TypeImpl(TypeKind kind, const std::string& crs, geospatial::EIAlgo algo); + TypeImpl(TypeKind kind, const std::string& crs, geospatial::EdgeInterpolationAlgorithm algo); uint64_t getColumnId() const override; @@ -92,7 +92,7 @@ namespace orc { const std::string& getCRS() const override; - geospatial::EIAlgo getEIAlgo() const override; + geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override; Type& setAttribute(const std::string& key, const std::string& value) override; diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index ce5367adfb..b18aa8b8d6 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -711,24 +711,24 @@ namespace orc { case GEOGRAPHY: { protoType.set_kind(proto::Type_Kind_GEOGRAPHY); protoType.set_crs(t.getCRS()); - switch (t.getEIAlgo()) { - case geospatial::EIAlgo::SPHERICAL: { + switch (t.getAlgorithm()) { + case geospatial::EdgeInterpolationAlgorithm::SPHERICAL: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL); break; } - case orc::geospatial::EIAlgo::VINCENTY: { + case orc::geospatial::EdgeInterpolationAlgorithm::VINCENTY: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); break; } - case orc::geospatial::EIAlgo::THOMAS: { + case orc::geospatial::EdgeInterpolationAlgorithm::THOMAS: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); break; } - case orc::geospatial::EIAlgo::ANDOYER: { + case orc::geospatial::EdgeInterpolationAlgorithm::ANDOYER: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER); break; } - case orc::geospatial::EIAlgo::KARNEY: { + case orc::geospatial::EdgeInterpolationAlgorithm::KARNEY: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY); break; } From 39427e953fc42b43ebb12c031a085e13e74e0298 Mon Sep 17 00:00:00 2001 From: ffacs Date: Sun, 13 Jul 2025 15:20:22 +0800 Subject: [PATCH 16/16] rename public api --- c++/include/orc/Type.hh | 6 +++--- c++/src/TypeImpl.cc | 10 +++++----- 
c++/src/TypeImpl.hh | 2 +- c++/src/Writer.cc | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 238a968fb0..4bb794ff34 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -33,7 +33,7 @@ namespace orc { ANDOYER = 3, KARNEY = 4 }; - std::string AlgotoString(EdgeInterpolationAlgorithm algo); + std::string AlgoToString(EdgeInterpolationAlgorithm algo); EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo); } // namespace geospatial @@ -74,8 +74,8 @@ namespace orc { virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; // for geospatial types only - virtual const std::string& getCRS() const = 0; - // for geography types only + virtual const std::string& getCrs() const = 0; + // for geography type only virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0; virtual Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index 661c98e074..18c4985ab1 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -149,7 +149,7 @@ namespace orc { return scale_; } - const std::string& TypeImpl::getCRS() const { + const std::string& TypeImpl::getCrs() const { return crs_; } @@ -227,7 +227,7 @@ namespace orc { } namespace geospatial { - std::string AlgotoString(EdgeInterpolationAlgorithm algo) { + std::string AlgoToString(EdgeInterpolationAlgorithm algo) { switch (algo) { case EdgeInterpolationAlgorithm::SPHERICAL: return "speherial"; @@ -355,7 +355,7 @@ namespace orc { case GEOGRAPHY: { std::stringstream result; result << "geography(" << crs_ << "," - << geospatial::AlgotoString(edgeInterpolationAlgorithm_) << ")"; + << geospatial::AlgoToString(edgeInterpolationAlgorithm_) << ")"; return result.str(); } default: @@ -632,10 +632,10 @@ namespace orc { result = 
std::make_unique(fileType->getKind(), fileType->getMaximumLength()); break; case GEOMETRY: - result = std::make_unique(fileType->getKind(), fileType->getCRS()); + result = std::make_unique(fileType->getKind(), fileType->getCrs()); break; case GEOGRAPHY: - result = std::make_unique(fileType->getKind(), fileType->getCRS(), + result = std::make_unique(fileType->getKind(), fileType->getCrs(), fileType->getAlgorithm()); break; diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 4d61c00684..2db175aba6 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -90,7 +90,7 @@ namespace orc { uint64_t getScale() const override; - const std::string& getCRS() const override; + const std::string& getCrs() const override; geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override; diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index b18aa8b8d6..c235169cca 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -705,12 +705,12 @@ namespace orc { } case GEOMETRY: { protoType.set_kind(proto::Type_Kind_GEOMETRY); - protoType.set_crs(t.getCRS()); + protoType.set_crs(t.getCrs()); break; } case GEOGRAPHY: { protoType.set_kind(proto::Type_Kind_GEOGRAPHY); - protoType.set_crs(t.getCRS()); + protoType.set_crs(t.getCrs()); switch (t.getAlgorithm()) { case geospatial::EdgeInterpolationAlgorithm::SPHERICAL: { protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL);