diff --git a/c++/include/orc/Geospatial.hh b/c++/include/orc/Geospatial.hh new file mode 100644 index 0000000000..d3b9e28285 --- /dev/null +++ b/c++/include/orc/Geospatial.hh @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. 
+ */ + +#ifndef ORC_GEOSPATIAL_HH +#define ORC_GEOSPATIAL_HH + +#include +#include +#include +#include + +namespace orc::geospatial { + + constexpr double INF = std::numeric_limits::infinity(); + // The maximum number of dimensions supported (X, Y, Z, M) + inline constexpr int MAX_DIMENSIONS = 4; + + // Supported combinations of geometry dimensions + enum class Dimensions { + XY = 0, // X and Y only + XYZ = 1, // X, Y, and Z + XYM = 2, // X, Y, and M + XYZM = 3, // X, Y, Z, and M + VALUE_MIN = 0, + VALUE_MAX = 3 + }; + + // Supported geometry types according to ISO WKB + enum class GeometryType { + POINT = 1, + LINESTRING = 2, + POLYGON = 3, + MULTIPOINT = 4, + MULTILINESTRING = 5, + MULTIPOLYGON = 6, + GEOMETRYCOLLECTION = 7, + VALUE_MIN = 1, + VALUE_MAX = 7 + }; + + // BoundingBox represents the minimum bounding rectangle (or box) for a geometry. + // It supports up to 4 dimensions (X, Y, Z, M). + struct BoundingBox { + using XY = std::array; + using XYZ = std::array; + using XYM = std::array; + using XYZM = std::array; + + // Default constructor: initializes to an empty bounding box. + BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {} + // Constructor with explicit min/max values. + BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {} + BoundingBox(const BoundingBox& other) = default; + BoundingBox& operator=(const BoundingBox&) = default; + + // Update the bounding box to include a 2D coordinate. + void updateXY(const XY& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYZ). + void updateXYZ(const XYZ& coord) { + updateInternal(coord); + } + // Update the bounding box to include a 3D coordinate (XYM). 
+ void updateXYM(const XYM& coord) { + std::array dims = {0, 1, 3}; + for (int i = 0; i < 3; ++i) { + auto dim = dims[i]; + if (!std::isnan(min[dim]) && !std::isnan(max[dim])) { + min[dim] = std::min(min[dim], coord[i]); + max[dim] = std::max(max[dim], coord[i]); + } + } + } + // Update the bounding box to include a 4D coordinate (XYZM). + void updateXYZM(const XYZM& coord) { + updateInternal(coord); + } + + // Reset the bounding box to its initial empty state. + void reset() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + min[i] = INF; + max[i] = -INF; + } + } + + // Invalidate the bounding box (set all values to NaN). + void invalidate() { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } + } + + // Check if the bound for a given dimension is empty. + bool boundEmpty(int dim) const { + return std::isinf(min[dim] - max[dim]); + } + + // Check if the bound for a given dimension is valid (not NaN). + bool boundValid(int dim) const { + return !std::isnan(min[dim]) && !std::isnan(max[dim]); + } + + // Get the lower bound (min values). + const XYZM& lowerBound() const { + return min; + } + // Get the upper bound (max values). + const XYZM& upperBound() const { + return max; + } + + // Get validity for each dimension. + std::array dimensionValid() const { + return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)}; + } + // Get emptiness for each dimension. + std::array dimensionEmpty() const { + return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)}; + } + + // Merge another bounding box into this one. 
+ void merge(const BoundingBox& other) { + for (int i = 0; i < MAX_DIMENSIONS; ++i) { + if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) || + std::isnan(other.max[i])) { + min[i] = std::numeric_limits::quiet_NaN(); + max[i] = std::numeric_limits::quiet_NaN(); + } else { + min[i] = std::min(min[i], other.min[i]); + max[i] = std::max(max[i], other.max[i]); + } + } + } + + // Convert the bounding box to a string representation. + std::string toString() const; + + XYZM min; // Minimum values for each dimension + XYZM max; // Maximum values for each dimension + + private: + // Internal update function for XY, XYZ, or XYZM coordinates. + template + void updateInternal(const Coord& coord) { + for (size_t i = 0; i < coord.size(); ++i) { + if (!std::isnan(min[i]) && !std::isnan(max[i])) { + min[i] = std::min(min[i], coord[i]); + max[i] = std::max(max[i], coord[i]); + } + } + } + }; + + inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) { + return lhs.min == rhs.min && lhs.max == rhs.max; + } + inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) { + return !(lhs == rhs); + } + inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) { + os << obj.toString(); + return os; + } + +} // namespace orc::geospatial + +#endif // ORC_GEOSPATIAL_HH diff --git a/c++/include/orc/Statistics.hh b/c++/include/orc/Statistics.hh index 4ba8c35f7d..58169abe59 100644 --- a/c++/include/orc/Statistics.hh +++ b/c++/include/orc/Statistics.hh @@ -19,12 +19,11 @@ #ifndef ORC_STATISTICS_HH #define ORC_STATISTICS_HH +#include "orc/Geospatial.hh" #include "orc/Type.hh" #include "orc/Vector.hh" #include "orc/orc-config.hh" -#include - namespace orc { /** @@ -367,6 +366,33 @@ namespace orc { virtual int32_t getMaximumNanos() const = 0; }; + /** + * Statistics for Geometry and Geography + */ + class GeospatialColumnStatistics : public ColumnStatistics { + public: + virtual ~GeospatialColumnStatistics(); + + /** + * Get bounding 
box + * @return bounding box + */ + virtual const geospatial::BoundingBox& getBoundingBox() const = 0; + + /** + * Get geospatial types + * @return a sorted vector of unique geometry type IDs + */ + virtual std::vector getGeospatialTypes() const = 0; + + /** + * Update stats by a new value + * @param value new value to update + * @param length length of the value + */ + virtual void update(const char* value, size_t length) = 0; + }; + class Statistics { public: virtual ~Statistics(); diff --git a/c++/include/orc/Type.hh b/c++/include/orc/Type.hh index 82e0e3cc86..4bb794ff34 100644 --- a/c++/include/orc/Type.hh +++ b/c++/include/orc/Type.hh @@ -25,6 +25,18 @@ namespace orc { + namespace geospatial { + enum EdgeInterpolationAlgorithm { + SPHERICAL = 0, + VINCENTY = 1, + THOMAS = 2, + ANDOYER = 3, + KARNEY = 4 + }; + std::string AlgoToString(EdgeInterpolationAlgorithm algo); + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo); + } // namespace geospatial + enum TypeKind { BOOLEAN = 0, BYTE = 1, @@ -44,7 +56,9 @@ namespace orc { DATE = 15, VARCHAR = 16, CHAR = 17, - TIMESTAMP_INSTANT = 18 + TIMESTAMP_INSTANT = 18, + GEOMETRY = 19, + GEOGRAPHY = 20 }; class Type { @@ -59,6 +73,10 @@ namespace orc { virtual uint64_t getMaximumLength() const = 0; virtual uint64_t getPrecision() const = 0; virtual uint64_t getScale() const = 0; + // for geospatial types only + virtual const std::string& getCrs() const = 0; + // for geography type only + virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0; virtual Type& setAttribute(const std::string& key, const std::string& value) = 0; virtual bool hasAttributeKey(const std::string& key) const = 0; virtual Type& removeAttribute(const std::string& key) = 0; @@ -115,6 +133,10 @@ namespace orc { std::unique_ptr createListType(std::unique_ptr elements); std::unique_ptr createMapType(std::unique_ptr key, std::unique_ptr value); std::unique_ptr createUnionType(); + std::unique_ptr 
createGeometryType(const std::string& crs = "OGC:CRS84"); + std::unique_ptr createGeographyType( + const std::string& crs = "OGC:CRS84", + geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL); } // namespace orc #endif diff --git a/c++/include/orc/meson.build b/c++/include/orc/meson.build index 2e9e181991..e2524051f0 100644 --- a/c++/include/orc/meson.build +++ b/c++/include/orc/meson.build @@ -34,6 +34,7 @@ install_headers( 'ColumnPrinter.hh', 'Common.hh', 'Exceptions.hh', + 'Geospatial.hh', 'Int128.hh', 'MemoryPool.hh', 'OrcFile.hh', diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt index e378429f1e..09a0b148ed 100644 --- a/c++/src/CMakeLists.txt +++ b/c++/src/CMakeLists.txt @@ -171,6 +171,7 @@ set(SOURCE_FILES ConvertColumnReader.cc CpuInfoUtil.cc Exceptions.cc + Geospatial.cc Int128.cc LzoDecompressor.cc MemoryPool.cc diff --git a/c++/src/ColumnPrinter.cc b/c++/src/ColumnPrinter.cc index 8b16ecbd09..6535c612ce 100644 --- a/c++/src/ColumnPrinter.cc +++ b/c++/src/ColumnPrinter.cc @@ -254,6 +254,8 @@ namespace orc { break; case BINARY: + case GEOMETRY: + case GEOGRAPHY: result = std::make_unique(buffer, param); break; diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc index 0fd17de1b8..89ff0e0245 100644 --- a/c++/src/ColumnReader.cc +++ b/c++/src/ColumnReader.cc @@ -1747,6 +1747,8 @@ namespace orc { case CHAR: case STRING: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: switch (static_cast(stripe.getEncoding(type.getColumnId()).kind())) { case proto::ColumnEncoding_Kind_DICTIONARY: case proto::ColumnEncoding_Kind_DICTIONARY_V2: diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc index d31b1c65d4..c99890b88f 100644 --- a/c++/src/ColumnWriter.cc +++ b/c++/src/ColumnWriter.cc @@ -17,8 +17,11 @@ */ #include "orc/Int128.hh" +#include "orc/Statistics.hh" +#include "orc/Type.hh" #include "orc/Writer.hh" +#include #include "ByteRLE.hh" #include "ColumnWriter.hh" #include "RLE.hh" @@ -2871,6 +2874,65 @@ namespace orc { } } 
+ class GeospatialColumnWriter : public BinaryColumnWriter { + public: + GeospatialColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options) + : BinaryColumnWriter(type, factory, options), + isGeometry_(type.getKind() == TypeKind::GEOMETRY) {} + + virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues, + const char* incomingMask) override { + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const StringVectorBatch* strBatch = dynamic_cast(&rowBatch); + if (strBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + auto data = &strBatch->data[offset]; + auto length = &strBatch->length[offset]; + const char* notNull = strBatch->hasNulls ? strBatch->notNull.data() + offset : nullptr; + + bool hasNull = false; + GeospatialColumnStatisticsImpl* geoStats = nullptr; + if (isGeometry_) { + geoStats = dynamic_cast(colIndexStatistics.get()); + } + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + uint64_t len = static_cast(length[i]); + directDataStream->write(data[i], len); + + // update stats + if (geoStats) { + ++count; + geoStats->update(data[i], len); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + } else if (!hasNull) { + hasNull = true; + if (geoStats) { + geoStats->setHasNull(hasNull); + } + } + } + + directLengthEncoder->add(length, numValues, notNull); + + if (geoStats) { + geoStats->increase(count); + } + } + + private: + bool isGeometry_; + }; + std::unique_ptr buildWriter(const Type& type, const StreamsFactory& factory, const WriterOptions& options) { switch (static_cast(type.getKind())) { @@ -2941,6 +3003,9 @@ namespace orc { return std::make_unique(type, factory, options); case UNION: return std::make_unique(type, factory, options); + case GEOMETRY: + case GEOGRAPHY: + return std::make_unique(type, factory, options); default: throw NotImplementedYet( 
"Type is not supported yet for creating " diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc new file mode 100644 index 0000000000..6d7d268703 --- /dev/null +++ b/c++/src/Geospatial.cc @@ -0,0 +1,307 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This file contains code adapted from the Apache Arrow project. + * + * Original source: + * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc + * + * The original code is licensed under the Apache License, Version 2.0. + * + * Modifications may have been made from the original source. 
+ */ + +#include "orc/Geospatial.hh" +#include "orc/Exceptions.hh" + +#include "Geospatial.hh" + +#include +#include +#include +#include + +namespace orc::geospatial { + + template + inline std::enable_if_t, T> safeLoadAs(const uint8_t* unaligned) { + std::remove_const_t ret; + std::memcpy(&ret, unaligned, sizeof(T)); + return ret; + } + + template + inline std::enable_if_t && std::is_trivially_copyable_v && + sizeof(T) == sizeof(U), + U> + safeCopy(T value) { + std::remove_const_t ret; + std::memcpy(&ret, static_cast(&value), sizeof(T)); + return ret; + } + + static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; + } + +#if defined(_MSC_VER) +#include // IWYU pragma: keep +#define ORC_BYTE_SWAP64 _byteswap_uint64 +#define ORC_BYTE_SWAP32 _byteswap_ulong +#else +#define ORC_BYTE_SWAP64 __builtin_bswap64 +#define ORC_BYTE_SWAP32 __builtin_bswap32 +#endif + + // Swap the byte order (i.e. endianness); the MSVC intrinsics are spelled in lower case + static inline uint32_t byteSwap(uint32_t value) { + return static_cast(ORC_BYTE_SWAP32(value)); + } + static inline double byteSwap(double value) { + const uint64_t swapped = ORC_BYTE_SWAP64(safeCopy(value)); + return safeCopy(swapped); + } + + std::string BoundingBox::toString() const { + std::stringstream ss; + ss << "BoundingBox{xMin=" << min[0] << ", xMax=" << max[0] << ", yMin=" << min[1] + << ", yMax=" << max[1] << ", zMin=" << min[2] << ", zMax=" << max[2] << ", mMin=" << min[3] + << ", mMax=" << max[3] << "}"; + return ss.str(); + } + + /// \brief Object to keep track of the low-level consumption of a well-known binary + /// geometry + /// + /// Briefly, ISO well-known binary supported by the Parquet spec is an endian byte + /// (0x01 or 0x00), followed by geometry type + dimensions encoded as a (uint32_t), + /// followed by geometry-specific data. 
Coordinate sequences are represented by a + /// uint32_t (the number of coordinates) plus a sequence of doubles (number of coordinates + /// multiplied by the number of dimensions). + class WKBBuffer { + public: + WKBBuffer() : data_(nullptr), size_(0) {} + WKBBuffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + + uint8_t readUInt8() { + return readChecked(); + } + + uint32_t readUInt32(bool swap) { + auto value = readChecked(); + return swap ? byteSwap(value) : value; + } + + template + void readCoords(uint32_t nCoords, bool swap, Visit&& visit) { + size_t total_bytes = nCoords * sizeof(Coord); + if (size_ < total_bytes) { // truncated input: the unchecked reads below would overrun + throw ParseError("Can't read coordinates: too few bytes remaining in WKBBuffer"); + } + if (swap) { + Coord coord; + for (uint32_t i = 0; i < nCoords; i++) { + coord = readUnchecked(); + for (auto& c : coord) { + c = byteSwap(c); + } + + std::forward(visit)(coord); + } + } else { + for (uint32_t i = 0; i < nCoords; i++) { + std::forward(visit)(readUnchecked()); + } + } + } + + size_t size() const { + return size_; + } + + private: + const uint8_t* data_; + size_t size_; + + template + T readChecked() { + if (size_ < sizeof(T)) { + std::stringstream ss; + ss << "Can't read " << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining"; + throw ParseError(ss.str()); + } + + return readUnchecked(); + } + + template + T readUnchecked() { + T out = safeLoadAs(data_); + data_ += sizeof(T); + size_ -= sizeof(T); + return out; + } + }; + + using GeometryTypeAndDimensions = std::pair; + + namespace { + + std::optional parseGeometryType(uint32_t wkbGeometryType) { + // The number 1000 can be used because WKB geometry types are constructed + // on purpose such that this relationship is true (e.g., LINESTRING ZM maps + // to 3002). 
+ uint32_t geometryTypeComponent = wkbGeometryType % 1000; + uint32_t dimensionsComponent = wkbGeometryType / 1000; + + auto minGeometryTypeValue = static_cast(GeometryType::VALUE_MIN); + auto maxGeometryTypeValue = static_cast(GeometryType::VALUE_MAX); + auto minDimensionValue = static_cast(Dimensions::VALUE_MIN); + auto maxDimensionValue = static_cast(Dimensions::VALUE_MAX); + + if (geometryTypeComponent < minGeometryTypeValue || + geometryTypeComponent > maxGeometryTypeValue || dimensionsComponent < minDimensionValue || + dimensionsComponent > maxDimensionValue) { + return std::nullopt; + } + + return std::make_optional( + GeometryTypeAndDimensions{static_cast(geometryTypeComponent), + static_cast(dimensionsComponent)}); + } + + } // namespace + + std::vector WKBGeometryBounder::geometryTypes() const { + std::vector out(geospatialTypes_.begin(), geospatialTypes_.end()); + std::sort(out.begin(), out.end()); + return out; + } + + void WKBGeometryBounder::mergeGeometry(std::string_view bytesWkb) { + if (!isValid_) { + return; + } + mergeGeometry(reinterpret_cast(bytesWkb.data()), bytesWkb.size()); + } + + void WKBGeometryBounder::mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) { + if (!isValid_) { + return; + } + WKBBuffer src{bytesWkb, static_cast(bytesSize)}; + try { + mergeGeometryInternal(&src, /*recordWkbType=*/true); + } catch (const ParseError&) { + invalidate(); + return; + } + if (src.size() != 0) { + // Expected zero bytes to remain after consuming the WKB value; leftover bytes invalidate the stats + invalidate(); + } + } + + void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool recordWkbType) { + uint8_t endian = src->readUInt8(); + bool swap = endian != 0x00; + if (isLittleEndian()) { + swap = endian != 0x01; + } + + uint32_t wkbGeometryType = src->readUInt32(swap); + auto geometryTypeAndDimensions = parseGeometryType(wkbGeometryType); + if (!geometryTypeAndDimensions.has_value()) { + invalidate(); + return; + } + auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value(); 
+ + // Keep track of geometry types encountered if at the top level + if (recordWkbType) { + geospatialTypes_.insert(static_cast(wkbGeometryType)); + } + + switch (geometry_type) { + case GeometryType::POINT: + mergeSequence(src, dimensions, 1, swap); + break; + + case GeometryType::LINESTRING: { + uint32_t nCoords = src->readUInt32(swap); + mergeSequence(src, dimensions, nCoords, swap); + break; + } + case GeometryType::POLYGON: { + uint32_t n_parts = src->readUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + uint32_t nCoords = src->readUInt32(swap); + mergeSequence(src, dimensions, nCoords, swap); + } + break; + } + + // These are all encoded the same in WKB, even though this encoding would + // allow for parts to be of a different geometry type or different dimensions. + // For the purposes of bounding, this does not cause us problems. We pass + // record_wkb_type = false because we do not want the child geometry to be + // added to the geometry_types list (e.g., for a MultiPoint, we only want + // the code for MultiPoint to be added, not the code for Point). 
+ case GeometryType::MULTIPOINT: + case GeometryType::MULTILINESTRING: + case GeometryType::MULTIPOLYGON: + case GeometryType::GEOMETRYCOLLECTION: { + uint32_t n_parts = src->readUInt32(swap); + for (uint32_t i = 0; i < n_parts; i++) { + mergeGeometryInternal(src, /*record_wkb_type*/ false); + } + break; + } + } + } + + void WKBGeometryBounder::mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, + bool swap) { + switch (dimensions) { + case Dimensions::XY: + src->readCoords(nCoords, swap, + [&](BoundingBox::XY coord) { box_.updateXY(coord); }); + break; + case Dimensions::XYZ: + src->readCoords(nCoords, swap, + [&](BoundingBox::XYZ coord) { box_.updateXYZ(coord); }); + break; + case Dimensions::XYM: + src->readCoords(nCoords, swap, + [&](BoundingBox::XYM coord) { box_.updateXYM(coord); }); + break; + case Dimensions::XYZM: + src->readCoords( + nCoords, swap, [&](BoundingBox::XYZM coord) { box_.updateXYZM(coord); }); + break; + default: + invalidate(); + } + } + +} // namespace orc::geospatial diff --git a/c++/src/Geospatial.hh b/c++/src/Geospatial.hh new file mode 100644 index 0000000000..aebb72747a --- /dev/null +++ b/c++/src/Geospatial.hh @@ -0,0 +1,86 @@ + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_GEOSPATIAL_IMPL_HH +#define ORC_GEOSPATIAL_IMPL_HH + +#include "orc/Geospatial.hh" + +#include +#include + +namespace orc { + namespace geospatial { + class WKBBuffer; + + class WKBGeometryBounder { + public: + void mergeGeometry(std::string_view bytesWkb); + void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize); + + void mergeBox(const BoundingBox& box) { + box_.merge(box); + } + void mergeGeometryTypes(const std::vector& geospatialTypes) { + geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end()); + } + void merge(const WKBGeometryBounder& other) { + if (!isValid() || !other.isValid()) { + invalidate(); + return; + } + box_.merge(other.box_); + geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end()); + } + + // Get the bounding box for the merged geometries. + const BoundingBox& bounds() const { + return box_; + } + + // Get the set of geometry types encountered during merging. + // Returns a sorted vector of geometry type IDs. 
+ std::vector geometryTypes() const; + + void reset() { + isValid_ = true; + box_.reset(); + geospatialTypes_.clear(); + } + bool isValid() const { + return isValid_; + } + void invalidate() { + isValid_ = false; + box_.invalidate(); + geospatialTypes_.clear(); + } + + private: + BoundingBox box_; + std::unordered_set geospatialTypes_; + bool isValid_ = true; + + void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType); + void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap); + }; + } // namespace geospatial +} // namespace orc + +#endif diff --git a/c++/src/Reader.cc b/c++/src/Reader.cc index 17bf835203..349ae1b407 100644 --- a/c++/src/Reader.cc +++ b/c++/src/Reader.cc @@ -873,6 +873,8 @@ namespace orc { case proto::Type_Kind_CHAR: case proto::Type_Kind_STRING: case proto::Type_Kind_VARCHAR: + case proto::Type_Kind_GEOMETRY: + case proto::Type_Kind_GEOGRAPHY: return 4; default: return 0; diff --git a/c++/src/SchemaEvolution.cc b/c++/src/SchemaEvolution.cc index 7cf3b5c512..442c43c228 100644 --- a/c++/src/SchemaEvolution.cc +++ b/c++/src/SchemaEvolution.cc @@ -18,6 +18,7 @@ #include "SchemaEvolution.hh" #include "orc/Exceptions.hh" +#include "orc/Type.hh" namespace orc { @@ -113,7 +114,9 @@ namespace orc { case TIMESTAMP: case TIMESTAMP_INSTANT: case DATE: - case BINARY: { + case BINARY: + case GEOMETRY: + case GEOGRAPHY: { // Not support break; } @@ -235,6 +238,8 @@ namespace orc { case FLOAT: case DOUBLE: case BINARY: + case GEOMETRY: + case GEOGRAPHY: case TIMESTAMP: case LIST: case MAP: diff --git a/c++/src/Statistics.cc b/c++/src/Statistics.cc index c1a23cad16..a86247f107 100644 --- a/c++/src/Statistics.cc +++ b/c++/src/Statistics.cc @@ -44,6 +44,8 @@ namespace orc { return new DateColumnStatisticsImpl(s, statContext); } else if (s.has_binary_statistics()) { return new BinaryColumnStatisticsImpl(s, statContext); + } else if (s.has_geospatial_statistics()) { + return new GeospatialColumnStatisticsImpl(s); } else { return 
new ColumnStatisticsImpl(s); } @@ -148,6 +150,10 @@ namespace orc { // PASS } + GeospatialColumnStatistics::~GeospatialColumnStatistics() { + // PASS + } + ColumnStatisticsImpl::~ColumnStatisticsImpl() { // PASS } @@ -188,6 +194,10 @@ namespace orc { // PASS } + GeospatialColumnStatisticsImpl::~GeospatialColumnStatisticsImpl() { + // PASS + } + ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) { stats_.setNumberOfValues(pb.number_of_values()); stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true); @@ -391,6 +401,40 @@ namespace orc { } } + GeospatialColumnStatisticsImpl::GeospatialColumnStatisticsImpl( + const proto::ColumnStatistics& pb) { + reset(); + if (!pb.has_geospatial_statistics()) { + bounder_.invalidate(); + } else { + const proto::GeospatialStatistics& stats = pb.geospatial_statistics(); + geospatial::BoundingBox::XYZM min; + geospatial::BoundingBox::XYZM max; + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + min[i] = max[i] = std::numeric_limits::quiet_NaN(); + } + if (stats.has_bbox()) { + const auto& protoBBox = stats.bbox(); + min[0] = protoBBox.xmin(); + min[1] = protoBBox.ymin(); + max[0] = protoBBox.xmax(); + max[1] = protoBBox.ymax(); + if (protoBBox.has_zmin() && protoBBox.has_zmax()) { + min[2] = protoBBox.zmin(); + max[2] = protoBBox.zmax(); + } + if (protoBBox.has_mmin() && protoBBox.has_mmax()) { + min[3] = protoBBox.mmin(); + max[3] = protoBBox.mmax(); + } + } + bounder_.mergeBox(geospatial::BoundingBox(min, max)); + std::vector types = {stats.geospatial_types().begin(), + stats.geospatial_types().end()}; + bounder_.mergeGeometryTypes(types); + } + } + std::unique_ptr createColumnStatistics(const Type& type) { switch (static_cast(type.getKind())) { case BOOLEAN: @@ -422,6 +466,9 @@ namespace orc { return std::make_unique(); case DECIMAL: return std::make_unique(); + case GEOGRAPHY: + case GEOMETRY: + return std::make_unique(); default: throw NotImplementedYet("Not supported type: " + 
type.toString()); } diff --git a/c++/src/Statistics.hh b/c++/src/Statistics.hh index b7ed5d1e56..94b1e5d2b2 100644 --- a/c++/src/Statistics.hh +++ b/c++/src/Statistics.hh @@ -24,6 +24,7 @@ #include "orc/OrcFile.hh" #include "orc/Reader.hh" +#include "Geospatial.hh" #include "Timezone.hh" #include "TypeImpl.hh" @@ -1683,6 +1684,127 @@ namespace orc { } }; + class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics, + public MutableColumnStatistics { + private: + geospatial::WKBGeometryBounder bounder_; + InternalCharStatistics stats_; + + public: + GeospatialColumnStatisticsImpl() { + reset(); + } + explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~GeospatialColumnStatisticsImpl(); + + uint64_t getNumberOfValues() const override { + return stats_.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + stats_.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + stats_.setNumberOfValues(stats_.getNumberOfValues() + count); + } + + bool hasNull() const override { + return stats_.hasNull(); + } + + void setHasNull(bool hasNull) override { + stats_.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + const GeospatialColumnStatisticsImpl& geoStats = + dynamic_cast(other); + stats_.merge(geoStats.stats_); + bounder_.merge(geoStats.bounder_); + } + + void reset() override { + stats_.reset(); + bounder_.reset(); + } + + void update(const char* value, size_t length) override { + bounder_.mergeGeometry(std::string_view(value, length)); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_has_null(stats_.hasNull()); + pbStats.set_number_of_values(stats_.getNumberOfValues()); + + proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics(); + const auto& bbox = bounder_.bounds(); + if (bbox.boundValid(0) && bbox.boundValid(1) && !bbox.boundEmpty(0) && !bbox.boundEmpty(1)) { + 
geoStats->mutable_bbox()->set_xmin(bbox.min[0]); + geoStats->mutable_bbox()->set_xmax(bbox.max[0]); + geoStats->mutable_bbox()->set_ymin(bbox.min[1]); + geoStats->mutable_bbox()->set_ymax(bbox.max[1]); + if (bbox.boundValid(2) && !bbox.boundEmpty(2)) { + geoStats->mutable_bbox()->set_zmin(bbox.min[2]); + geoStats->mutable_bbox()->set_zmax(bbox.max[2]); + } + if (bbox.boundValid(3) && !bbox.boundEmpty(3)) { + geoStats->mutable_bbox()->set_mmin(bbox.min[3]); + geoStats->mutable_bbox()->set_mmax(bbox.max[3]); + } + } + for (auto type : bounder_.geometryTypes()) { + geoStats->add_geospatial_types(type); + } + } + + std::string toString() const override { + if (!bounder_.isValid()) { + return " invalid"; + } + + std::stringstream ss; + ss << ""; + + std::string dim_label("xyzm"); + const auto& bbox = bounder_.bounds(); + auto dim_valid = bbox.dimensionValid(); + auto dim_empty = bbox.dimensionEmpty(); + auto lower = bbox.lowerBound(); + auto upper = bbox.upperBound(); + + for (int i = 0; i < 4; i++) { + ss << " " << dim_label[i] << ": "; + if (!dim_valid[i]) { + ss << "invalid"; + } else if (dim_empty[i]) { + ss << "empty"; + } else { + ss << "[" << lower[i] << ", " << upper[i] << "]"; + } + } + + std::vector maybe_geometry_types = bounder_.geometryTypes(); + ss << " geometry_types: ["; + std::string sep(""); + for (int32_t geometry_type : maybe_geometry_types) { + ss << sep << geometry_type; + sep = ", "; + } + ss << "]"; + + return ss.str(); + } + + const geospatial::BoundingBox& getBoundingBox() const override { + return bounder_.bounds(); + } + + std::vector getGeospatialTypes() const override { + return bounder_.geometryTypes(); + } + }; + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, const StatContext& statContext); diff --git a/c++/src/TypeImpl.cc b/c++/src/TypeImpl.cc index cbc7b82796..18c4985ab1 100644 --- a/c++/src/TypeImpl.cc +++ b/c++/src/TypeImpl.cc @@ -19,8 +19,10 @@ #include "TypeImpl.hh" #include "Adaptor.hh" #include 
"orc/Exceptions.hh" +#include "orc/Type.hh" #include +#include #include namespace orc { @@ -62,6 +64,33 @@ namespace orc { subtypeCount_ = 0; } + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = geospatial::EdgeInterpolationAlgorithm::SPHERICAL; + } + + TypeImpl::TypeImpl(TypeKind kind, const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { + parent_ = nullptr; + columnId_ = -1; + maximumColumnId_ = -1; + kind_ = kind; + maxLength_ = 0; + precision_ = 0; + scale_ = 0; + subtypeCount_ = 0; + crs_ = crs; + edgeInterpolationAlgorithm_ = algo; + } + uint64_t TypeImpl::assignIds(uint64_t root) const { columnId_ = static_cast(root); uint64_t current = root + 1; @@ -120,6 +149,14 @@ namespace orc { return scale_; } + const std::string& TypeImpl::getCrs() const { + return crs_; + } + + geospatial::EdgeInterpolationAlgorithm TypeImpl::getAlgorithm() const { + return edgeInterpolationAlgorithm_; + } + Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) { attributes_[key] = value; return *this; @@ -189,6 +226,45 @@ namespace orc { return true; } + namespace geospatial { + std::string AlgoToString(EdgeInterpolationAlgorithm algo) { + switch (algo) { + case EdgeInterpolationAlgorithm::SPHERICAL: + return "spherical"; + case VINCENTY: + return "vincenty"; + case THOMAS: + return "thomas"; + case ANDOYER: + return "andoyer"; + case KARNEY: + return "karney"; + default: + throw InvalidArgument("Unknown algo"); + } + } + + EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo) { + if (algo == "spherical" || algo == "speherial") { // also accept the legacy misspelling + return EdgeInterpolationAlgorithm::SPHERICAL; + } + if (algo == "vincenty") { + return VINCENTY; + } + if (algo == "thomas") { + return THOMAS; + } + if (algo == "andoyer") { + return ANDOYER; + } + if (algo == 
"karney") { + return KARNEY; + } + throw InvalidArgument("Unknown algo: " + algo); + } + + } // namespace geospatial + std::string TypeImpl::toString() const { switch (static_cast(kind_)) { case BOOLEAN: @@ -271,6 +347,17 @@ namespace orc { result << "char(" << maxLength_ << ")"; return result.str(); } + case GEOMETRY: { + std::stringstream result; + result << "geometry(" << crs_ << ")"; + return result.str(); + } + case GEOGRAPHY: { + std::stringstream result; + result << "geography(" << crs_ << "," + << geospatial::AlgoToString(edgeInterpolationAlgorithm_) << ")"; + return result.str(); + } default: throw NotImplementedYet("Unknown type"); } @@ -322,6 +409,8 @@ namespace orc { case BINARY: case CHAR: case VARCHAR: + case GEOMETRY: + case GEOGRAPHY: return encoded ? std::make_unique(capacity, memoryPool) : std::make_unique(capacity, memoryPool); @@ -419,6 +508,15 @@ namespace orc { return std::make_unique(UNION); } + std::unique_ptr createGeometryType(const std::string& crs) { + return std::make_unique(GEOMETRY, crs); + } + + std::unique_ptr createGeographyType(const std::string& crs, + geospatial::EdgeInterpolationAlgorithm algo) { + return std::make_unique(GEOGRAPHY, crs, algo); + } + std::string printProtobufMessage(const google::protobuf::Message& message); std::unique_ptr convertType(const proto::Type& type, const proto::Footer& footer) { std::unique_ptr ret; @@ -443,6 +541,16 @@ namespace orc { ret = std::make_unique(static_cast(type.kind()), type.maximum_length()); break; + case proto::Type_Kind_GEOMETRY: + ret = std::make_unique(static_cast(type.kind()), type.crs()); + break; + + case proto::Type_Kind_GEOGRAPHY: + ret = std::make_unique( + static_cast(type.kind()), type.crs(), + static_cast(type.algorithm())); + break; + case proto::Type_Kind_DECIMAL: ret = std::make_unique(DECIMAL, type.precision(), type.scale()); break; @@ -523,6 +631,13 @@ namespace orc { case CHAR: result = std::make_unique(fileType->getKind(), fileType->getMaximumLength()); break; + 
case GEOMETRY: + result = std::make_unique(fileType->getKind(), fileType->getCrs()); + break; + case GEOGRAPHY: + result = std::make_unique(fileType->getKind(), fileType->getCrs(), + fileType->getAlgorithm()); + break; case LIST: result = std::make_unique(fileType->getKind()); @@ -710,6 +825,22 @@ namespace orc { return std::make_unique(DECIMAL, precision, scale); } + std::unique_ptr TypeImpl::parseGeographyType(const std::string& input, size_t start, + size_t end) { // parses "(crs,algorithm" found in input[start, end) + if (input[start] != '(') { + throw std::logic_error("Missing ( after geography."); + } + size_t pos = start + 1; + size_t sep = input.find(',', pos); + if (sep == std::string::npos || sep + 1 >= end) { // no ',' at all, or nothing between it and end + throw std::logic_error("Geography type must specify CRS."); + } + std::string crs = input.substr(pos, sep - pos); + std::string algoStr = input.substr(sep + 1, end - sep - 1); + geospatial::EdgeInterpolationAlgorithm algo = geospatial::AlgoFromString(algoStr); + return std::make_unique(GEOGRAPHY, crs, algo); + } + void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) { if (input[pos] == '<' || input[pos] == '(') { std::ostringstream oss; @@ -780,6 +911,14 @@ namespace orc { uint64_t maxLength = static_cast(atoi(input.substr(start + 1, end - start + 1).c_str())); return std::make_unique(CHAR, maxLength); + } else if (category == "geometry") { + if (input[start] != '(') { + throw std::logic_error("Missing ( after geometry."); + } + std::string crs = input.substr(start + 1, end - start - 1); // only the text strictly between start and end + return std::make_unique(GEOMETRY, crs); + } else if (category == "geography") { + return parseGeographyType(input, start, end); } else { throw std::logic_error("Unknown type " + category); } diff --git a/c++/src/TypeImpl.hh b/c++/src/TypeImpl.hh index 647d5a5d2c..2db175aba6 100644 --- a/c++/src/TypeImpl.hh +++ b/c++/src/TypeImpl.hh @@ -24,6 +24,7 @@ #include "Adaptor.hh" #include "wrap/orc-proto-wrapper.hh" +#include #include namespace orc { @@ -41,6 +42,9 @@ namespace orc {
uint64_t precision_; uint64_t scale_; std::map attributes_; + std::string crs_; + geospatial::EdgeInterpolationAlgorithm edgeInterpolationAlgorithm_ = + geospatial::EdgeInterpolationAlgorithm::SPHERICAL; public: /** @@ -58,6 +62,16 @@ namespace orc { */ TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale); + /** + * Create geometry type. + */ + TypeImpl(TypeKind kind, const std::string& crs); + + /** + * Create geography type. + */ + TypeImpl(TypeKind kind, const std::string& crs, geospatial::EdgeInterpolationAlgorithm algo); + uint64_t getColumnId() const override; uint64_t getMaximumColumnId() const override; @@ -76,6 +90,10 @@ namespace orc { uint64_t getScale() const override; + const std::string& getCrs() const override; + + geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override; + Type& setAttribute(const std::string& key, const std::string& value) override; bool hasAttributeKey(const std::string& key) const override; @@ -176,6 +194,14 @@ namespace orc { static std::unique_ptr parseDecimalType(const std::string& input, size_t start, size_t end); + /** + * Parse geography type from string + * @param input the input string of a geography type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr parseGeographyType(const std::string& input, size_t start, + size_t end); /** * Parse type for a category * @param category type name diff --git a/c++/src/Writer.cc b/c++/src/Writer.cc index 775e6d2452..c235169cca 100644 --- a/c++/src/Writer.cc +++ b/c++/src/Writer.cc @@ -24,6 +24,7 @@ #include "Utils.hh" #include +#include namespace orc { @@ -702,6 +703,40 @@ namespace orc { protoType.set_kind(proto::Type_Kind_CHAR); break; } + case GEOMETRY: { + protoType.set_kind(proto::Type_Kind_GEOMETRY); + protoType.set_crs(t.getCrs()); + break; + } + case GEOGRAPHY: { + protoType.set_kind(proto::Type_Kind_GEOGRAPHY); + protoType.set_crs(t.getCrs()); + switch (t.getAlgorithm()) { + case
geospatial::EdgeInterpolationAlgorithm::SPHERICAL: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::VINCENTY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::THOMAS: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::ANDOYER: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER); + break; + } + case orc::geospatial::EdgeInterpolationAlgorithm::KARNEY: { + protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY); + break; + } + default: + throw std::invalid_argument("Unknown Algorithm."); + } + break; + } default: throw std::logic_error("Unknown type."); } diff --git a/c++/src/meson.build b/c++/src/meson.build index 3d77d3242e..0794dec843 100644 --- a/c++/src/meson.build +++ b/c++/src/meson.build @@ -151,6 +151,7 @@ source_files += files( 'ConvertColumnReader.cc', 'CpuInfoUtil.cc', 'Exceptions.cc', + 'Geospatial.cc', 'Int128.cc', 'LzoDecompressor.cc', 'MemoryPool.cc', diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt index f7328abb32..3261fedde6 100644 --- a/c++/test/CMakeLists.txt +++ b/c++/test/CMakeLists.txt @@ -56,12 +56,14 @@ add_executable (orc-test TestRleEncoder.cc TestRLEV2Util.cc TestSargsApplier.cc + TestStatistics.cc TestSearchArgument.cc TestSchemaEvolution.cc TestStripeIndexStatistics.cc TestTimestampStatistics.cc TestTimezone.cc TestType.cc + TestUtil.cc TestWriter.cc TestCache.cc ${SIMD_TEST_SRCS} diff --git a/c++/test/TestColumnStatistics.cc b/c++/test/TestColumnStatistics.cc index 5cf2d9e41b..642a8019de 100644 --- a/c++/test/TestColumnStatistics.cc +++ b/c++/test/TestColumnStatistics.cc @@ -17,6 +17,7 @@ */ #include "Statistics.hh" +#include "TestUtil.hh" #include "orc/OrcFile.hh" #include "wrap/gmock.h" #include 
"wrap/gtest-wrapper.h" @@ -531,4 +532,345 @@ namespace orc { collectionStats->merge(*other); EXPECT_FALSE(collectionStats->hasTotalChildren()); } + + TEST(ColumnStatistics, TestGeospatialDefaults) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + auto bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); + } + EXPECT_EQ(" x: empty y: empty z: empty m: empty geometry_types: []", + geoStats->toString()); + } + + TEST(ColumnStatistics, TestGeospatialUpdate) { + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + EXPECT_TRUE(geoStats->getGeospatialTypes().empty()); + const auto& bbox = geoStats->getBoundingBox(); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + EXPECT_TRUE(bbox.boundEmpty(i)); + EXPECT_TRUE(bbox.boundValid(i)); + } + EXPECT_EQ(geoStats->getGeospatialTypes().size(), 0); + + geospatial::BoundingBox::XYZM expectedMin; + geospatial::BoundingBox::XYZM expectedMax; + std::array expectedEmpty; + std::array expectedValid; + std::vector expectedTypes; + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + expectedMin[i] = geospatial::INF; + expectedMax[i] = -geospatial::INF; + expectedEmpty[i] = true; + expectedValid[i] = true; + } + + auto Verify = [&]() { + EXPECT_EQ(expectedEmpty, geoStats->getBoundingBox().dimensionEmpty()); + EXPECT_EQ(expectedValid, geoStats->getBoundingBox().dimensionValid()); + EXPECT_EQ(expectedTypes, geoStats->getGeospatialTypes()); + for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) { + if (geoStats->getBoundingBox().boundValid(i)) { + EXPECT_EQ(expectedMin[i], geoStats->getBoundingBox().lowerBound()[i]); + EXPECT_EQ(expectedMax[i], geoStats->getBoundingBox().upperBound()[i]); + } else { + EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().lowerBound()[i])); + 
EXPECT_TRUE(std::isnan(geoStats->getBoundingBox().upperBound()[i])); + } + } + }; + + // Update a xy point + std::string xy0 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy0.c_str(), xy0.size()); + expectedMin[0] = expectedMax[0] = 10; + expectedMin[1] = expectedMax[1] = 11; + expectedEmpty[0] = expectedEmpty[1] = false; + expectedTypes.push_back(1); + Verify(); + + // Update a xyz point. + std::string xyz0 = MakeWKBPoint({11, 12, 13}, true, false); + geoStats->update(xyz0.c_str(), xyz0.size()); + expectedMax[0] = 11; + expectedMax[1] = 12; + expectedMin[2] = expectedMax[2] = 13; + expectedEmpty[2] = false; + expectedTypes.push_back(1001); + Verify(); + + // Update a xym point. + std::string xym0 = MakeWKBPoint({9, 10, 0, 11}, false, true); + geoStats->update(xym0.c_str(), xym0.size()); + expectedMin[0] = 9; + expectedMin[1] = 10; + expectedMin[3] = expectedMax[3] = 11; + expectedEmpty[3] = false; + expectedTypes.push_back(2001); + Verify(); + + // Update a xymz point. + std::string xymz0 = MakeWKBPoint({8, 9, 10, 12}, true, true); + geoStats->update(xymz0.c_str(), xymz0.size()); + expectedMin[0] = 8; + expectedMin[1] = 9; + expectedMin[2] = 10; + expectedMax[3] = 12; + expectedTypes.push_back(3001); + Verify(); + + // Update NaN to every dimension. 
+ std::string xyzm1 = MakeWKBPoint( + {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}, + true, false); + geoStats->update(xyzm1.c_str(), xyzm1.size()); + Verify(); + + // Update a invalid WKB + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + expectedValid[0] = expectedValid[1] = expectedValid[2] = expectedValid[3] = false; + expectedTypes.clear(); + Verify(); + + // Update a xy point again + std::string xy1 = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy1.c_str(), xy1.size()); + Verify(); + } + + TEST(ColumnStatistics, TestGeospatialToProto) { + // Test Empty + std::unique_ptr geoStats(new GeospatialColumnStatisticsImpl()); + proto::ColumnStatistics pbStats; + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + + // Update a xy point + std::string xy = MakeWKBPoint({10, 11}, false, false); + geoStats->update(xy.c_str(), xy.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox0 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox0.has_xmin()); + EXPECT_TRUE(bbox0.has_xmax()); + EXPECT_TRUE(bbox0.has_ymin()); + EXPECT_TRUE(bbox0.has_ymax()); + EXPECT_FALSE(bbox0.has_zmin()); + EXPECT_FALSE(bbox0.has_zmax()); + EXPECT_FALSE(bbox0.has_mmin()); + EXPECT_FALSE(bbox0.has_mmax()); + EXPECT_EQ(10, bbox0.xmin()); + EXPECT_EQ(10, bbox0.xmax()); + EXPECT_EQ(11, bbox0.ymin()); + EXPECT_EQ(11, bbox0.ymax()); + + // Update a xyzm point. 
+ std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + geoStats->update(xyzm.c_str(), xyzm.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(2, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_EQ(1, pbStats.geospatial_statistics().geospatial_types(0)); + EXPECT_EQ(3001, pbStats.geospatial_statistics().geospatial_types(1)); + EXPECT_TRUE(pbStats.geospatial_statistics().has_bbox()); + const auto& bbox1 = pbStats.geospatial_statistics().bbox(); + EXPECT_TRUE(bbox1.has_xmin()); + EXPECT_TRUE(bbox1.has_xmax()); + EXPECT_TRUE(bbox1.has_ymin()); + EXPECT_TRUE(bbox1.has_ymax()); + EXPECT_TRUE(bbox1.has_zmin()); + EXPECT_TRUE(bbox1.has_zmax()); + EXPECT_TRUE(bbox1.has_mmin()); + EXPECT_TRUE(bbox1.has_mmax()); + EXPECT_EQ(-10, bbox1.xmin()); + EXPECT_EQ(10, bbox1.xmax()); + EXPECT_EQ(-11, bbox1.ymin()); + EXPECT_EQ(11, bbox1.ymax()); + EXPECT_EQ(-12, bbox1.zmin()); + EXPECT_EQ(-12, bbox1.zmax()); + EXPECT_EQ(-13, bbox1.mmin()); + EXPECT_EQ(-13, bbox1.mmax()); + + // Update a invalid point + std::string invalidWKB; + geoStats->update(invalidWKB.c_str(), invalidWKB.size()); + pbStats.Clear(); + geoStats->toProtoBuf(pbStats); + EXPECT_TRUE(pbStats.has_geospatial_statistics()); + EXPECT_EQ(0, pbStats.geospatial_statistics().geospatial_types().size()); + EXPECT_FALSE(pbStats.geospatial_statistics().has_bbox()); + } + + TEST(ColumnStatistics, TestGeospatialMerge) { + std::unique_ptr invalidStats( + new GeospatialColumnStatisticsImpl()); + invalidStats->update("0", 0); + + std::unique_ptr emptyStats( + new GeospatialColumnStatisticsImpl()); + + std::unique_ptr xyStats(new GeospatialColumnStatisticsImpl()); + std::string xy = MakeWKBPoint({10, 11}, false, false); + xyStats->update(xy.c_str(), xy.size()); + + std::unique_ptr xyzStats(new GeospatialColumnStatisticsImpl()); + std::string xyz = MakeWKBPoint({12, 13, 14}, true, false); + xyzStats->update(xyz.c_str(), 
xyz.size()); + + std::unique_ptr xyzmStats(new GeospatialColumnStatisticsImpl()); + std::string xyzm = MakeWKBPoint({-10, -11, -12, -13}, true, true); + xyzmStats->update(xyzm.c_str(), xyzm.size()); + + // invalid merge invalid + invalidStats->merge(*invalidStats); + std::array expectedValid = {false, false, false, false}; + EXPECT_EQ(invalidStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(invalidStats->getGeospatialTypes().size(), 0); + + // Empty merge empty + emptyStats->merge(*emptyStats); + expectedValid = {true, true, true, true}; + std::array expectedEmpty = {true, true, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + + // Empty merge xy + emptyStats->merge(*xyStats); + expectedEmpty = {false, false, true, true}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(10, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(11, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + + // Empty merge xyz + emptyStats->merge(*xyzStats); + expectedEmpty = {false, false, false, true}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(14, 
emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + + // Empty merge xyzm + emptyStats->merge(*xyzmStats); + expectedEmpty = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getBoundingBox().dimensionEmpty(), expectedEmpty); + EXPECT_EQ(-10, emptyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(12, emptyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(-11, emptyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(13, emptyStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(-12, emptyStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(14, emptyStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(-13, emptyStats->getBoundingBox().upperBound()[3]); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(emptyStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(emptyStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(emptyStats->getGeospatialTypes()[2], 3001); + + // Empty merge invalid + emptyStats->merge(*invalidStats); + expectedValid = {false, false, false, false}; + EXPECT_EQ(emptyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(emptyStats->getGeospatialTypes().size(), 0); + } + + TEST(ColumnStatistics, TestGeospatialFromProto) { + proto::ColumnStatistics pbStats; + // No geostats + + std::unique_ptr emptyStats0( + new GeospatialColumnStatisticsImpl(pbStats)); + std::array expectedValid = {false, false, false, false}; + EXPECT_TRUE(emptyStats0->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats0->getBoundingBox().dimensionValid(), expectedValid); + + // Add empty geostats + pbStats.mutable_geospatial_statistics(); + std::unique_ptr emptyStats1( + new 
GeospatialColumnStatisticsImpl(pbStats)); + EXPECT_TRUE(emptyStats1->getGeospatialTypes().empty()); + EXPECT_EQ(emptyStats1->getBoundingBox().dimensionValid(), expectedValid); + + // Set xy bounds + auto* geoProtoStas = pbStats.mutable_geospatial_statistics(); + geoProtoStas->mutable_bbox()->set_xmin(0); + geoProtoStas->mutable_bbox()->set_xmax(1); + geoProtoStas->mutable_bbox()->set_ymin(0); + geoProtoStas->mutable_bbox()->set_ymax(1); + geoProtoStas->mutable_geospatial_types()->Add(2); + std::unique_ptr xyStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, false, false}; + EXPECT_EQ(xyStats->getGeospatialTypes().size(), 1); + EXPECT_EQ(xyStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyStats->getBoundingBox().upperBound()[1]); + + // Set xyz bounds + geoProtoStas->mutable_bbox()->set_zmin(0); + geoProtoStas->mutable_bbox()->set_zmax(1); + geoProtoStas->mutable_geospatial_types()->Add(1003); + std::unique_ptr xyzStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, false}; + EXPECT_EQ(xyzStats->getGeospatialTypes().size(), 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzStats->getBoundingBox().upperBound()[2]); + + // Set xyzm bounds + geoProtoStas->mutable_bbox()->set_mmin(0); + 
geoProtoStas->mutable_bbox()->set_mmax(1); + geoProtoStas->mutable_geospatial_types()->Add(3003); + std::unique_ptr xyzmStats( + new GeospatialColumnStatisticsImpl(pbStats)); + expectedValid = {true, true, true, true}; + EXPECT_EQ(xyzmStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[0], 2); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[1], 1003); + EXPECT_EQ(xyzmStats->getGeospatialTypes()[2], 3003); + EXPECT_EQ(xyzmStats->getBoundingBox().dimensionValid(), expectedValid); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[0]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[0]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[1]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[1]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[2]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[2]); + EXPECT_EQ(0, xyzmStats->getBoundingBox().lowerBound()[3]); + EXPECT_EQ(1, xyzmStats->getBoundingBox().upperBound()[3]); + } + } // namespace orc diff --git a/c++/test/TestStatistics.cc b/c++/test/TestStatistics.cc new file mode 100644 index 0000000000..61c5e08cb6 --- /dev/null +++ b/c++/test/TestStatistics.cc @@ -0,0 +1,230 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/OrcFile.hh" + +#include "MemoryInputStream.hh" +#include "MemoryOutputStream.hh" +#include "TestUtil.hh" + +#include "wrap/gtest-wrapper.h" + +#include +#include +#include + +namespace orc { + +#define ENSURE_DYNAMIC_CAST_NOT_NULL(PTR) \ + if (PTR == NULL) throw std::logic_error("dynamic_cast returns null"); + + const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M + + static std::unique_ptr createWriter(uint64_t stripeSize, const Type& type, + MemoryPool* memoryPool, OutputStream* stream) { + WriterOptions options; + options.setStripeSize(stripeSize); + options.setCompressionBlockSize(256); + options.setMemoryBlockSize(256); + options.setCompression(CompressionKind_ZLIB); + options.setMemoryPool(memoryPool); + options.setRowIndexStride(10); + return createWriter(type, stream, options); + } + + static std::unique_ptr createReader(MemoryPool* memoryPool, + MemoryOutputStream& memStream) { + std::unique_ptr inStream( + new MemoryInputStream(memStream.getData(), memStream.getLength())); + ReaderOptions options; + options.setMemoryPool(*memoryPool); + return createReader(std::move(inStream), options); + } + + TEST(Statistics, geometryStatsWithNull) { + std::unique_ptr const type(Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); + + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create 
str values + std::vector wkbs; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + structBatch->notNull[batchCount - 1] = '\0'; + strBatch->hasNulls = true; + strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = 
reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 3); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[0], 1); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[1], 1001); + EXPECT_EQ(geoFileStats->getGeospatialTypes()[2], 3001); + std::array expectValid = {true, true, true, true}; + std::array expectEmpty = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionEmpty(), expectEmpty); + EXPECT_EQ(geoFileStats->getBoundingBox().lowerBound(), mins); + EXPECT_EQ(geoFileStats->getBoundingBox().upperBound(), maxs); + } + + TEST(Statistics, geographyStatsWithNull) { + std::unique_ptr const type( + Type::buildTypeFromString("struct")); + + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* const pool = getDefaultPool(); + uint64_t const stripeSize = 32; // small stripe size to garantee multi stripes + std::unique_ptr writer = createWriter(stripeSize, *type, pool, &memStream); + + uint64_t const batchCount = 1000; + uint64_t const batches = 10; + std::unique_ptr const batch = writer->createRowBatch(batchCount); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(structBatch); + + StringVectorBatch* strBatch = dynamic_cast(structBatch->fields[0]); + ENSURE_DYNAMIC_CAST_NOT_NULL(strBatch); + + // create str values + std::vector wkbs; + std::array mins = {geospatial::INF, geospatial::INF, geospatial::INF, + geospatial::INF}; + std::array maxs = {-geospatial::INF, -geospatial::INF, -geospatial::INF, + -geospatial::INF}; + for (uint64_t i = 1; i < batchCount - 1; ++i) { + if (i % 3 == 0) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0}, false, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i 
* 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + } else if (i % 3 == 1) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0}, true, false)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + } else if (i % 3 == 2) { + wkbs.push_back(MakeWKBPoint({i * 1.0, i * 1.0, i * 1.0, i * 1.0}, true, true)); + mins[0] = std::min(mins[0], i * 1.0); + maxs[0] = std::max(maxs[0], i * 1.0); + mins[1] = std::min(mins[1], i * 1.0); + maxs[1] = std::max(maxs[1], i * 1.0); + mins[2] = std::min(mins[2], i * 1.0); + maxs[2] = std::max(maxs[2], i * 1.0); + mins[3] = std::min(mins[3], i * 1.0); + maxs[3] = std::max(maxs[3], i * 1.0); + } + } + for (uint64_t i = 1; i < batchCount - 1; ++i) { + strBatch->data[i] = const_cast(wkbs[i - 1].c_str()); + strBatch->length[i] = static_cast(wkbs[i - 1].length()); + } + + structBatch->numElements = batchCount; + strBatch->numElements = batchCount; + + structBatch->hasNulls = true; + structBatch->notNull[0] = '\0'; + structBatch->notNull[batchCount - 1] = '\0'; + strBatch->hasNulls = true; + strBatch->notNull[0] = '\0'; + strBatch->notNull[batchCount - 1] = '\0'; + + for (uint64_t i = 0; i < batches; ++i) { + writer->add(*batch.get()); + } + writer->close(); + + std::unique_ptr reader = createReader(pool, memStream); + + // check column 1 (string) file stats + auto stats1 = reader->getColumnStatistics(1); + const GeospatialColumnStatistics* geoFileStats = + dynamic_cast(stats1.get()); + ENSURE_DYNAMIC_CAST_NOT_NULL(geoFileStats); + EXPECT_EQ(geoFileStats->getGeospatialTypes().size(), 0); + std::array expectValid = {false, false, false, false}; + EXPECT_EQ(geoFileStats->getBoundingBox().dimensionValid(), expectValid); + } +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.cc b/c++/test/TestUtil.cc new file mode 100644 index 
0000000000..a76880340c --- /dev/null +++ b/c++/test/TestUtil.cc @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TestUtil.hh" +#include +#include + +namespace orc { + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM) { + auto wkbGeomType = static_cast(geometryType); + + if (hasZ) { + wkbGeomType += 1000; + } + + if (hasM) { + wkbGeomType += 2000; + } + + return wkbGeomType; + } + + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM) { + // 1:endianness + 4:type + 8:x + 8:y + int numBytes = kWkbPointXYSize + (hasZ ? sizeof(double) : 0) + (hasM ? 
sizeof(double) : 0); + std::string wkb(numBytes, 0); + char* ptr = wkb.data(); + + ptr[0] = kWkbNativeEndianness; + uint32_t geom_type = GeometryTypeToWKB(geospatial::GeometryType::POINT, hasZ, hasM); + std::memcpy(&ptr[1], &geom_type, 4); + std::memcpy(&ptr[5], &xyzm[0], 8); + std::memcpy(&ptr[13], &xyzm[1], 8); + ptr += 21; + + if (hasZ) { + std::memcpy(ptr, &xyzm[2], 8); + ptr += 8; + } + + if (hasM) { + std::memcpy(ptr, &xyzm[3], 8); + ptr += 8; + } + + assert(static_cast(ptr - wkb.data()) == wkb.length()); + return wkb; + } + +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestUtil.hh b/c++/test/TestUtil.hh new file mode 100644 index 0000000000..104fbc0397 --- /dev/null +++ b/c++/test/TestUtil.hh @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include "orc/Geospatial.hh" + +#include +#include + +namespace orc { + + /// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian, + /// uint32_t geometry type, 2 * double coordinates) + static constexpr int kWkbPointXYSize = 21; + + static bool isLittleEndian() { + static union { + uint32_t i; + char c[4]; + } num = {0x01020304}; + return num.c[0] == 4; + } + + static uint8_t kWkbNativeEndianness = isLittleEndian() ? 0x01 : 0x00; + + uint32_t GeometryTypeToWKB(geospatial::GeometryType geometryType, bool hasZ, bool hasM); + std::string MakeWKBPoint(const std::vector& xyzm, bool hasZ, bool hasM); + +} // namespace orc \ No newline at end of file diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index 975462e30c..11ba0c9dea 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -16,18 +16,20 @@ * limitations under the License. */ -#include "orc/ColumnPrinter.hh" +#include #include "orc/OrcFile.hh" #include "MemoryInputStream.hh" #include "MemoryOutputStream.hh" #include "Reader.hh" +#include "TestUtil.hh" #include "wrap/gmock.h" #include "wrap/gtest-wrapper.h" #include #include +#include #include #ifdef __clang__ @@ -2400,6 +2402,139 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } + TEST_P(WriterTest, writeGeometryAndGeographyColumn) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* pool = getDefaultPool(); + std::unique_ptr type(Type::buildTypeFromString( + "struct")); + uint64_t stripeSize = 1024; // 1K + uint64_t compressionBlockSize = 1024; // 1k + uint64_t memoryBlockSize = 64; + std::unique_ptr writer = + createWriter(stripeSize, memoryBlockSize, compressionBlockSize, CompressionKind_ZLIB, *type, + pool, &memStream, fileVersion, enableAlignBlockBoundToRowGroup ? 
1024 : 0); + + EXPECT_EQ("struct", + type->toString()); + + uint64_t batchCount = 100, batchSize = 1000; + std::unique_ptr batch = writer->createRowBatch(batchSize); + StructVectorBatch* structBatch = dynamic_cast(batch.get()); + StringVectorBatch* geometryBatch = dynamic_cast(structBatch->fields[0]); + StringVectorBatch* geographyBatch = dynamic_cast(structBatch->fields[1]); + + std::unique_ptr buffer(new char[8000000]); + char* buf = buffer.get(); + + // write 100 * 1000 rows, every 100 rows are in one row group + // every 2 consecutive rows has one null value. + uint64_t rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + structBatch->hasNulls = false; + structBatch->numElements = batchSize; + + geometryBatch->hasNulls = true; + geometryBatch->numElements = batchSize; + geographyBatch->hasNulls = true; + geographyBatch->numElements = batchSize; + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + geometryBatch->notNull[j] = 0; + geographyBatch->notNull[j] = 0; + } else { + geometryBatch->notNull[j] = 1; + geographyBatch->notNull[j] = 1; + + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + strncpy(buf, wkb.c_str(), wkb.size()); + + geometryBatch->data[j] = buf; + geometryBatch->length[j] = static_cast(wkb.size()); + geographyBatch->data[j] = buf; + geographyBatch->length[j] = static_cast(wkb.size()); + + buf += wkb.size(); + } + ++rowCount; + } + + writer->add(*batch); + } + writer->close(); + + std::unique_ptr inStream( + new MemoryInputStream(memStream.getData(), memStream.getLength())); + std::unique_ptr reader = createReader(pool, std::move(inStream)); + EXPECT_EQ(batchCount * batchSize, reader->getNumberOfRows()); + EXPECT_TRUE(reader->getNumberOfStripes() > 1); + + EXPECT_EQ("struct", + reader->getType().toString()); + // test sequential reader + std::unique_ptr seqReader = createRowReader(reader.get()); + rowCount = 0; + for (uint64_t i = 0; i != batchCount; ++i) { + seqReader->next(*batch); + + 
EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(batchSize, structBatch->numElements); + + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(batchSize, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(batchSize, geographyBatch->numElements); + + for (uint64_t j = 0; j != batchSize; ++j) { + if (rowCount % 2 == 0) { + EXPECT_TRUE(geometryBatch->notNull[j] == 0); + EXPECT_TRUE(geographyBatch->notNull[j] == 0); + } else { + EXPECT_TRUE(geometryBatch->notNull[j] != 0); + EXPECT_TRUE(geographyBatch->notNull[j] != 0); + std::string wkb = MakeWKBPoint({j * 1.0, j * 1.0}, false, false); + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[j]); + EXPECT_TRUE(strncmp(geometryBatch->data[j], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[j]); + EXPECT_TRUE(strncmp(geographyBatch->data[j], wkb.c_str(), wkb.size()) == 0); + } + ++rowCount; + } + } + EXPECT_FALSE(seqReader->next(*batch)); + + // test seek reader + std::unique_ptr seekReader = createRowReader(reader.get()); + batch = seekReader->createRowBatch(2); + structBatch = dynamic_cast(batch.get()); + geometryBatch = dynamic_cast(structBatch->fields[0]); + geographyBatch = dynamic_cast(structBatch->fields[1]); + + for (uint64_t row = rowCount - 2; row >= 100; row -= 100) { + seekReader->seekToRow(row); + seekReader->next(*batch); + + EXPECT_FALSE(structBatch->hasNulls); + EXPECT_EQ(2, structBatch->numElements); + EXPECT_TRUE(geometryBatch->hasNulls); + EXPECT_EQ(2, geometryBatch->numElements); + EXPECT_TRUE(geographyBatch->hasNulls); + EXPECT_EQ(2, geographyBatch->numElements); + + EXPECT_TRUE(geometryBatch->notNull[0] == 0); + EXPECT_TRUE(geometryBatch->notNull[1] != 0); + EXPECT_TRUE(geographyBatch->notNull[0] == 0); + EXPECT_TRUE(geographyBatch->notNull[1] != 0); + + std::string wkb = MakeWKBPoint({(row + 1) * 1.0, (row + 1) * 1.0}, false, false); + + EXPECT_EQ(static_cast(wkb.size()), geometryBatch->length[1]); + 
EXPECT_TRUE(strncmp(geometryBatch->data[1], wkb.c_str(), wkb.size()) == 0); + EXPECT_EQ(static_cast(wkb.size()), geographyBatch->length[1]); + EXPECT_TRUE(strncmp(geographyBatch->data[1], wkb.c_str(), wkb.size()) == 0); + } + } + std::vector testParams = {{FileVersion::v_0_11(), true}, {FileVersion::v_0_11(), false}, {FileVersion::v_0_12(), false}, diff --git a/c++/test/meson.build b/c++/test/meson.build index ba84bf7fa5..a8d30a6b94 100644 --- a/c++/test/meson.build +++ b/c++/test/meson.build @@ -50,10 +50,12 @@ test_sources = [ 'TestSargsApplier.cc', 'TestSearchArgument.cc', 'TestSchemaEvolution.cc', + 'TestStatistics.cc', 'TestStripeIndexStatistics.cc', 'TestTimestampStatistics.cc', 'TestTimezone.cc', 'TestType.cc', + 'TestUtil.cc', 'TestWriter.cc', 'TestCache.cc', ] diff --git a/tools/src/CSVFileImport.cc b/tools/src/CSVFileImport.cc index ae17b3348a..31a6f52a2d 100644 --- a/tools/src/CSVFileImport.cc +++ b/tools/src/CSVFileImport.cc @@ -420,6 +420,8 @@ int main(int argc, char* argv[]) { case orc::LIST: case orc::MAP: case orc::UNION: + case orc::GEOMETRY: + case orc::GEOGRAPHY: throw std::runtime_error(subType->toString() + " is not supported yet."); } }