diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 1178c658c48..5df5e748f39 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -42,6 +42,7 @@ install(FILES rle-encoding.h sse-util.h stl.h + type_traits.h visibility.h DESTINATION include/arrow/util) diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 5a66d7e85ad..92bdcb5fc08 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -28,7 +28,6 @@ #include "arrow/buffer.h" #include "arrow/memory_pool.h" -#include "arrow/status.h" #include "arrow/test-util.h" #include "arrow/util/bit-stream-utils.h" #include "arrow/util/bit-util.h" @@ -334,4 +333,36 @@ TEST(BitStreamUtil, ZigZag) { TestZigZag(-std::numeric_limits::max()); } +TEST(BitUtil, RoundTripLittleEndianTest) { + uint64_t value = 0xFF; + +#if ARROW_LITTLE_ENDIAN + uint64_t expected = value; +#else + uint64_t expected = std::numeric_limits::max() << 56; +#endif + + uint64_t little_endian_result = BitUtil::ToLittleEndian(value); + ASSERT_EQ(expected, little_endian_result); + + uint64_t from_little_endian = BitUtil::FromLittleEndian(little_endian_result); + ASSERT_EQ(value, from_little_endian); +} + +TEST(BitUtil, RoundTripBigEndianTest) { + uint64_t value = 0xFF; + +#if ARROW_LITTLE_ENDIAN + uint64_t expected = std::numeric_limits::max() << 56; +#else + uint64_t expected = value; +#endif + + uint64_t big_endian_result = BitUtil::ToBigEndian(value); + ASSERT_EQ(expected, big_endian_result); + + uint64_t from_big_endian = BitUtil::FromBigEndian(big_endian_result); + ASSERT_EQ(value, from_big_endian); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 2509de21ffb..8043f90ccdf 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -56,6 +56,7 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/type_traits.h" #include 
"arrow/util/visibility.h" #ifdef ARROW_USE_SSE @@ -305,7 +306,7 @@ static inline uint32_t ByteSwap(uint32_t value) { return static_cast(ARROW_BYTE_SWAP32(value)); } static inline int16_t ByteSwap(int16_t value) { - constexpr int16_t m = static_cast(0xff); + constexpr auto m = static_cast(0xff); return static_cast(((value >> 8) & m) | ((value & m) << 8)); } static inline uint16_t ByteSwap(uint16_t value) { @@ -331,8 +332,8 @@ static inline void ByteSwap(void* dst, const void* src, int len) { break; } - uint8_t* d = reinterpret_cast(dst); - const uint8_t* s = reinterpret_cast(src); + auto d = reinterpret_cast(dst); + auto s = reinterpret_cast(src); for (int i = 0; i < len; ++i) { d[i] = s[len - i - 1]; } @@ -341,36 +342,65 @@ static inline void ByteSwap(void* dst, const void* src, int len) { /// Converts to big endian format (if not already in big endian) from the /// machine's native endian format. #if ARROW_LITTLE_ENDIAN -static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } -static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } -static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); } -static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); } -static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); } -static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); } +template > +static inline T ToBigEndian(T value) { + return ByteSwap(value); +} + +template > +static inline T ToLittleEndian(T value) { + return value; +} #else -static inline int64_t ToBigEndian(int64_t val) { return val; } -static inline uint64_t ToBigEndian(uint64_t val) { return val; } -static inline int32_t ToBigEndian(int32_t val) { return val; } -static inline uint32_t ToBigEndian(uint32_t val) { return val; } -static inline int16_t ToBigEndian(int16_t val) { return val; } -static inline uint16_t ToBigEndian(uint16_t val) { return val; } +template > +static inline T 
ToBigEndian(T value) { + return value; +} + +/// On big-endian hosts, producing little endian output requires a byte swap. +template <typename T, + typename = + EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>> +static inline T ToLittleEndian(T value) { + return ByteSwap(value); +} #endif /// Converts from big endian format to the machine's native endian format. #if ARROW_LITTLE_ENDIAN -static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } -static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } -static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); } -static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); } -static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); } -static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); } +template > +static inline T FromBigEndian(T value) { + return ByteSwap(value); +} + +template > +static inline T FromLittleEndian(T value) { + return value; +} #else -static inline int64_t FromBigEndian(int64_t val) { return val; } -static inline uint64_t FromBigEndian(uint64_t val) { return val; } -static inline int32_t FromBigEndian(int32_t val) { return val; } -static inline uint32_t FromBigEndian(uint32_t val) { return val; } -static inline int16_t FromBigEndian(int16_t val) { return val; } -static inline uint16_t FromBigEndian(uint16_t val) { return val; } +template > +static inline T FromBigEndian(T value) { + return value; +} + +template > +static inline T FromLittleEndian(T value) { + return ByteSwap(value); +} #endif // Logical right shift for signed integer types diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 49d5c024950..7196b252c5b 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -27,6 +27,7 @@ #pragma intrinsic(_BitScanReverse) #endif +#include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" 
#include "arrow/util/logging.h" @@ -41,11 +42,13 @@ Decimal128::Decimal128(const std::string& str) : Decimal128() { } Decimal128::Decimal128(const uint8_t* bytes) - : Decimal128(reinterpret_cast(bytes)[0], - reinterpret_cast(bytes)[1]) {} + : 
Decimal128(BitUtil::FromLittleEndian(reinterpret_cast(bytes)[1]), + BitUtil::FromLittleEndian(reinterpret_cast(bytes)[0])) { +} std::array Decimal128::ToBytes() const { - const uint64_t raw[] = {static_cast(high_bits_), low_bits_}; + const uint64_t raw[] = {BitUtil::ToLittleEndian(low_bits_), + BitUtil::ToLittleEndian(static_cast(high_bits_))}; const auto* raw_data = reinterpret_cast(raw); std::array out{{0}}; std::copy(raw_data, raw_data + out.size(), out.begin()); diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index 58496a874f1..ba252bee480 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -53,7 +53,8 @@ class ARROW_EXPORT Decimal128 { /// \brief Parse the number from a base 10 string representation. explicit Decimal128(const std::string& value); - /// \brief Create an Decimal128 from an array of bytes + /// \brief Create a Decimal128 from an array of bytes. Bytes are assumed to be in + /// little-endian byte order. explicit Decimal128(const uint8_t* bytes); /// \brief Negate the current value @@ -104,7 +105,7 @@ class ARROW_EXPORT Decimal128 { /// \brief Get the low bits of the two's complement representation of the number. uint64_t low_bits() const { return low_bits_; } - /// \brief Return the raw bytes of the value. + /// \brief Return the raw bytes of the value in little-endian byte order. std::array ToBytes() const; /// \brief Convert the Decimal128 value to a base 10 decimal string with the given diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h new file mode 100644 index 00000000000..c05309af826 --- /dev/null +++ b/cpp/src/arrow/util/type_traits.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_TYPE_TRAITS_H +#define ARROW_UTIL_TYPE_TRAITS_H + +#include + +namespace arrow { + +/// \brief Metafunction to allow checking if a type matches any of another set of types +template +struct IsOneOf : std::false_type {}; /// Base case: nothing has matched + +template +struct IsOneOf { + /// Recursive case: T == U or T matches any other types provided (not including U). + static constexpr bool value = std::is_same::value || IsOneOf::value; +}; + +/// \brief Shorthand for using IsOneOf + std::enable_if +template +using EnableIfIsOneOf = typename std::enable_if::value, T>::type; + +} // namespace arrow + +#endif // ARROW_UTIL_TYPE_TRAITS_H diff --git a/format/Layout.md b/format/Layout.md index 3c21dbc0dc2..ebf93821aab 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -41,9 +41,8 @@ concepts, here is a small glossary to help disambiguate. or a fully-specified nested type. When we say slot we mean a relative type value, not necessarily any physical storage region. * Logical type: A data type that is implemented using some relative (physical) - type. For example, a Decimal value stored in 16 bytes could be stored in a - primitive array with slot size 16 bytes. Similarly, strings can be stored as - `List<1-byte>`. + type. For example, Decimal values are stored as 16 bytes in a fixed byte + size array. Similarly, strings can be stored as `List<1-byte>`. 
* Parent and child arrays: names to express relationships between physical value arrays in a nested type structure. For example, a `List`-type parent array has a T-type array as its child (see more on lists below). diff --git a/format/Metadata.md b/format/Metadata.md index 80ca08ae13f..893b0a4742f 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -391,7 +391,8 @@ logical type, which have no children) and 3 buffers: ### Decimal -TBD +Decimals are represented as a 2's complement 128-bit (16 byte) signed integer +in little-endian byte order. ### Timestamp