Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ install(FILES
rle-encoding.h
sse-util.h
stl.h
type_traits.h
visibility.h
DESTINATION include/arrow/util)

Expand Down
33 changes: 32 additions & 1 deletion cpp/src/arrow/util/bit-util-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

#include "arrow/buffer.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/test-util.h"
#include "arrow/util/bit-stream-utils.h"
#include "arrow/util/bit-util.h"
Expand Down Expand Up @@ -334,4 +333,36 @@ TEST(BitStreamUtil, ZigZag) {
TestZigZag(-std::numeric_limits<int32_t>::max());
}

// Checks that ToLittleEndian produces the byte pattern expected for this build's
// endianness and that FromLittleEndian restores the starting value.
TEST(BitUtil, RoundTripLittleEndianTest) {
  const uint64_t native = 0xFF;

#if ARROW_LITTLE_ENDIAN
  // Native order is already little endian: the value should pass through untouched.
  const uint64_t wanted = native;
#else
  // Otherwise the single set byte is expected in the most significant position.
  const uint64_t wanted = native << 56;
#endif

  const uint64_t encoded = BitUtil::ToLittleEndian(native);
  ASSERT_EQ(wanted, encoded);

  ASSERT_EQ(native, BitUtil::FromLittleEndian(encoded));
}

// Checks that ToBigEndian produces the byte pattern expected for this build's
// endianness and that FromBigEndian restores the starting value.
TEST(BitUtil, RoundTripBigEndianTest) {
  const uint64_t native = 0xFF;

#if ARROW_LITTLE_ENDIAN
  // On a little-endian build the single set byte is expected in the most significant
  // position after conversion.
  const uint64_t wanted = native << 56;
#else
  // Native order is already big endian: the value should pass through untouched.
  const uint64_t wanted = native;
#endif

  const uint64_t encoded = BitUtil::ToBigEndian(native);
  ASSERT_EQ(wanted, encoded);

  ASSERT_EQ(native, BitUtil::FromBigEndian(encoded));
}

} // namespace arrow
76 changes: 49 additions & 27 deletions cpp/src/arrow/util/bit-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
#include <vector>

#include "arrow/util/macros.h"
#include "arrow/util/type_traits.h"
#include "arrow/util/visibility.h"

#ifdef ARROW_USE_SSE
Expand Down Expand Up @@ -305,7 +306,7 @@ static inline uint32_t ByteSwap(uint32_t value) {
return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
}
/// Swaps the two bytes of a signed 16-bit integer.
///
/// The swap is carried out on the unsigned representation so that a negative value is
/// never shifted; the result is converted back to int16_t at the end.
static inline int16_t ByteSwap(int16_t value) {
  const auto bits = static_cast<uint16_t>(value);
  // Integer promotion widens the shifts; the cast keeps only the low 16 bits.
  const auto swapped = static_cast<uint16_t>((bits >> 8) | (bits << 8));
  return static_cast<int16_t>(swapped);
}
static inline uint16_t ByteSwap(uint16_t value) {
Expand All @@ -331,8 +332,8 @@ static inline void ByteSwap(void* dst, const void* src, int len) {
break;
}

uint8_t* d = reinterpret_cast<uint8_t*>(dst);
const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
auto d = reinterpret_cast<uint8_t*>(dst);
auto s = reinterpret_cast<const uint8_t*>(src);
for (int i = 0; i < len; ++i) {
d[i] = s[len - i - 1];
}
Expand All @@ -341,36 +342,57 @@ static inline void ByteSwap(void* dst, const void* src, int len) {
/// Converts to big endian format (if not already in big endian) from the
/// machine's native endian format.
#if ARROW_LITTLE_ENDIAN
static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
/// Native-to-big-endian conversion for the fixed-width integer types listed below.
/// This definition sits in the ARROW_LITTLE_ENDIAN branch, so it swaps the bytes.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int ToBigEndian(Int value) {
  const Int swapped = ByteSwap(value);
  return swapped;
}

/// Native-to-little-endian conversion for the fixed-width integer types listed below.
/// This definition sits in the ARROW_LITTLE_ENDIAN branch, so the value is returned
/// unchanged.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int ToLittleEndian(Int value) { return value; }
#else
static inline int64_t ToBigEndian(int64_t val) { return val; }
static inline uint64_t ToBigEndian(uint64_t val) { return val; }
static inline int32_t ToBigEndian(int32_t val) { return val; }
static inline uint32_t ToBigEndian(uint32_t val) { return val; }
static inline int16_t ToBigEndian(int16_t val) { return val; }
static inline uint16_t ToBigEndian(uint16_t val) { return val; }
/// Native-to-big-endian conversion for the fixed-width integer types listed below.
/// This definition sits in the branch taken when ARROW_LITTLE_ENDIAN is false, so the
/// value is returned unchanged.
template <typename T,
          typename =
              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
static inline T ToBigEndian(T value) {
  return value;
}

/// Native-to-little-endian conversion for the fixed-width integer types listed below.
/// In the branch taken when ARROW_LITTLE_ENDIAN is false the bytes are swapped. This is
/// the counterpart of FromLittleEndian for the same configuration; without it,
/// BitUtil::ToLittleEndian has no definition on such builds.
template <typename T,
          typename =
              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
static inline T ToLittleEndian(T value) {
  return ByteSwap(value);
}
#endif

/// Converts from big endian format to the machine's native endian format.
#if ARROW_LITTLE_ENDIAN
static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
/// Big-endian-to-native conversion for the fixed-width integer types listed below.
/// This definition sits in the ARROW_LITTLE_ENDIAN branch, so it swaps the bytes.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int FromBigEndian(Int value) {
  const Int swapped = ByteSwap(value);
  return swapped;
}

/// Little-endian-to-native conversion for the fixed-width integer types listed below.
/// This definition sits in the ARROW_LITTLE_ENDIAN branch, so the value is returned
/// unchanged.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int FromLittleEndian(Int value) { return value; }
#else
static inline int64_t FromBigEndian(int64_t val) { return val; }
static inline uint64_t FromBigEndian(uint64_t val) { return val; }
static inline int32_t FromBigEndian(int32_t val) { return val; }
static inline uint32_t FromBigEndian(uint32_t val) { return val; }
static inline int16_t FromBigEndian(int16_t val) { return val; }
static inline uint16_t FromBigEndian(uint16_t val) { return val; }
/// Big-endian-to-native conversion for the fixed-width integer types listed below.
/// This definition sits in the branch taken when ARROW_LITTLE_ENDIAN is false, so the
/// value is returned unchanged.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int FromBigEndian(Int value) { return value; }

/// Little-endian-to-native conversion for the fixed-width integer types listed below.
/// This definition sits in the branch taken when ARROW_LITTLE_ENDIAN is false, so it
/// swaps the bytes.
template <typename Int, typename = EnableIfIsOneOf<Int, int64_t, uint64_t, int32_t,
                                                   uint32_t, int16_t, uint16_t>>
static inline Int FromLittleEndian(Int value) {
  const Int swapped = ByteSwap(value);
  return swapped;
}
#endif

// Logical right shift for signed integer types
Expand Down
9 changes: 6 additions & 3 deletions cpp/src/arrow/util/decimal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#pragma intrinsic(_BitScanReverse)
#endif

#include <cstring>

#include "arrow/util/bit-util.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"

Expand All @@ -41,11 +42,13 @@ Decimal128::Decimal128(const std::string& str) : Decimal128() {
}

Decimal128::Decimal128(const uint8_t* bytes)
: Decimal128(reinterpret_cast<const int64_t*>(bytes)[0],
reinterpret_cast<const uint64_t*>(bytes)[1]) {}
: Decimal128(BitUtil::FromLittleEndian(reinterpret_cast<const int64_t*>(bytes)[1]),
BitUtil::FromLittleEndian(reinterpret_cast<const uint64_t*>(bytes)[0])) {
}

std::array<uint8_t, 16> Decimal128::ToBytes() const {
const uint64_t raw[] = {static_cast<uint64_t>(high_bits_), low_bits_};
const uint64_t raw[] = {BitUtil::ToLittleEndian(low_bits_),
BitUtil::ToLittleEndian(static_cast<uint64_t>(high_bits_))};
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reinterpret_cast here -- can this change the bit pattern?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

static_cast from one builtin integer type to another of the same byte size won't change the underlying bits.

The compiler will also fail to type check this because reinterpret_cast from one integer to another isn't allowed by the standard: http://en.cppreference.com/w/cpp/language/reinterpret_cast.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify where this behavior is defined in the standard: http://en.cppreference.com/w/cpp/language/implicit_conversion#Integral_conversions

The specific language is this:

If the destination type is unsigned, the resulting value is the smallest unsigned value equal to the source value modulo 2 ** n
where n is the number of bits used to represent the destination type.
That is, depending on whether the destination type is wider or narrower, signed integers are sign-extended [footnote 1] or truncated and unsigned integers are zero-extended or truncated respectively.

Here's the footnote :)

This only applies if the arithmetic is two's complement which is only required for the exact-width integer types. Note, however, that at the moment all platforms with a C++ compiler use two's complement arithmetic

We're using exact-width integer types here, so all behavior here is well-defined and within the standard.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. there is always *reinterpret_cast<T*>(&val) but this is helpful to know

const auto* raw_data = reinterpret_cast<const uint8_t*>(raw);
std::array<uint8_t, 16> out{{0}};
std::copy(raw_data, raw_data + out.size(), out.begin());
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/util/decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ class ARROW_EXPORT Decimal128 {
/// \brief Parse the number from a base 10 string representation.
explicit Decimal128(const std::string& value);

/// \brief Create an Decimal128 from an array of bytes
/// \brief Create a Decimal128 from an array of bytes. Bytes are assumed to be in
/// little-endian byte order.
explicit Decimal128(const uint8_t* bytes);

/// \brief Negate the current value
Expand Down Expand Up @@ -104,7 +105,7 @@ class ARROW_EXPORT Decimal128 {
/// \brief Get the low bits of the two's complement representation of the number.
uint64_t low_bits() const { return low_bits_; }

/// \brief Return the raw bytes of the value.
/// \brief Return the raw bytes of the value in little-endian byte order.
std::array<uint8_t, 16> ToBytes() const;

/// \brief Convert the Decimal128 value to a base 10 decimal string with the given
Expand Down
41 changes: 41 additions & 0 deletions cpp/src/arrow/util/type_traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_UTIL_TYPE_TRAITS_H
#define ARROW_UTIL_TYPE_TRAITS_H

#include <type_traits>

namespace arrow {

/// \brief Metafunction to allow checking if a type matches any of another set of types
///
/// IsOneOf<T, Us...>::value is true when T is the same type as at least one of Us and
/// false otherwise (including when Us is empty). Both cases derive from
/// std::integral_constant, so the result also exposes ::type and ::value_type.
template <typename...>
struct IsOneOf : std::false_type {};  /// Base case: nothing has matched

/// Recursive case: T == U or T matches any other types provided (not including U).
template <typename T, typename U, typename... Args>
struct IsOneOf<T, U, Args...>
    : std::integral_constant<bool,
                             std::is_same<T, U>::value || IsOneOf<T, Args...>::value> {};

/// \brief Shorthand for using IsOneOf + std::enable_if
///
/// Names T when T is one of Args; otherwise the alias is ill-formed, which removes the
/// enclosing template from overload resolution.
template <typename T, typename... Args>
using EnableIfIsOneOf = typename std::enable_if<IsOneOf<T, Args...>::value, T>::type;

}  // namespace arrow

#endif // ARROW_UTIL_TYPE_TRAITS_H
5 changes: 2 additions & 3 deletions format/Layout.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@ concepts, here is a small glossary to help disambiguate.
or a fully-specified nested type. When we say slot we mean a relative type
value, not necessarily any physical storage region.
* Logical type: A data type that is implemented using some relative (physical)
type. For example, a Decimal value stored in 16 bytes could be stored in a
primitive array with slot size 16 bytes. Similarly, strings can be stored as
`List<1-byte>`.
type. For example, Decimal values are stored as 16 bytes in a fixed byte
size array. Similarly, strings can be stored as `List<1-byte>`.
* Parent and child arrays: names to express relationships between physical
value arrays in a nested type structure. For example, a `List<T>`-type parent
array has a T-type array as its child (see more on lists below).
Expand Down
3 changes: 2 additions & 1 deletion format/Metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ logical type, which have no children) and 3 buffers:

### Decimal

TBD
Decimals are represented as a 2's complement 128-bit (16 byte) signed integer
in little-endian byte order.

### Timestamp

Expand Down