From a5cd719c10d52658bc4fdd5bdebaea2a6d6f02a8 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 3 Sep 2019 16:38:03 -0400 Subject: [PATCH 1/4] ipc json should use strings for 64 bit ints --- cpp/src/arrow/ipc/json_internal.cc | 81 ++++++++++++++++++------------ cpp/src/arrow/type_traits.h | 19 +++++++ 2 files changed, 67 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 1f2c44e7224..4a019982c4a 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -18,6 +18,7 @@ #include "arrow/ipc/json_internal.h" #include +#include #include #include #include @@ -39,6 +40,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/parsing.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" @@ -398,36 +400,45 @@ class ArrayWriter { } template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { - static const char null_string[] = "0"; + enable_if_integer_repr WriteDataValues(const T& arr) { + using c_type = typename T::TypeClass::c_type; const auto data = arr.raw_values(); - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsValid(i)) { - writer_->Int64(data[i]); - } else { - writer_->RawNumber(null_string, sizeof(null_string)); + + if (sizeof(c_type) < sizeof(int64_t)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsValid(i)) { + if (std::is_signed::value) { + writer_->Int(static_cast(data[i])); + } else { + writer_->Uint(static_cast(data[i])); + } + } else { + writer_->RawNumber("0", sizeof("0")); + } } + return; } - } - template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { - static const char null_string[] = "0"; - const auto data = arr.raw_values(); + // write strings instead of numbers since numbers can't represent all 64 bit integers for (int64_t i = 0; i < arr.length(); ++i) { + constexpr size_t kBufferSize = 128; + char str[kBufferSize] = {0}; + int length = 0; if (arr.IsValid(i)) { - writer_->Uint64(data[i]); + if (std::is_signed::value) { + length = snprintf(str, kBufferSize, "%li", static_cast(data[i])); + } else { + length = snprintf(str, kBufferSize, "%lu", static_cast(data[i])); + } } else { - writer_->RawNumber(null_string, sizeof(null_string)); + str[0] = '0'; } + writer_->String(str, length); } } template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { + enable_if_floating_point WriteDataValues(const T& arr) { static const char null_string[] = "0."; const auto data = arr.raw_values(); for (int64_t i = 0; i < arr.length(); ++i) { @@ -1054,30 +1065,34 @@ static Status GetField(const rj::Value& obj, DictionaryMemo* dictionary_memo, return Status::OK(); } -template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { - DCHECK(val.IsInt64()); - return static_cast(val.GetInt64()); -} +template +enable_if_integer_repr UnboxValue( + const rj::Value& val, + typename std::enable_if::value>::type* = nullptr) { + if (sizeof(CType) < sizeof(int64_t)) { + if (IsSignedInt::value) { + DCHECK(val.IsInt()); + return static_cast(val.GetInt()); + } + DCHECK(val.IsUint()); + return static_cast(val.GetUint()); + } -template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { - DCHECK(val.IsUint()); - return static_cast(val.GetUint64()); + ::arrow::internal::StringConverter::ArrowType> converter; + CType out; + auto success = converter(val.GetString(), val.GetStringLength(), &out); + DCHECK(success); + return out; } template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { +enable_if_floating_point UnboxValue(const rj::Value& val) { DCHECK(val.IsFloat()); return static_cast(val.GetDouble()); } template -inline typename std::enable_if::value, bool>::type -UnboxValue(const rj::Value& val) { +enable_if_boolean UnboxValue(const rj::Value& val) { DCHECK(val.IsBool()); return val.GetBool(); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 0e4439db864..8d4ffb0e4f0 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -414,6 +414,11 @@ struct CTypeTraits> { // Useful type predicates // +template +struct as_void { + using type = void; +}; + template using is_number_type = std::is_base_of; @@ -441,6 +446,16 @@ struct is_8bit_int { (std::is_same::value || std::is_same::value); }; +template +struct is_integer_repr_type : std::false_type {}; + +template <> +struct is_integer_repr_type : std::false_type {}; + +template +struct is_integer_repr_type::type> + : std::is_integral {}; + template struct is_any_string_type { static constexpr bool value = @@ -457,6 +472,10 @@ using enable_if_primitive_ctype = template using enable_if_integer = typename std::enable_if::value, R>::type; +template +using enable_if_integer_repr = + typename std::enable_if::value, R>::type; + template using is_signed_integer = std::integral_constant::value && From 121dee160a572ebfd751611e51c53cc6701e3245 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 5 Sep 2019 13:46:03 -0400 Subject: [PATCH 2/4] use std::to_string --- cpp/src/arrow/ipc/json_internal.cc | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 4a019982c4a..3f85d37cee0 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -18,7 +18,6 @@ #include "arrow/ipc/json_internal.h" #include -#include #include #include #include @@ -421,19 +420,8 @@ class ArrayWriter { // write strings instead of numbers since numbers can't represent all 64 bit integers for (int64_t i = 0; i < arr.length(); ++i) { - constexpr size_t kBufferSize = 128; - char str[kBufferSize] = {0}; - int length = 0; - if (arr.IsValid(i)) { - if (std::is_signed::value) { - length = snprintf(str, kBufferSize, "%li", static_cast(data[i])); - } else { - length = snprintf(str, kBufferSize, "%lu", static_cast(data[i])); - } - } else { - str[0] = '0'; - } - writer_->String(str, length); + auto str = std::to_string(data[i]); + writer_->String(str.c_str(), str.size()); } } From 8318e9e58b1feb3c71bf5fe87bae22cef2149827 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 9 Sep 2019 12:42:33 -0400 Subject: [PATCH 3/4] rewrite integration/integration_test.py to generate int64 cols as strings --- cpp/src/arrow/ipc/json_internal.cc | 2 +- integration/integration_test.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 3f85d37cee0..cdaf0bbd17e 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -421,7 +421,7 @@ class ArrayWriter { // write strings instead of numbers since numbers can't represent all 64 bit integers for (int64_t i = 0; i < arr.length(); ++i) { auto str = std::to_string(data[i]); - writer_->String(str.c_str(), str.size()); + writer_->String(str.c_str(), static_cast(str.size())); } } diff --git a/integration/integration_test.py b/integration/integration_test.py index be20b4ee233..bf6ac3f8113 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -184,6 +184,15 @@ def _get_buffers(self): class IntegerType(PrimitiveType): + class Column(PrimitiveColumn): + + def __init__(self, name, count, is_valid, values, bit_width): + super(IntegerType.Column, self).__init__(name, count, is_valid, values) + self.bit_width = bit_width + + def _encode_value(self, x): + return x if self.bit_width < 64 else str(x) + def __init__(self, name, is_signed, bit_width, nullable=True, min_value=TEST_INT_MIN, @@ -225,7 +234,7 @@ def generate_range(self, size, lower, upper, name=None): if name is None: name = self.name - return PrimitiveColumn(name, size, is_valid, values) + return IntegerType.Column(name, size, is_valid, values, self.bit_width) class DateType(IntegerType): From 09b6a9492b83290f64d1e640804d27af65cd0f8a Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 10 Sep 2019 10:46:44 -0400 Subject: [PATCH 4/4] de-nest IntegerColumn --- integration/integration_test.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/integration/integration_test.py b/integration/integration_test.py index bf6ac3f8113..cb5510de50a 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -179,21 +179,22 @@ def _get_buffers(self): ] -TEST_INT_MAX = 2 ** 31 - 1 -TEST_INT_MIN = ~TEST_INT_MAX +class IntegerColumn(PrimitiveColumn): + def __init__(self, name, count, is_valid, values, bit_width): + super(IntegerColumn, self).__init__(name, count, is_valid, values) + self.bit_width = bit_width -class IntegerType(PrimitiveType): - class Column(PrimitiveColumn): + def _encode_value(self, x): + return x if self.bit_width < 64 else str(x) - def __init__(self, name, count, is_valid, values, bit_width): - super(IntegerType.Column, self).__init__(name, count, is_valid, values) - self.bit_width = bit_width - def _encode_value(self, x): - return x if self.bit_width < 64 else str(x) +TEST_INT_MAX = 2 ** 31 - 1 +TEST_INT_MIN = ~TEST_INT_MAX +class IntegerType(PrimitiveType): + def __init__(self, name, is_signed, bit_width, nullable=True, min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): @@ -234,7 +235,7 @@ def generate_range(self, size, lower, upper, name=None): if name is None: name = self.name - return IntegerType.Column(name, size, is_valid, values, self.bit_width) + return IntegerColumn(name, size, is_valid, values, self.bit_width) class DateType(IntegerType):