From 3a53c9b334f8a2bfa2141fa62336d5f556c9277f Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 10 Sep 2019 19:57:02 -0500 Subject: [PATCH] ARROW-6310: [C++] IPC json should use strings for 64 bit ints JS can't represent all 64 bit integers as numbers (which are doubles), so store them in the JSON IPC as strings instead. Closes #5267 from bkietz/6310-Write-64-bit-integers-as- and squashes the following commits: 09b6a9492 de-nest IntegerColumn 8318e9e58 rewrite integration/integration_test.py to generate int64 cols as strings 121dee160 use std::to_string a5cd719c1 ipc json should use strings for 64 bit ints Authored-by: Benjamin Kietzman Signed-off-by: Wes McKinney --- cpp/src/arrow/ipc/json_internal.cc | 75 ++++++++++++++++-------------- cpp/src/arrow/type_traits.h | 19 ++++++++ integration/integration_test.py | 12 ++++- 3 files changed, 69 insertions(+), 37 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 1f2c44e7224..cdaf0bbd17e 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -39,6 +39,7 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/util/parsing.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" @@ -398,36 +399,34 @@ class ArrayWriter { } template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { - static const char null_string[] = "0"; + enable_if_integer_repr WriteDataValues(const T& arr) { + using c_type = typename T::TypeClass::c_type; const auto data = arr.raw_values(); - for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsValid(i)) { - writer_->Int64(data[i]); - } else { - writer_->RawNumber(null_string, sizeof(null_string)); + + if (sizeof(c_type) < sizeof(int64_t)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsValid(i)) { + if (std::is_signed::value) { + writer_->Int(static_cast(data[i])); + } else { + writer_->Uint(static_cast(data[i])); + } + } else { + writer_->RawNumber("0", sizeof("0")); + } } + return; } - } - template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { - static const char null_string[] = "0"; - const auto data = arr.raw_values(); + // write strings instead of numbers since numbers can't represent all 64 bit integers for (int64_t i = 0; i < arr.length(); ++i) { - if (arr.IsValid(i)) { - writer_->Uint64(data[i]); - } else { - writer_->RawNumber(null_string, sizeof(null_string)); - } + auto str = std::to_string(data[i]); + writer_->String(str.c_str(), static_cast(str.size())); } } template - typename std::enable_if::value, void>::type WriteDataValues( - const T& arr) { + enable_if_floating_point WriteDataValues(const T& arr) { static const char null_string[] = "0."; const auto data = arr.raw_values(); for (int64_t i = 0; i < arr.length(); ++i) { @@ -1054,30 +1053,34 @@ static Status GetField(const rj::Value& obj, DictionaryMemo* dictionary_memo, return Status::OK(); } -template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { - DCHECK(val.IsInt64()); - return static_cast(val.GetInt64()); -} +template +enable_if_integer_repr UnboxValue( + const rj::Value& val, + typename std::enable_if::value>::type* = nullptr) { + if (sizeof(CType) < sizeof(int64_t)) { + if (IsSignedInt::value) { + DCHECK(val.IsInt()); + return static_cast(val.GetInt()); + } + DCHECK(val.IsUint()); + return static_cast(val.GetUint()); + } -template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { - DCHECK(val.IsUint()); - return static_cast(val.GetUint64()); + ::arrow::internal::StringConverter::ArrowType> converter; + CType out; + auto success = converter(val.GetString(), val.GetStringLength(), &out); + DCHECK(success); + return out; } template -inline typename std::enable_if::value, typename T::c_type>::type -UnboxValue(const rj::Value& val) { +enable_if_floating_point UnboxValue(const rj::Value& val) { DCHECK(val.IsFloat()); return static_cast(val.GetDouble()); } template -inline typename std::enable_if::value, bool>::type -UnboxValue(const rj::Value& val) { +enable_if_boolean UnboxValue(const rj::Value& val) { DCHECK(val.IsBool()); return val.GetBool(); } diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 0e4439db864..8d4ffb0e4f0 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -414,6 +414,11 @@ struct CTypeTraits> { // Useful type predicates // +template +struct as_void { + using type = void; +}; + template using is_number_type = std::is_base_of; @@ -441,6 +446,16 @@ struct is_8bit_int { (std::is_same::value || std::is_same::value); }; +template +struct is_integer_repr_type : std::false_type {}; + +template <> +struct is_integer_repr_type : std::false_type {}; + +template +struct is_integer_repr_type::type> + : std::is_integral {}; + template struct is_any_string_type { static constexpr bool value = @@ -457,6 +472,10 @@ using enable_if_primitive_ctype = template using enable_if_integer = typename std::enable_if::value, R>::type; +template +using enable_if_integer_repr = + typename std::enable_if::value, R>::type; + template using is_signed_integer = std::integral_constant::value && diff --git a/integration/integration_test.py b/integration/integration_test.py index be20b4ee233..cb5510de50a 100644 --- a/integration/integration_test.py +++ b/integration/integration_test.py @@ -179,6 +179,16 @@ def _get_buffers(self): ] +class IntegerColumn(PrimitiveColumn): + + def __init__(self, name, count, is_valid, values, bit_width): + super(IntegerColumn, self).__init__(name, count, is_valid, values) + self.bit_width = bit_width + + def _encode_value(self, x): + return x if self.bit_width < 64 else str(x) + + TEST_INT_MAX = 2 ** 31 - 1 TEST_INT_MIN = ~TEST_INT_MAX @@ -225,7 +235,7 @@ def generate_range(self, size, lower, upper, name=None): if name is None: name = self.name - return PrimitiveColumn(name, size, is_valid, values) + return IntegerColumn(name, size, is_valid, values, self.bit_width) class DateType(IntegerType):