From e14f60a4b9caa702d37dd191d159f72c92d41f52 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 27 May 2020 11:34:18 -0400 Subject: [PATCH 1/6] ARROW-8471: [C++][Integration] Represent 64 bit integers as strings --- cpp/src/arrow/ipc/json_integration_test.cc | 6 +-- cpp/src/arrow/ipc/json_internal.cc | 63 +++++++++++++--------- cpp/src/arrow/type_traits.h | 8 +++ cpp/src/arrow/util/formatting.h | 9 ++-- dev/archery/archery/integration/datagen.py | 11 ++-- 5 files changed, 62 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index 90864b7a260..e6d1924a835 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -255,7 +255,7 @@ static const char* JSON_EXAMPLE = R"example( "fields": [ { "name": "foo", - "type": {"name": "int", "isSigned": true, "bitWidth": 32}, + "type": {"name": "int", "isSigned": true, "bitWidth": 64}, "nullable": true, "children": [] }, { @@ -272,7 +272,7 @@ static const char* JSON_EXAMPLE = R"example( { "name": "foo", "count": 5, - "DATA": [1, 2, 3, 4, 5], + "DATA": ["1", "2", "3", "4", "5"], "VALIDITY": [1, 0, 1, 1, 1] }, { @@ -289,7 +289,7 @@ static const char* JSON_EXAMPLE = R"example( { "name": "foo", "count": 4, - "DATA": [1, 2, 3, 4], + "DATA": ["1", "2", "3", "4"], "VALIDITY": [1, 0, 1, 1] }, { diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index e04ead70b81..898977b02c9 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -440,29 +441,37 @@ class ArrayWriter { return Status::OK(); } - template ::value> - void WriteIntegerDataValues(const T& arr) { - static const char null_string[] = "0"; - const auto data = arr.raw_values(); + template + enable_if_t::value && + sizeof(CType) != sizeof(int64_t)> + WriteDataValues(const ArrayType& arr) { + static const std::string null_string = "0"; for (int64_t i = 0; i < arr.length(); ++i) { if (arr.IsValid(i)) { - IsSigned ? writer_->Int64(data[i]) : writer_->Uint64(data[i]); + writer_->Int64(arr.Value(i)); } else { - writer_->RawNumber(null_string, sizeof(null_string)); + writer_->RawNumber(null_string.data(), null_string.size()); } } } - template - enable_if_physical_signed_integer WriteDataValues( - const ArrayType& arr) { - WriteIntegerDataValues(arr); - } + template + enable_if_t::value && + sizeof(CType) == sizeof(int64_t)> + WriteDataValues(const ArrayType& arr) { + ::arrow::internal::StringFormatter::ArrowType> fmt; - template - enable_if_physical_unsigned_integer WriteDataValues( - const ArrayType& arr) { - WriteIntegerDataValues(arr); + static const std::string null_string = "0"; + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsValid(i)) { + fmt(arr.Value(i), + [&](util::string_view repr) { writer_->String(repr.data(), repr.size()); }); + } else { + writer_->String(null_string.data(), null_string.size()); + } + } } template @@ -1154,18 +1163,24 @@ enable_if_boolean UnboxValue(const rj::Value& val) { return val.GetBool(); } -template -enable_if_physical_signed_integer UnboxValue( - const rj::Value& val) { +template +enable_if_t::value && sizeof(CType) != sizeof(int64_t), CType> +UnboxValue(const rj::Value& val) { DCHECK(val.IsInt64()); - return static_cast(val.GetInt64()); + return static_cast(val.GetInt64()); } -template -enable_if_physical_unsigned_integer UnboxValue( - const rj::Value& val) { - DCHECK(val.IsUint()); - return static_cast(val.GetUint64()); +template +enable_if_t::value && sizeof(CType) == sizeof(int64_t), CType> +UnboxValue(const rj::Value& val) { + DCHECK(val.IsString()); + + CType out; + bool success = ::arrow::internal::ParseValue::ArrowType>( + val.GetString(), val.GetStringLength(), &out); + + DCHECK(success); + return out; } template diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index f61b690ac9d..937443b3012 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -646,6 +646,14 @@ template using enable_if_physical_unsigned_integer = enable_if_t::value, R>; +template +using is_physical_integer_type = + std::integral_constant::value || + is_physical_signed_integer_type::value>; + +template +using enable_if_physical_integer = enable_if_t::value, R>; + // Like is_floating_type but excluding half-floats which don't have a // float-like c type. template diff --git a/cpp/src/arrow/util/formatting.h b/cpp/src/arrow/util/formatting.h index 7deecc00230..bc1b10fafe6 100644 --- a/cpp/src/arrow/util/formatting.h +++ b/cpp/src/arrow/util/formatting.h @@ -41,6 +41,9 @@ namespace internal { template class StringFormatter; +template +using Return = decltype(std::declval()(util::string_view{})); + template <> class StringFormatter { public: @@ -49,7 +52,7 @@ class StringFormatter { using value_type = bool; template - Status operator()(bool value, Appender&& append) { + Return operator()(bool value, Appender&& append) { if (value) { const char string[] = "true"; return append(util::string_view(string)); @@ -95,7 +98,7 @@ class IntToStringFormatterMixin { (is_signed ? 2 : 1) + std::numeric_limits::digits10; template - Status operator()(value_type value, Appender&& append) { + Return operator()(value_type value, Appender&& append) { char buffer[buffer_size]; char* ptr = buffer + buffer_size; int32_t size = 0; @@ -206,7 +209,7 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter { explicit FloatToStringFormatterMixin(const std::shared_ptr& = NULLPTR) {} template - Status operator()(value_type value, Appender&& append) { + Return operator()(value_type value, Appender&& append) { char buffer[buffer_size]; int size = FormatFloat(value, buffer, buffer_size); return append(util::string_view(buffer, size)); diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 0b699e8ac52..12f3544d99f 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -179,8 +179,8 @@ def generate_column(self, size, name=None): return self.generate_range(size, lower_bound, upper_bound, name=name) def generate_range(self, size, lower, upper, name=None): - values = [int(x) for x in - np.random.randint(lower, upper, size=size)] + values = list(map(str if self.bit_width == 64 else int, + np.random.randint(lower, upper, size=size))) is_valid = self._make_is_valid(size) @@ -1540,9 +1540,10 @@ def _temp_path(): .skip_category('Java') # TODO(ARROW-7779) .skip_category('JS'), - generate_extension_case().skip_category('Go') - .skip_category('JS') - .skip_category('Rust'), + generate_extension_case() + .skip_category('Go') + .skip_category('JS') + .skip_category('Rust'), ] if flight: From 47ce0f8fb9eb8759e9f30345867a67eb81438633 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Wed, 27 May 2020 15:36:46 -0400 Subject: [PATCH 2/6] refactor Go to emit and expect 64 bit ints as strings --- dev/archery/archery/integration/tester_go.py | 3 ++- go/arrow/internal/arrjson/arrjson.go | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/dev/archery/archery/integration/tester_go.py b/dev/archery/archery/integration/tester_go.py index 5dda81dfc2e..ea799c5a1bd 100644 --- a/dev/archery/archery/integration/tester_go.py +++ b/dev/archery/archery/integration/tester_go.py @@ -26,7 +26,8 @@ class GoTester(Tester): CONSUMER = True # FIXME(sbinet): revisit for Go modules - GOPATH = os.getenv('GOPATH', '~/go') + HOME = os.getenv('HOME', '~') + GOPATH = os.getenv('GOPATH', os.path.join(HOME, 'go')) GOBIN = os.environ.get('GOBIN', os.path.join(GOPATH, 'bin')) GO_INTEGRATION_EXE = os.path.join(GOBIN, 'arrow-json-integration-test') diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 3b4e4ab1d0e..4e6fea12e9b 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -963,11 +963,11 @@ func i32ToJSON(arr *array.Int32) []interface{} { func i64FromJSON(vs []interface{}) []int64 { o := make([]int64, len(vs)) for i, v := range vs { - vv, err := v.(json.Number).Int64() + vv, err := strconv.ParseInt(v.(string), 10, 64) if err != nil { panic(err) } - o[i] = int64(vv) + o[i] = vv } return o } @@ -975,7 +975,11 @@ func i64FromJSON(vs []interface{}) []int64 { func i64ToJSON(arr *array.Int64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = arr.Value(i) + if arr.IsValid(i) { + o[i] = strconv.FormatInt(arr.Value(i), 10) + } else { + o[i] = "0" + } } return o } @@ -1043,11 +1047,11 @@ func u32ToJSON(arr *array.Uint32) []interface{} { func u64FromJSON(vs []interface{}) []uint64 { o := make([]uint64, len(vs)) for i, v := range vs { - vv, err := strconv.ParseUint(v.(json.Number).String(), 10, 64) + vv, err := strconv.ParseUint(v.(string), 10, 64) if err != nil { panic(err) } - o[i] = uint64(vv) + o[i] = vv } return o } @@ -1055,7 +1059,11 @@ func u64FromJSON(vs []interface{}) []uint64 { func u64ToJSON(arr *array.Uint64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = arr.Value(i) + if arr.IsValid(i) { + o[i] = strconv.FormatUint(arr.Value(i), 10) + } else { + o[i] = "0" + } } return o } From 2e97dffee475a6ced1703dcea6da610d24933119 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 May 2020 15:16:53 -0400 Subject: [PATCH 3/6] also adjust duration and date in Go --- dev/archery/archery/integration/datagen.py | 2 +- go/arrow/internal/arrjson/arrjson.go | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 12f3544d99f..1a2f41018af 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -179,7 +179,7 @@ def generate_column(self, size, name=None): return self.generate_range(size, lower_bound, upper_bound, name=name) def generate_range(self, size, lower, upper, name=None): - values = list(map(str if self.bit_width == 64 else int, + values = list(map(int if self.bit_width < 64 else str, np.random.randint(lower, upper, size=size))) is_valid = self._make_is_valid(size) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 4e6fea12e9b..f67179deaf8 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -1201,7 +1201,7 @@ func date32ToJSON(arr *array.Date32) []interface{} { func date64FromJSON(vs []interface{}) []arrow.Date64 { o := make([]arrow.Date64, len(vs)) for i, v := range vs { - vv, err := v.(json.Number).Int64() + vv, err := strconv.ParseInt(v.(string), 10, 64) if err != nil { panic(err) } @@ -1213,7 +1213,7 @@ func date64FromJSON(vs []interface{}) []arrow.Date64 { func date64ToJSON(arr *array.Date64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = int64(arr.Value(i)) + o[i] = string(arr.Value(i)) } return o } @@ -1241,7 +1241,7 @@ func time32ToJSON(arr *array.Time32) []interface{} { func time64FromJSON(vs []interface{}) []arrow.Time64 { o := make([]arrow.Time64, len(vs)) for i, v := range vs { - vv, err := v.(json.Number).Int64() + vv, err := strconv.ParseInt(v.(string), 10, 64) if err != nil { panic(err) } @@ -1253,7 +1253,7 @@ func time64FromJSON(vs []interface{}) []arrow.Time64 { func time64ToJSON(arr *array.Time64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = int64(arr.Value(i)) + o[i] = string(arr.Value(i)) } return o } @@ -1261,7 +1261,7 @@ func time64ToJSON(arr *array.Time64) []interface{} { func timestampFromJSON(vs []interface{}) []arrow.Timestamp { o := make([]arrow.Timestamp, len(vs)) for i, v := range vs { - vv, err := v.(json.Number).Int64() + vv, err := strconv.ParseInt(v.(string), 10, 64) if err != nil { panic(err) } @@ -1273,7 +1273,7 @@ func timestampFromJSON(vs []interface{}) []arrow.Timestamp { func timestampToJSON(arr *array.Timestamp) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = int64(arr.Value(i)) + o[i] = string(arr.Value(i)) } return o } @@ -1326,7 +1326,7 @@ func daytimeintervalToJSON(arr *array.DayTimeInterval) []interface{} { func durationFromJSON(vs []interface{}) []arrow.Duration { o := make([]arrow.Duration, len(vs)) for i, v := range vs { - vv, err := v.(json.Number).Int64() + vv, err := strconv.ParseInt(v.(string), 10, 64) if err != nil { panic(err) } @@ -1338,7 +1338,7 @@ func durationFromJSON(vs []interface{}) []arrow.Duration { func durationToJSON(arr *array.Duration) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = arr.Value(i) + o[i] = string(arr.Value(i)) } return o } From dbdd2930fbe8535a1e1d6aeabf46883c2628cf67 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 28 May 2020 17:01:33 -0400 Subject: [PATCH 4/6] emit '0' in null slots --- go/arrow/internal/arrjson/arrjson.go | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index f67179deaf8..62db2fd0437 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -1213,7 +1213,11 @@ func date64FromJSON(vs []interface{}) []arrow.Date64 { func date64ToJSON(arr *array.Date64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = string(arr.Value(i)) + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1253,7 +1257,11 @@ func time64FromJSON(vs []interface{}) []arrow.Time64 { func time64ToJSON(arr *array.Time64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = string(arr.Value(i)) + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1273,7 +1281,11 @@ func timestampFromJSON(vs []interface{}) []arrow.Timestamp { func timestampToJSON(arr *array.Timestamp) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = string(arr.Value(i)) + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1338,7 +1350,11 @@ func durationFromJSON(vs []interface{}) []arrow.Duration { func durationToJSON(arr *array.Duration) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - o[i] = string(arr.Value(i)) + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } From bf3722721cc527361248771cf0e9be71f23c340e Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 2 Jun 2020 13:49:28 -0400 Subject: [PATCH 5/6] add extremes to integer ranges --- cpp/src/arrow/ipc/json_integration_test.cc | 2 +- dev/archery/archery/integration/datagen.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index e6d1924a835..9352fdefc17 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -289,7 +289,7 @@ static const char* JSON_EXAMPLE = R"example( { "name": "foo", "count": 4, - "DATA": ["1", "2", "3", "4"], + "DATA": ["-1", "0", "9223372036854775807", "-9223372036854775808"], "VALIDITY": [1, 0, 1, 1] }, { diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 1a2f41018af..51543683c43 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -176,11 +176,15 @@ def _get_type(self): def generate_column(self, size, name=None): lower_bound, upper_bound = self._get_generated_data_bounds() - return self.generate_range(size, lower_bound, upper_bound, name=name) - - def generate_range(self, size, lower, upper, name=None): - values = list(map(int if self.bit_width < 64 else str, - np.random.randint(lower, upper, size=size))) + return self.generate_range(size, lower_bound, upper_bound, + name=name, include_extremes=True) + + def generate_range(self, size, lower, upper, name=None, + include_extremes=False): + values = np.random.randint(lower, upper, size=size) + if include_extremes and size >= 2: + values[:2] = [lower, upper] + values = list(map(int if self.bit_width < 64 else str, values)) is_valid = self._make_is_valid(size) From 1ec251730bfca9e0a5d0f6d04e13766823a2a967 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Tue, 2 Jun 2020 13:51:20 -0400 Subject: [PATCH 6/6] repair go indentation --- go/arrow/internal/arrjson/arrjson.go | 60 ++++++++++++++-------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 62db2fd0437..a9f44859a76 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -975,11 +975,11 @@ func i64FromJSON(vs []interface{}) []int64 { func i64ToJSON(arr *array.Int64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatInt(arr.Value(i), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatInt(arr.Value(i), 10) + } else { + o[i] = "0" + } } return o } @@ -1059,11 +1059,11 @@ func u64FromJSON(vs []interface{}) []uint64 { func u64ToJSON(arr *array.Uint64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatUint(arr.Value(i), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatUint(arr.Value(i), 10) + } else { + o[i] = "0" + } } return o } @@ -1213,11 +1213,11 @@ func date64FromJSON(vs []interface{}) []arrow.Date64 { func date64ToJSON(arr *array.Date64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1257,11 +1257,11 @@ func time64FromJSON(vs []interface{}) []arrow.Time64 { func time64ToJSON(arr *array.Time64) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1281,11 +1281,11 @@ func timestampFromJSON(vs []interface{}) []arrow.Timestamp { func timestampToJSON(arr *array.Timestamp) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o } @@ -1350,11 +1350,11 @@ func durationFromJSON(vs []interface{}) []arrow.Duration { func durationToJSON(arr *array.Duration) []interface{} { o := make([]interface{}, arr.Len()) for i := range o { - if arr.IsValid(i) { - o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) - } else { - o[i] = "0" - } + if arr.IsValid(i) { + o[i] = strconv.FormatInt(int64(arr.Value(i)), 10) + } else { + o[i] = "0" + } } return o }