From 5240b14d771d7dec472210fe91cad0d32c8911a7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 14 Apr 2020 18:26:00 +0200 Subject: [PATCH 1/2] ARROW-8450: [Integration][C++] Implement large offsets types Implement integration tests for LargeList, LargeBinary and LargeString types. Enable them for C++ (only). --- cpp/src/arrow/ipc/json_integration_test.cc | 22 +-- cpp/src/arrow/ipc/json_internal.cc | 14 +- cpp/src/arrow/ipc/json_test.cc | 16 +- dev/archery/archery/cli.py | 8 +- dev/archery/archery/integration/datagen.py | 177 ++++++++++++++------- dev/archery/archery/integration/util.py | 25 ++- 6 files changed, 153 insertions(+), 109 deletions(-) diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index f2eea294951..4185cb37fb1 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -250,24 +250,12 @@ static const char* JSON_EXAMPLE = R"example( { "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - } + "nullable": true, "children": [] }, { "name": "bar", "type": {"name": "floatingpoint", "precision": "DOUBLE"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 64} - ] - } + "nullable": true, "children": [] } ] }, @@ -318,12 +306,6 @@ static const char* JSON_EXAMPLE2 = R"example( "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - }, "metadata": [ {"key": "converted_from_time32", "value": "true"} ] diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 133681cbd7b..72dabefce9e 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -335,10 +335,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } - Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); } - Status Visit(const LargeBinaryType& type) { - return WriteVarBytes("large_binary", type); - } + Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } + Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { return WritePrimitive("fixedsizebinary", type); } @@ -358,7 +356,7 @@ class SchemaWriter { } Status Visit(const LargeListType& type) { - WriteName("large_list", type); + WriteName("largelist", type); return Status::OK(); } @@ -932,9 +930,9 @@ static Status GetType(const RjObject& json_type, *type = utf8(); } else if (type_name == "binary") { *type = binary(); - } else if (type_name == "large_utf8") { + } else if (type_name == "largeutf8") { *type = large_utf8(); - } else if (type_name == "large_binary") { + } else if (type_name == "largebinary") { *type = large_binary(); } else if (type_name == "fixedsizebinary") { return GetFixedSizeBinary(json_type, type); @@ -957,7 +955,7 @@ static Status GetType(const RjObject& json_type, return Status::Invalid("List must have 
exactly one child"); } *type = list(children[0]); - } else if (type_name == "large_list") { + } else if (type_name == "largelist") { if (children.size() != 1) { return Status::Invalid("Large list must have exactly one child"); } diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc index bfc2fab8dac..21a695ab711 100644 --- a/cpp/src/arrow/ipc/json_test.cc +++ b/cpp/src/arrow/ipc/json_test.cc @@ -337,24 +337,12 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) { { "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - } + "nullable": true, "children": [] }, { "name": "bar", "type": {"name": "floatingpoint", "precision": "DOUBLE"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 64} - ] - } + "nullable": true, "children": [] } ] }, diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 6c07e05a142..c8af1b261ec 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -346,7 +346,7 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, logger.debug(f"Running benchmark {rev_or_path}") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_base = BenchmarkRunner.from_rev_or_path( src, root, rev_or_path, conf) @@ -399,7 +399,7 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, logger.debug(f"Running benchmark {rev_or_path}") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_base = BenchmarkRunner.from_rev_or_path( src, root, rev_or_path, conf, @@ -497,7 +497,7 @@ def benchmark_diff(ctx, src, preserve, output, cmake_extras, f"{baseline} (baseline)") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_cont = BenchmarkRunner.from_rev_or_path( src, root, contender, conf, @@ -551,7 +551,7 @@ def _set_default(opt, default): @click.option('stop_on_error', '-x', '--stop-on-error', is_flag=True, default=False, help='Stop on first error') -@click.option('--gold_dirs', multiple=True, +@click.option('--gold-dirs', multiple=True, help="gold integration test file paths") @click.option('-k', '--match', help=("Substring for test names to include in run, " diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 0f81acc5f60..8f4e9c29ca5 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -24,12 +24,13 @@ import numpy as np -from .util import (frombytes, rands, tobytes, SKIP_ARROW, SKIP_FLIGHT) +from .util import (frombytes, tobytes, random_bytes, random_utf8, + SKIP_ARROW, SKIP_FLIGHT) class Field(object): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): self.name = name self.nullable = nullable self.metadata = metadata @@ -132,7 +133,7 @@ def generate_column(self, size, name=None): class IntegerField(PrimitiveField): - def __init__(self, name, is_signed, bit_width, nullable=True, + def __init__(self, name, is_signed, bit_width, *, nullable=True, metadata=[], min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): @@ -188,7 +189,7 
@@ class DateField(IntegerField): MILLISECOND: [-62135596800000, 253402214400000] } - def __init__(self, name, unit, nullable=True, metadata=[]): + def __init__(self, name, unit, *, nullable=True, metadata=[]): bit_width = 32 if unit == self.DAY else 64 min_value, max_value = self._ranges[unit] @@ -230,7 +231,7 @@ class TimeField(IntegerField): 'ns': [0, 86400000000000] } - def __init__(self, name, unit='s', nullable=True, + def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] super(TimeField, self).__init__(name, True, self.BIT_WIDTHS[unit], @@ -258,7 +259,7 @@ class TimestampField(IntegerField): 'ns': [np.iinfo('int64').min, np.iinfo('int64').max] } - def __init__(self, name, unit='s', tz=None, nullable=True, + def __init__(self, name, unit='s', tz=None, *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] super(TimestampField, self).__init__(name, True, 64, @@ -283,13 +284,13 @@ def _get_type(self): class DurationIntervalField(IntegerField): - def __init__(self, name, unit='s', nullable=True, + def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max, super(DurationIntervalField, self).__init__( - name, True, 64, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, True, 64, + nullable=nullable, metadata=metadata, + min_value=min_val, max_value=max_val) self.unit = unit def _get_type(self): @@ -302,12 +303,12 @@ def _get_type(self): class YearMonthIntervalField(IntegerField): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years. super(YearMonthIntervalField, self).__init__( - name, True, 32, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, True, 32, + nullable=nullable, metadata=metadata, + min_value=min_val, max_value=max_val) def _get_type(self): fields = [ @@ -319,7 +320,7 @@ def _get_type(self): class DayTimeIntervalField(PrimitiveField): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): super(DayTimeIntervalField, self).__init__(name, nullable=True, metadata=metadata) @@ -349,7 +350,7 @@ def generate_column(self, size, name=None): class FloatingPointField(PrimitiveField): - def __init__(self, name, bit_width, nullable=True, + def __init__(self, name, bit_width, *, nullable=True, metadata=[]): super(FloatingPointField, self).__init__(name, nullable=nullable, @@ -401,8 +402,8 @@ def decimal_range_from_precision(precision): class DecimalField(PrimitiveField): - def __init__(self, name, precision, scale, bit_width=128, nullable=True, - metadata=[]): + def __init__(self, name, precision, scale, bit_width=128, *, + nullable=True, metadata=[]): super(DecimalField, self).__init__(name, nullable=True, metadata=metadata) self.precision = precision @@ -471,17 +472,18 @@ def column_class(self): def _get_type(self): return OrderedDict([('name', 'binary')]) + def _random_sizes(self, size): + return np.random.exponential(scale=4, size=size).astype(np.int32) + def generate_column(self, size, name=None): - K = 7 is_valid = self._make_is_valid(size) values = [] - for i in range(size): + sizes = self._random_sizes(size) + + for i, nbytes in enumerate(sizes): if is_valid[i]: - draw = (np.random.randint(0, 255, size=K) - .astype(np.uint8) - .tostring()) - values.append(draw) + 
values.append(random_bytes(nbytes)) else: values.append(b"") @@ -490,9 +492,15 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeBinaryField(BinaryField): + + def _get_type(self): + return OrderedDict([('name', 'largebinary')]) + + class FixedSizeBinaryField(PrimitiveField): - def __init__(self, name, byte_width, nullable=True, + def __init__(self, name, byte_width, *, nullable=True, metadata=[]): super(FixedSizeBinaryField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -510,23 +518,12 @@ def _get_type(self): return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'DATA'), - ('typeBitWidth', self.byte_width)])])]) - def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) values = [] for i in range(size): - draw = (np.random.randint(0, 255, size=self.byte_width) - .astype(np.uint8) - .tostring()) - values.append(draw) + values.append(random_bytes(self.byte_width)) if name is None: name = self.name @@ -549,7 +546,7 @@ def generate_column(self, size, name=None): for i in range(size): if is_valid[i]: - values.append(tobytes(rands(K))) + values.append(tobytes(random_utf8(K))) else: values.append(b"") @@ -558,6 +555,12 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeStringField(StringField): + + def _get_type(self): + return OrderedDict([('name', 'largeutf8')]) + + class Schema(object): def __init__(self, fields, metadata=None): @@ -604,7 +607,7 @@ def _get_buffers(self): class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): - return ''.join('{:02x}'.format(c).upper() for c in x) + return frombytes(binascii.hexlify(x).upper()) def _get_buffers(self): data = [] @@ -625,7 +628,7 @@ def _encode_value(self, x): class ListField(Field): - def __init__(self, name, value_field, nullable=True, + def __init__(self, name, value_field, *, nullable=True, metadata=[]): super(ListField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -660,6 +663,14 @@ def generate_column(self, size, name=None): return ListColumn(name, size, is_valid, offsets, values) +class LargeListField(ListField): + + def _get_type(self): + return OrderedDict([ + ('name', 'largelist') + ]) + + class ListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): @@ -680,22 +691,22 @@ def _get_children(self): class MapField(Field): - def __init__(self, name, key_field, item_field, nullable=True, - metadata=[], keysSorted=False): + def __init__(self, name, key_field, item_field, *, nullable=True, + metadata=[], keys_sorted=False): super(MapField, self).__init__(name, nullable=nullable, metadata=metadata) assert not key_field.nullable self.key_field = key_field self.item_field = item_field - self.pair_field = StructField( - 'entries', [key_field, item_field], False) - self.keysSorted = keysSorted + self.pair_field = StructField('entries', [key_field, item_field], + nullable=False) + self.keys_sorted = keys_sorted def _get_type(self): return OrderedDict([ ('name', 'map'), - ('keysSorted', self.keysSorted) + ('keysSorted', self.keys_sorted) ]) def _get_children(self): @@ -742,7 +753,7 @@ def _get_children(self): class FixedSizeListField(Field): - def __init__(self, name, value_field, list_size, nullable=True, + def __init__(self, name, value_field, 
list_size, *, nullable=True, metadata=[]): super(FixedSizeListField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -785,7 +796,7 @@ def _get_children(self): class StructField(Field): - def __init__(self, name, fields, nullable=True, + def __init__(self, name, fields, *, nullable=True, metadata=[]): super(StructField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -829,7 +840,7 @@ def get_json(self): class DictionaryField(Field): - def __init__(self, name, index_field, dictionary, nullable=True, + def __init__(self, name, index_field, dictionary, *, nullable=True, metadata=[]): super(DictionaryField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -935,6 +946,10 @@ def get_field(name, type_, **kwargs): return BinaryField(name, **kwargs) elif type_ == 'utf8': return StringField(name, **kwargs) + elif type_ == 'largebinary': + return LargeBinaryField(name, **kwargs) + elif type_ == 'largeutf8': + return LargeStringField(name, **kwargs) elif type_.startswith('fixedsizebinary_'): byte_width = int(type_.split('_')[1]) return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs) @@ -1020,6 +1035,18 @@ def generate_primitive_case(batch_sizes, name='primitive'): return _generate_file(name, fields, batch_sizes) +def generate_primitive_large_offsets_case(batch_sizes): + types = ['largebinary', 'largeutf8'] + + fields = [] + + for type_ in types: + fields.append(get_field(type_ + "_nullable", type_, nullable=True)) + fields.append(get_field(type_ + "_nonnullable", type_, nullable=False)) + + return _generate_file('primitive_large_offsets', fields, batch_sizes) + + def generate_null_case(batch_sizes): # Interleave null with non-null types to ensure the appropriate number of # buffers (0) is read and written @@ -1106,15 +1133,42 @@ def generate_nested_case(): get_field('item', 'int32'), 4), StructField('struct_nullable', [get_field('f1', 'int32'), get_field('f2', 'utf8')]), - - # TODO(wesm): this causes segfault - # ListField('list_nonnullable', get_field('item', 'int32'), False), + # Fails on Go (ARROW-8452) + # ListField('list_nonnullable', get_field('item', 'int32'), + # nullable=False), ] batch_sizes = [7, 10] return _generate_file("nested", fields, batch_sizes) +def generate_recursive_nested_case(): + fields = [ + ListField('lists_list', + ListField('inner_list', get_field('item', 'int16'))), + ListField('structs_list', + StructField('inner_struct', + [get_field('f1', 'int32'), + get_field('f2', 'utf8')])), + ] + + batch_sizes = [7, 10] + return _generate_file("recursive_nested", fields, batch_sizes) + + +def generate_nested_large_offsets_case(): + fields = [ + LargeListField('large_list_nullable', get_field('item', 'int32')), + LargeListField('large_list_nonnullable', + get_field('item', 'int32'), nullable=False), + LargeListField('large_list_nested', + ListField('inner_list', get_field('item', 'int16'))), + ] + + batch_sizes = [7, 10] + return _generate_file("nested_large_offsets", fields, batch_sizes) + + def generate_dictionary_case(): dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0') dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') @@ -1140,9 +1194,9 @@ def generate_nested_dictionary_case(): dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1') struct_of_dict = StructField('struct', [ - DictionaryField('str_dict_a', get_field('', 'int8'), dict0), - DictionaryField('str_dict_b', get_field('', 'int8'), dict0) - ]) + DictionaryField('str_dict_a', get_field('', 'int8'), dict0), + 
DictionaryField('str_dict_b', get_field('', 'int8'), dict0) + ]) dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2') fields = [ @@ -1166,6 +1220,11 @@ def _temp_path(): generate_primitive_case([17, 20], name='primitive'), generate_primitive_case([0, 0, 0], name='primitive_zerolength'), + generate_primitive_large_offsets_case([17, 20]) + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-6110) + .skip_category('JS'), + generate_null_case([10, 0]) .skip_category('JS') # TODO(ARROW-7900) .skip_category('Go'), # TODO(ARROW-7901) @@ -1187,6 +1246,14 @@ def _temp_path(): generate_nested_case(), + # TODO(ARROW-8453) + generate_recursive_nested_case().skip_category('Go'), + + generate_nested_large_offsets_case() + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-6111) + .skip_category('JS'), + generate_custom_metadata_case().skip_category('Go') .skip_category('Java') .skip_category('JS'), diff --git a/dev/archery/archery/integration/util.py b/dev/archery/archery/integration/util.py index e3f2542b1e8..a4c4982ecb3 100644 --- a/dev/archery/archery/integration/util.py +++ b/dev/archery/archery/integration/util.py @@ -18,8 +18,8 @@ import contextlib import io import os +import random import socket -import string import subprocess import sys import threading @@ -32,9 +32,6 @@ def guid(): return uuid.uuid4().hex -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), - dtype=(np.str_, 1)) - # SKIP categories SKIP_ARROW = 'arrow' SKIP_FLIGHT = 'flight' @@ -100,14 +97,26 @@ def cork(self): log = printer.print -def rands(nchars): +_RAND_CHARS = np.array(list("abcdefghijklmnop123456Ârrôwµ£°€矢"), dtype="U") + + +def random_utf8(nchars): """ - Generate one random byte string. + Generate one random UTF8 string. + """ + return ''.join(np.random.choice(_RAND_CHARS, nchars)) - See `rands_array` if you want to create an array of random strings. +def random_bytes(nbytes): + """ + Generate one random binary string. """ - return ''.join(np.random.choice(RANDS_CHARS, nchars)) + # NOTE getrandbits(0) fails + if nbytes > 0: + return random.getrandbits(nbytes * 8).to_bytes(nbytes, + byteorder='little') + else: + return b"" def tobytes(o): From 01beb13e79383733f9b5b42e06c48e68728df492 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 15 Apr 2020 17:53:53 +0200 Subject: [PATCH 2/2] Write large offsets as strings --- cpp/src/arrow/ipc/json_internal.cc | 48 +++++-- dev/archery/archery/integration/datagen.py | 142 ++++++++++++++------- 2 files changed, 137 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 72dabefce9e..3dfae2a5666 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -38,8 +38,10 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/formatting.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/parsing.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" @@ -523,8 +525,21 @@ class ArrayWriter { void WriteIntegerField(const char* name, const T* values, int64_t length) { writer_->Key(name); writer_->StartArray(); - for (int i = 0; i < length; ++i) { - writer_->Int64(values[i]); + if (sizeof(T) < sizeof(int64_t)) { + for (int i = 0; i < length; ++i) { + writer_->Int64(values[i]); + } + } else { + // Represent 64-bit integers as strings, as JSON numbers cannot represent + // them exactly. 
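+      // (IEEE 754 doubles carry 53 mantissa bits, so integers above 2**53
+      // cannot survive a round-trip through a JSON number.)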
+      ::arrow::internal::StringFormatter<typename CTypeTraits<T>::ArrowType> formatter;
+      auto append = [this](util::string_view v) {
+        writer_->String(v.data(), static_cast<rj::SizeType>(v.size()));
+        return Status::OK();
+      };
+      for (int i = 0; i < length; ++i) {
+        DCHECK_OK(formatter(values[i], append));
+      }
     }
     writer_->EndArray();
   }
@@ -1297,13 +1312,28 @@ class ArrayReader {
     ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(length * sizeof(T), pool_));
 
     T* values = reinterpret_cast<T*>(buffer->mutable_data());
-    for (int i = 0; i < length; ++i) {
-      const rj::Value& val = json_array[i];
-      DCHECK(val.IsInt() || val.IsInt64());
-      if (val.IsInt()) {
-        values[i] = static_cast<T>(val.GetInt());
-      } else {
-        values[i] = static_cast<T>(val.GetInt64());
+    if (sizeof(T) < sizeof(int64_t)) {
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsInt() || val.IsInt64());
+        if (val.IsInt()) {
+          values[i] = static_cast<T>(val.GetInt());
+        } else {
+          values[i] = static_cast<T>(val.GetInt64());
+        }
+      }
+    } else {
+      // Read 64-bit integers as strings, as JSON numbers cannot represent
+      // them exactly.
+      ::arrow::internal::StringConverter<typename CTypeTraits<T>::ArrowType> converter;
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsString());
+        if (!converter(val.GetString(), val.GetStringLength(), &values[i])) {
+          return Status::Invalid("Failed to parse integer: '",
+                                 std::string(val.GetString(), val.GetStringLength()),
+                                 "'");
+        }
       }
     }
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index 8f4e9c29ca5..3c15abc2ab2 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -459,7 +459,13 @@ def generate_column(self, size, name=None):
         return PrimitiveColumn(name, size, is_valid, values)
 
 
-class BinaryField(PrimitiveField):
+class FixedSizeBinaryField(PrimitiveField):
+
+    def __init__(self, name, byte_width, *, nullable=True,
+                 metadata=[]):
+        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
+                                                   metadata=metadata)
+        self.byte_width = byte_width
 
     @property
     def numpy_type(self):
@@ -467,44 +473,25 @@ def numpy_type(self):
 
     @property
     def column_class(self):
-        return BinaryColumn
+        return FixedSizeBinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'binary')])
-
-    def _random_sizes(self, size):
-        return np.random.exponential(scale=4, size=size).astype(np.int32)
+        return OrderedDict([('name', 'fixedsizebinary'),
+                            ('byteWidth', self.byte_width)])
 
     def generate_column(self, size, name=None):
         is_valid = self._make_is_valid(size)
         values = []
 
-        sizes = self._random_sizes(size)
-
-        for i, nbytes in enumerate(sizes):
-            if is_valid[i]:
-                values.append(random_bytes(nbytes))
-            else:
-                values.append(b"")
+        for i in range(size):
+            values.append(random_bytes(self.byte_width))
 
         if name is None:
             name = self.name
         return self.column_class(name, size, is_valid, values)
 
 
-class LargeBinaryField(BinaryField):
-
-    def _get_type(self):
-        return OrderedDict([('name', 'largebinary')])
-
-
-class FixedSizeBinaryField(PrimitiveField):
-
-    def __init__(self, name, byte_width, *, nullable=True,
-                 metadata=[]):
-        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
-                                                   metadata=metadata)
-        self.byte_width = byte_width
+class BinaryField(PrimitiveField):
 
     @property
     def numpy_type(self):
@@ -512,18 +499,25 @@ def numpy_type(self):
 
     @property
     def column_class(self):
-        return FixedSizeBinaryColumn
+        return BinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'fixedsizebinary'),
-                            ('byteWidth', 
self.byte_width)]) + return OrderedDict([('name', 'binary')]) + + def _random_sizes(self, size): + return np.random.exponential(scale=4, size=size).astype(np.int32) def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) values = [] - for i in range(size): - values.append(random_bytes(self.byte_width)) + sizes = self._random_sizes(size) + + for i, nbytes in enumerate(sizes): + if is_valid[i]: + values.append(random_bytes(nbytes)) + else: + values.append(b"") if name is None: name = self.name @@ -555,8 +549,22 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeBinaryField(BinaryField): + + @property + def column_class(self): + return LargeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'largebinary')]) + + class LargeStringField(StringField): + @property + def column_class(self): + return LargeStringColumn + def _get_type(self): return OrderedDict([('name', 'largeutf8')]) @@ -578,7 +586,21 @@ def get_json(self): return OrderedDict(entries) -class BinaryColumn(PrimitiveColumn): +class _NarrowOffsetsMixin: + + def _encode_offsets(self, offsets): + return list(map(int, offsets)) + + +class _LargeOffsetsMixin: + + def _encode_offsets(self, offsets): + # 64-bit offsets have to be represented as strings to roundtrip + # through JSON. + return list(map(str, offsets)) + + +class _BaseBinaryColumn(PrimitiveColumn): def _encode_value(self, x): return frombytes(binascii.hexlify(x).upper()) @@ -599,11 +621,33 @@ def _get_buffers(self): return [ ('VALIDITY', [int(x) for x in self.is_valid]), - ('OFFSET', offsets), + ('OFFSET', self._encode_offsets(offsets)), ('DATA', data) ] +class _BaseStringColumn(_BaseBinaryColumn): + + def _encode_value(self, x): + return frombytes(x) + + +class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin): + pass + + +class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin): + pass + + +class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin): + pass + + +class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): + pass + + class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): @@ -620,12 +664,6 @@ def _get_buffers(self): ] -class StringColumn(BinaryColumn): - - def _encode_value(self, x): - return frombytes(x) - - class ListField(Field): def __init__(self, name, value_field, *, nullable=True, @@ -634,6 +672,10 @@ def __init__(self, name, value_field, *, nullable=True, metadata=metadata) self.value_field = value_field + @property + def column_class(self): + return ListColumn + def _get_type(self): return OrderedDict([ ('name', 'list') @@ -660,21 +702,25 @@ def generate_column(self, size, name=None): if name is None: name = self.name - return ListColumn(name, size, is_valid, offsets, values) + return self.column_class(name, size, is_valid, offsets, values) class LargeListField(ListField): + @property + def column_class(self): + return LargeListColumn + def _get_type(self): return OrderedDict([ ('name', 'largelist') ]) -class ListColumn(Column): +class _BaseListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): - super(ListColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.offsets = offsets self.values = values @@ -682,13 +728,21 @@ def __init__(self, name, count, is_valid, offsets, values): def _get_buffers(self): return [ ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', list(self.offsets)) + ('OFFSET', self._encode_offsets(self.offsets)) ] def 
_get_children(self): return [self.values.get_json()] +class ListColumn(_BaseListColumn, _NarrowOffsetsMixin): + pass + + +class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin): + pass + + class MapField(Field): def __init__(self, name, key_field, item_field, *, nullable=True, @@ -1165,7 +1219,7 @@ def generate_nested_large_offsets_case(): ListField('inner_list', get_field('item', 'int16'))), ] - batch_sizes = [7, 10] + batch_sizes = [0, 13] return _generate_file("nested_large_offsets", fields, batch_sizes)
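
Why the offsets become strings: most JSON implementations parse numbers as
IEEE 754 doubles, which are only exact up to 2**53. The following standalone
sketch (not part of the patch; the sample offsets are invented) shows the
hazard and mirrors what the two offset-encoding mixins emit:

    import json

    # A double's 53-bit mantissa cannot represent this integer exactly; a JSON
    # consumer that parses numbers as doubles would read back 2**53 instead.
    big = 2**53 + 1
    assert float(big) == float(2**53)

    def encode_narrow(offsets):
        # Mirrors _NarrowOffsetsMixin._encode_offsets: plain JSON numbers.
        return [int(x) for x in offsets]

    def encode_large(offsets):
        # Mirrors _LargeOffsetsMixin._encode_offsets: decimal strings, which
        # any JSON implementation preserves exactly.
        return [str(x) for x in offsets]

    offsets = [0, 3, big]
    print(json.dumps(encode_narrow(offsets)))  # [0, 3, 9007199254740993]
    print(json.dumps(encode_large(offsets)))   # ["0", "3", "9007199254740993"]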
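
To poke at the new generators interactively, a usage sketch, assuming
dev/archery is on PYTHONPATH (the field names and sizes here are arbitrary):

    from archery.integration.datagen import (LargeListField, LargeStringField,
                                             get_field)

    # A nullable large_utf8 field, as exercised by
    # generate_primitive_large_offsets_case().
    field = LargeStringField('large_utf8_nullable')
    col = field.generate_column(size=5)

    # get_json() returns the column as an OrderedDict; note that the OFFSET
    # buffer holds decimal strings rather than JSON numbers.
    print(col.get_json())

    # Large lists nest like plain lists, per generate_nested_large_offsets_case().
    nested = LargeListField('large_list', get_field('item', 'int32'))
    print(nested.generate_column(size=3).get_json())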