From 5240b14d771d7dec472210fe91cad0d32c8911a7 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 14 Apr 2020 18:26:00 +0200 Subject: [PATCH 1/2] ARROW-8450: [Integration][C++] Implement large offsets types Implement integration tests for LargeList, LargeBinary and LargeString types. Enable them for C++ (only). --- cpp/src/arrow/ipc/json_integration_test.cc | 22 +-- cpp/src/arrow/ipc/json_internal.cc | 14 +- cpp/src/arrow/ipc/json_test.cc | 16 +- dev/archery/archery/cli.py | 8 +- dev/archery/archery/integration/datagen.py | 177 ++++++++++++++------- dev/archery/archery/integration/util.py | 25 ++- 6 files changed, 153 insertions(+), 109 deletions(-) diff --git a/cpp/src/arrow/ipc/json_integration_test.cc b/cpp/src/arrow/ipc/json_integration_test.cc index f2eea294951..4185cb37fb1 100644 --- a/cpp/src/arrow/ipc/json_integration_test.cc +++ b/cpp/src/arrow/ipc/json_integration_test.cc @@ -250,24 +250,12 @@ static const char* JSON_EXAMPLE = R"example( { "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - } + "nullable": true, "children": [] }, { "name": "bar", "type": {"name": "floatingpoint", "precision": "DOUBLE"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 64} - ] - } + "nullable": true, "children": [] } ] }, @@ -318,12 +306,6 @@ static const char* JSON_EXAMPLE2 = R"example( "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - }, "metadata": [ {"key": "converted_from_time32", "value": "true"} ] diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 133681cbd7b..72dabefce9e 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -335,10 +335,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } - Status Visit(const LargeStringType& type) { return WriteVarBytes("large_utf8", type); } - Status Visit(const LargeBinaryType& type) { - return WriteVarBytes("large_binary", type); - } + Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } + Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { return WritePrimitive("fixedsizebinary", type); } @@ -358,7 +356,7 @@ class SchemaWriter { } Status Visit(const LargeListType& type) { - WriteName("large_list", type); + WriteName("largelist", type); return Status::OK(); } @@ -932,9 +930,9 @@ static Status GetType(const RjObject& json_type, *type = utf8(); } else if (type_name == "binary") { *type = binary(); - } else if (type_name == "large_utf8") { + } else if (type_name == "largeutf8") { *type = large_utf8(); - } else if (type_name == "large_binary") { + } else if (type_name == "largebinary") { *type = large_binary(); } else if (type_name == "fixedsizebinary") { return GetFixedSizeBinary(json_type, type); @@ -957,7 +955,7 @@ static Status GetType(const RjObject& json_type, return Status::Invalid("List must have 
exactly one child"); } *type = list(children[0]); - } else if (type_name == "large_list") { + } else if (type_name == "largelist") { if (children.size() != 1) { return Status::Invalid("Large list must have exactly one child"); } diff --git a/cpp/src/arrow/ipc/json_test.cc b/cpp/src/arrow/ipc/json_test.cc index bfc2fab8dac..21a695ab711 100644 --- a/cpp/src/arrow/ipc/json_test.cc +++ b/cpp/src/arrow/ipc/json_test.cc @@ -337,24 +337,12 @@ TEST(TestJsonFileReadWrite, MinimalFormatExample) { { "name": "foo", "type": {"name": "int", "isSigned": true, "bitWidth": 32}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 32} - ] - } + "nullable": true, "children": [] }, { "name": "bar", "type": {"name": "floatingpoint", "precision": "DOUBLE"}, - "nullable": true, "children": [], - "typeLayout": { - "vectors": [ - {"type": "VALIDITY", "typeBitWidth": 1}, - {"type": "DATA", "typeBitWidth": 64} - ] - } + "nullable": true, "children": [] } ] }, diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 6c07e05a142..c8af1b261ec 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -346,7 +346,7 @@ def benchmark_list(ctx, rev_or_path, src, preserve, output, cmake_extras, logger.debug(f"Running benchmark {rev_or_path}") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_base = BenchmarkRunner.from_rev_or_path( src, root, rev_or_path, conf) @@ -399,7 +399,7 @@ def benchmark_run(ctx, rev_or_path, src, preserve, output, cmake_extras, logger.debug(f"Running benchmark {rev_or_path}") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_base = BenchmarkRunner.from_rev_or_path( src, root, rev_or_path, conf, @@ -497,7 +497,7 @@ def benchmark_diff(ctx, src, preserve, output, cmake_extras, f"{baseline} (baseline)") conf = CppBenchmarkRunner.default_configuration( - cmake_extras=cmake_extras, **kwargs) + cmake_extras=cmake_extras, **kwargs) runner_cont = BenchmarkRunner.from_rev_or_path( src, root, contender, conf, @@ -551,7 +551,7 @@ def _set_default(opt, default): @click.option('stop_on_error', '-x', '--stop-on-error', is_flag=True, default=False, help='Stop on first error') -@click.option('--gold_dirs', multiple=True, +@click.option('--gold-dirs', multiple=True, help="gold integration test file paths") @click.option('-k', '--match', help=("Substring for test names to include in run, " diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 0f81acc5f60..8f4e9c29ca5 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -24,12 +24,13 @@ import numpy as np -from .util import (frombytes, rands, tobytes, SKIP_ARROW, SKIP_FLIGHT) +from .util import (frombytes, tobytes, random_bytes, random_utf8, + SKIP_ARROW, SKIP_FLIGHT) class Field(object): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): self.name = name self.nullable = nullable self.metadata = metadata @@ -132,7 +133,7 @@ def generate_column(self, size, name=None): class IntegerField(PrimitiveField): - def __init__(self, name, is_signed, bit_width, nullable=True, + def __init__(self, name, is_signed, bit_width, *, nullable=True, metadata=[], min_value=TEST_INT_MIN, max_value=TEST_INT_MAX): @@ -188,7 +189,7 
@@ class DateField(IntegerField): MILLISECOND: [-62135596800000, 253402214400000] } - def __init__(self, name, unit, nullable=True, metadata=[]): + def __init__(self, name, unit, *, nullable=True, metadata=[]): bit_width = 32 if unit == self.DAY else 64 min_value, max_value = self._ranges[unit] @@ -230,7 +231,7 @@ class TimeField(IntegerField): 'ns': [0, 86400000000000] } - def __init__(self, name, unit='s', nullable=True, + def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] super(TimeField, self).__init__(name, True, self.BIT_WIDTHS[unit], @@ -258,7 +259,7 @@ class TimestampField(IntegerField): 'ns': [np.iinfo('int64').min, np.iinfo('int64').max] } - def __init__(self, name, unit='s', tz=None, nullable=True, + def __init__(self, name, unit='s', tz=None, *, nullable=True, metadata=[]): min_val, max_val = self._ranges[unit] super(TimestampField, self).__init__(name, True, 64, @@ -283,13 +284,13 @@ def _get_type(self): class DurationIntervalField(IntegerField): - def __init__(self, name, unit='s', nullable=True, + def __init__(self, name, unit='s', *, nullable=True, metadata=[]): min_val, max_val = np.iinfo('int64').min, np.iinfo('int64').max, super(DurationIntervalField, self).__init__( - name, True, 64, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, True, 64, + nullable=nullable, metadata=metadata, + min_value=min_val, max_value=max_val) self.unit = unit def _get_type(self): @@ -302,12 +303,12 @@ def _get_type(self): class YearMonthIntervalField(IntegerField): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): min_val, max_val = [-10000*12, 10000*12] # +/- 10000 years. super(YearMonthIntervalField, self).__init__( - name, True, 32, - nullable=nullable, metadata=metadata, - min_value=min_val, max_value=max_val) + name, True, 32, + nullable=nullable, metadata=metadata, + min_value=min_val, max_value=max_val) def _get_type(self): fields = [ @@ -319,7 +320,7 @@ def _get_type(self): class DayTimeIntervalField(PrimitiveField): - def __init__(self, name, nullable=True, metadata=[]): + def __init__(self, name, *, nullable=True, metadata=[]): super(DayTimeIntervalField, self).__init__(name, nullable=True, metadata=metadata) @@ -349,7 +350,7 @@ def generate_column(self, size, name=None): class FloatingPointField(PrimitiveField): - def __init__(self, name, bit_width, nullable=True, + def __init__(self, name, bit_width, *, nullable=True, metadata=[]): super(FloatingPointField, self).__init__(name, nullable=nullable, @@ -401,8 +402,8 @@ def decimal_range_from_precision(precision): class DecimalField(PrimitiveField): - def __init__(self, name, precision, scale, bit_width=128, nullable=True, - metadata=[]): + def __init__(self, name, precision, scale, bit_width=128, *, + nullable=True, metadata=[]): super(DecimalField, self).__init__(name, nullable=True, metadata=metadata) self.precision = precision @@ -471,17 +472,18 @@ def column_class(self): def _get_type(self): return OrderedDict([('name', 'binary')]) + def _random_sizes(self, size): + return np.random.exponential(scale=4, size=size).astype(np.int32) + def generate_column(self, size, name=None): - K = 7 is_valid = self._make_is_valid(size) values = [] - for i in range(size): + sizes = self._random_sizes(size) + + for i, nbytes in enumerate(sizes): if is_valid[i]: - draw = (np.random.randint(0, 255, size=K) - .astype(np.uint8) - .tostring()) - values.append(draw) + 
values.append(random_bytes(nbytes)) else: values.append(b"") @@ -490,9 +492,15 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeBinaryField(BinaryField): + + def _get_type(self): + return OrderedDict([('name', 'largebinary')]) + + class FixedSizeBinaryField(PrimitiveField): - def __init__(self, name, byte_width, nullable=True, + def __init__(self, name, byte_width, *, nullable=True, metadata=[]): super(FixedSizeBinaryField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -510,23 +518,12 @@ def _get_type(self): return OrderedDict([('name', 'fixedsizebinary'), ('byteWidth', self.byte_width)]) - def _get_type_layout(self): - return OrderedDict([ - ('vectors', - [OrderedDict([('type', 'VALIDITY'), - ('typeBitWidth', 1)]), - OrderedDict([('type', 'DATA'), - ('typeBitWidth', self.byte_width)])])]) - def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) values = [] for i in range(size): - draw = (np.random.randint(0, 255, size=self.byte_width) - .astype(np.uint8) - .tostring()) - values.append(draw) + values.append(random_bytes(self.byte_width)) if name is None: name = self.name @@ -549,7 +546,7 @@ def generate_column(self, size, name=None): for i in range(size): if is_valid[i]: - values.append(tobytes(rands(K))) + values.append(tobytes(random_utf8(K))) else: values.append(b"") @@ -558,6 +555,12 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeStringField(StringField): + + def _get_type(self): + return OrderedDict([('name', 'largeutf8')]) + + class Schema(object): def __init__(self, fields, metadata=None): @@ -604,7 +607,7 @@ def _get_buffers(self): class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): - return ''.join('{:02x}'.format(c).upper() for c in x) + return frombytes(binascii.hexlify(x).upper()) def _get_buffers(self): data = [] @@ -625,7 +628,7 @@ def _encode_value(self, x): class ListField(Field): - def __init__(self, name, value_field, nullable=True, + def __init__(self, name, value_field, *, nullable=True, metadata=[]): super(ListField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -660,6 +663,14 @@ def generate_column(self, size, name=None): return ListColumn(name, size, is_valid, offsets, values) +class LargeListField(ListField): + + def _get_type(self): + return OrderedDict([ + ('name', 'largelist') + ]) + + class ListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): @@ -680,22 +691,22 @@ def _get_children(self): class MapField(Field): - def __init__(self, name, key_field, item_field, nullable=True, - metadata=[], keysSorted=False): + def __init__(self, name, key_field, item_field, *, nullable=True, + metadata=[], keys_sorted=False): super(MapField, self).__init__(name, nullable=nullable, metadata=metadata) assert not key_field.nullable self.key_field = key_field self.item_field = item_field - self.pair_field = StructField( - 'entries', [key_field, item_field], False) - self.keysSorted = keysSorted + self.pair_field = StructField('entries', [key_field, item_field], + nullable=False) + self.keys_sorted = keys_sorted def _get_type(self): return OrderedDict([ ('name', 'map'), - ('keysSorted', self.keysSorted) + ('keysSorted', self.keys_sorted) ]) def _get_children(self): @@ -742,7 +753,7 @@ def _get_children(self): class FixedSizeListField(Field): - def __init__(self, name, value_field, list_size, nullable=True, + def __init__(self, name, value_field, 
list_size, *, nullable=True, metadata=[]): super(FixedSizeListField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -785,7 +796,7 @@ def _get_children(self): class StructField(Field): - def __init__(self, name, fields, nullable=True, + def __init__(self, name, fields, *, nullable=True, metadata=[]): super(StructField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -829,7 +840,7 @@ def get_json(self): class DictionaryField(Field): - def __init__(self, name, index_field, dictionary, nullable=True, + def __init__(self, name, index_field, dictionary, *, nullable=True, metadata=[]): super(DictionaryField, self).__init__(name, nullable=nullable, metadata=metadata) @@ -935,6 +946,10 @@ def get_field(name, type_, **kwargs): return BinaryField(name, **kwargs) elif type_ == 'utf8': return StringField(name, **kwargs) + elif type_ == 'largebinary': + return LargeBinaryField(name, **kwargs) + elif type_ == 'largeutf8': + return LargeStringField(name, **kwargs) elif type_.startswith('fixedsizebinary_'): byte_width = int(type_.split('_')[1]) return FixedSizeBinaryField(name, byte_width=byte_width, **kwargs) @@ -1020,6 +1035,18 @@ def generate_primitive_case(batch_sizes, name='primitive'): return _generate_file(name, fields, batch_sizes) +def generate_primitive_large_offsets_case(batch_sizes): + types = ['largebinary', 'largeutf8'] + + fields = [] + + for type_ in types: + fields.append(get_field(type_ + "_nullable", type_, nullable=True)) + fields.append(get_field(type_ + "_nonnullable", type_, nullable=False)) + + return _generate_file('primitive_large_offsets', fields, batch_sizes) + + def generate_null_case(batch_sizes): # Interleave null with non-null types to ensure the appropriate number of # buffers (0) is read and written @@ -1106,15 +1133,42 @@ def generate_nested_case(): get_field('item', 'int32'), 4), StructField('struct_nullable', [get_field('f1', 'int32'), get_field('f2', 'utf8')]), - - # TODO(wesm): this causes segfault - # ListField('list_nonnullable', get_field('item', 'int32'), False), + # Fails on Go (ARROW-8452) + # ListField('list_nonnullable', get_field('item', 'int32'), + # nullable=False), ] batch_sizes = [7, 10] return _generate_file("nested", fields, batch_sizes) +def generate_recursive_nested_case(): + fields = [ + ListField('lists_list', + ListField('inner_list', get_field('item', 'int16'))), + ListField('structs_list', + StructField('inner_struct', + [get_field('f1', 'int32'), + get_field('f2', 'utf8')])), + ] + + batch_sizes = [7, 10] + return _generate_file("recursive_nested", fields, batch_sizes) + + +def generate_nested_large_offsets_case(): + fields = [ + LargeListField('large_list_nullable', get_field('item', 'int32')), + LargeListField('large_list_nonnullable', + get_field('item', 'int32'), nullable=False), + LargeListField('large_list_nested', + ListField('inner_list', get_field('item', 'int16'))), + ] + + batch_sizes = [7, 10] + return _generate_file("nested_large_offsets", fields, batch_sizes) + + def generate_dictionary_case(): dict0 = Dictionary(0, StringField('dictionary1'), size=10, name='DICT0') dict1 = Dictionary(1, StringField('dictionary1'), size=5, name='DICT1') @@ -1140,9 +1194,9 @@ def generate_nested_dictionary_case(): dict1 = Dictionary(1, list_of_dict, size=30, name='DICT1') struct_of_dict = StructField('struct', [ - DictionaryField('str_dict_a', get_field('', 'int8'), dict0), - DictionaryField('str_dict_b', get_field('', 'int8'), dict0) - ]) + DictionaryField('str_dict_a', get_field('', 'int8'), dict0), + 
DictionaryField('str_dict_b', get_field('', 'int8'), dict0) + ]) dict2 = Dictionary(2, struct_of_dict, size=30, name='DICT2') fields = [ @@ -1166,6 +1220,11 @@ def _temp_path(): generate_primitive_case([17, 20], name='primitive'), generate_primitive_case([0, 0, 0], name='primitive_zerolength'), + generate_primitive_large_offsets_case([17, 20]) + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-6110) + .skip_category('JS'), + generate_null_case([10, 0]) .skip_category('JS') # TODO(ARROW-7900) .skip_category('Go'), # TODO(ARROW-7901) @@ -1187,6 +1246,14 @@ def _temp_path(): generate_nested_case(), + # TODO(ARROW-8453) + generate_recursive_nested_case().skip_category('Go'), + + generate_nested_large_offsets_case() + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-6111) + .skip_category('JS'), + generate_custom_metadata_case().skip_category('Go') .skip_category('Java') .skip_category('JS'), diff --git a/dev/archery/archery/integration/util.py b/dev/archery/archery/integration/util.py index e3f2542b1e8..a4c4982ecb3 100644 --- a/dev/archery/archery/integration/util.py +++ b/dev/archery/archery/integration/util.py @@ -18,8 +18,8 @@ import contextlib import io import os +import random import socket -import string import subprocess import sys import threading @@ -32,9 +32,6 @@ def guid(): return uuid.uuid4().hex -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), - dtype=(np.str_, 1)) - # SKIP categories SKIP_ARROW = 'arrow' SKIP_FLIGHT = 'flight' @@ -100,14 +97,26 @@ def cork(self): log = printer.print -def rands(nchars): +_RAND_CHARS = np.array(list("abcdefghijklmnop123456Ârrôwµ£°€矢"), dtype="U") + + +def random_utf8(nchars): """ - Generate one random byte string. + Generate one random UTF8 string. + """ + return ''.join(np.random.choice(_RAND_CHARS, nchars)) - See `rands_array` if you want to create an array of random strings. +def random_bytes(nbytes): + """ + Generate one random binary string. """ - return ''.join(np.random.choice(RANDS_CHARS, nchars)) + # NOTE getrandbits(0) fails + if nbytes > 0: + return random.getrandbits(nbytes * 8).to_bytes(nbytes, + byteorder='little') + else: + return b"" def tobytes(o): From 01beb13e79383733f9b5b42e06c48e68728df492 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Wed, 15 Apr 2020 17:53:53 +0200 Subject: [PATCH 2/2] Write large offsets as strings --- cpp/src/arrow/ipc/json_internal.cc | 48 +++++-- dev/archery/archery/integration/datagen.py | 142 ++++++++++++++------- 2 files changed, 137 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/ipc/json_internal.cc b/cpp/src/arrow/ipc/json_internal.cc index 72dabefce9e..3dfae2a5666 100644 --- a/cpp/src/arrow/ipc/json_internal.cc +++ b/cpp/src/arrow/ipc/json_internal.cc @@ -38,8 +38,10 @@ #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" +#include "arrow/util/formatting.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" +#include "arrow/util/parsing.h" #include "arrow/util/string.h" #include "arrow/visitor_inline.h" @@ -523,8 +525,21 @@ class ArrayWriter { void WriteIntegerField(const char* name, const T* values, int64_t length) { writer_->Key(name); writer_->StartArray(); - for (int i = 0; i < length; ++i) { - writer_->Int64(values[i]); + if (sizeof(T) < sizeof(int64_t)) { + for (int i = 0; i < length; ++i) { + writer_->Int64(values[i]); + } + } else { + // Represent 64-bit integers as strings, as JSON numbers cannot represent + // them exactly. 
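+      // (IEEE 754 doubles carry 53 mantissa bits, so integers above 2**53
+      // cannot survive a round-trip through a JSON number.)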
+      ::arrow::internal::StringFormatter<typename CTypeTraits<T>::ArrowType> formatter;
+      auto append = [this](util::string_view v) {
+        writer_->String(v.data(), static_cast<rj::SizeType>(v.size()));
+        return Status::OK();
+      };
+      for (int i = 0; i < length; ++i) {
+        DCHECK_OK(formatter(values[i], append));
+      }
     }
     writer_->EndArray();
   }
@@ -1297,13 +1312,28 @@ class ArrayReader {
     ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(length * sizeof(T), pool_));
 
     T* values = reinterpret_cast<T*>(buffer->mutable_data());
-    for (int i = 0; i < length; ++i) {
-      const rj::Value& val = json_array[i];
-      DCHECK(val.IsInt() || val.IsInt64());
-      if (val.IsInt()) {
-        values[i] = static_cast<T>(val.GetInt());
-      } else {
-        values[i] = static_cast<T>(val.GetInt64());
+    if (sizeof(T) < sizeof(int64_t)) {
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsInt() || val.IsInt64());
+        if (val.IsInt()) {
+          values[i] = static_cast<T>(val.GetInt());
+        } else {
+          values[i] = static_cast<T>(val.GetInt64());
+        }
+      }
+    } else {
+      // Read 64-bit integers as strings, as JSON numbers cannot represent
+      // them exactly.
+      ::arrow::internal::StringConverter<typename CTypeTraits<T>::ArrowType> converter;
+      for (int i = 0; i < length; ++i) {
+        const rj::Value& val = json_array[i];
+        DCHECK(val.IsString());
+        if (!converter(val.GetString(), val.GetStringLength(), &values[i])) {
+          return Status::Invalid("Failed to parse integer: '",
+                                 std::string(val.GetString(), val.GetStringLength()),
+                                 "'");
+        }
       }
     }
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index 8f4e9c29ca5..3c15abc2ab2 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -459,7 +459,13 @@ def generate_column(self, size, name=None):
         return PrimitiveColumn(name, size, is_valid, values)
 
 
-class BinaryField(PrimitiveField):
+class FixedSizeBinaryField(PrimitiveField):
+
+    def __init__(self, name, byte_width, *, nullable=True,
+                 metadata=[]):
+        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
+                                                   metadata=metadata)
+        self.byte_width = byte_width
 
     @property
     def numpy_type(self):
@@ -467,44 +473,25 @@ def numpy_type(self):
 
     @property
     def column_class(self):
-        return BinaryColumn
+        return FixedSizeBinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'binary')])
-
-    def _random_sizes(self, size):
-        return np.random.exponential(scale=4, size=size).astype(np.int32)
+        return OrderedDict([('name', 'fixedsizebinary'),
+                            ('byteWidth', self.byte_width)])
 
     def generate_column(self, size, name=None):
         is_valid = self._make_is_valid(size)
         values = []
 
-        sizes = self._random_sizes(size)
-
-        for i, nbytes in enumerate(sizes):
-            if is_valid[i]:
-                values.append(random_bytes(nbytes))
-            else:
-                values.append(b"")
+        for i in range(size):
+            values.append(random_bytes(self.byte_width))
 
         if name is None:
             name = self.name
         return self.column_class(name, size, is_valid, values)
 
 
-class LargeBinaryField(BinaryField):
-
-    def _get_type(self):
-        return OrderedDict([('name', 'largebinary')])
-
-
-class FixedSizeBinaryField(PrimitiveField):
-
-    def __init__(self, name, byte_width, *, nullable=True,
-                 metadata=[]):
-        super(FixedSizeBinaryField, self).__init__(name, nullable=nullable,
-                                                   metadata=metadata)
-        self.byte_width = byte_width
+class BinaryField(PrimitiveField):
 
     @property
     def numpy_type(self):
@@ -512,18 +499,25 @@ def numpy_type(self):
 
     @property
     def column_class(self):
-        return FixedSizeBinaryColumn
+        return BinaryColumn
 
     def _get_type(self):
-        return OrderedDict([('name', 'fixedsizebinary'),
-                            ('byteWidth', 
self.byte_width)]) + return OrderedDict([('name', 'binary')]) + + def _random_sizes(self, size): + return np.random.exponential(scale=4, size=size).astype(np.int32) def generate_column(self, size, name=None): is_valid = self._make_is_valid(size) values = [] - for i in range(size): - values.append(random_bytes(self.byte_width)) + sizes = self._random_sizes(size) + + for i, nbytes in enumerate(sizes): + if is_valid[i]: + values.append(random_bytes(nbytes)) + else: + values.append(b"") if name is None: name = self.name @@ -555,8 +549,22 @@ def generate_column(self, size, name=None): return self.column_class(name, size, is_valid, values) +class LargeBinaryField(BinaryField): + + @property + def column_class(self): + return LargeBinaryColumn + + def _get_type(self): + return OrderedDict([('name', 'largebinary')]) + + class LargeStringField(StringField): + @property + def column_class(self): + return LargeStringColumn + def _get_type(self): return OrderedDict([('name', 'largeutf8')]) @@ -578,7 +586,21 @@ def get_json(self): return OrderedDict(entries) -class BinaryColumn(PrimitiveColumn): +class _NarrowOffsetsMixin: + + def _encode_offsets(self, offsets): + return list(map(int, offsets)) + + +class _LargeOffsetsMixin: + + def _encode_offsets(self, offsets): + # 64-bit offsets have to be represented as strings to roundtrip + # through JSON. + return list(map(str, offsets)) + + +class _BaseBinaryColumn(PrimitiveColumn): def _encode_value(self, x): return frombytes(binascii.hexlify(x).upper()) @@ -599,11 +621,33 @@ def _get_buffers(self): return [ ('VALIDITY', [int(x) for x in self.is_valid]), - ('OFFSET', offsets), + ('OFFSET', self._encode_offsets(offsets)), ('DATA', data) ] +class _BaseStringColumn(_BaseBinaryColumn): + + def _encode_value(self, x): + return frombytes(x) + + +class BinaryColumn(_BaseBinaryColumn, _NarrowOffsetsMixin): + pass + + +class StringColumn(_BaseStringColumn, _NarrowOffsetsMixin): + pass + + +class LargeBinaryColumn(_BaseBinaryColumn, _LargeOffsetsMixin): + pass + + +class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin): + pass + + class FixedSizeBinaryColumn(PrimitiveColumn): def _encode_value(self, x): @@ -620,12 +664,6 @@ def _get_buffers(self): ] -class StringColumn(BinaryColumn): - - def _encode_value(self, x): - return frombytes(x) - - class ListField(Field): def __init__(self, name, value_field, *, nullable=True, @@ -634,6 +672,10 @@ def __init__(self, name, value_field, *, nullable=True, metadata=metadata) self.value_field = value_field + @property + def column_class(self): + return ListColumn + def _get_type(self): return OrderedDict([ ('name', 'list') @@ -660,21 +702,25 @@ def generate_column(self, size, name=None): if name is None: name = self.name - return ListColumn(name, size, is_valid, offsets, values) + return self.column_class(name, size, is_valid, offsets, values) class LargeListField(ListField): + @property + def column_class(self): + return LargeListColumn + def _get_type(self): return OrderedDict([ ('name', 'largelist') ]) -class ListColumn(Column): +class _BaseListColumn(Column): def __init__(self, name, count, is_valid, offsets, values): - super(ListColumn, self).__init__(name, count) + super().__init__(name, count) self.is_valid = is_valid self.offsets = offsets self.values = values @@ -682,13 +728,21 @@ def __init__(self, name, count, is_valid, offsets, values): def _get_buffers(self): return [ ('VALIDITY', [int(v) for v in self.is_valid]), - ('OFFSET', list(self.offsets)) + ('OFFSET', self._encode_offsets(self.offsets)) ] def 
_get_children(self): return [self.values.get_json()] +class ListColumn(_BaseListColumn, _NarrowOffsetsMixin): + pass + + +class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin): + pass + + class MapField(Field): def __init__(self, name, key_field, item_field, *, nullable=True, @@ -1165,7 +1219,7 @@ def generate_nested_large_offsets_case(): ListField('inner_list', get_field('item', 'int16'))), ] - batch_sizes = [7, 10] + batch_sizes = [0, 13] return _generate_file("nested_large_offsets", fields, batch_sizes)
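
Why the offsets become strings: most JSON implementations parse numbers as
IEEE 754 doubles, which are only exact up to 2**53. The following standalone
sketch (not part of the patch; the sample offsets are invented) shows the
hazard and mirrors what the two offset-encoding mixins emit:

    import json

    # A double's 53-bit mantissa cannot represent this integer exactly; a JSON
    # consumer that parses numbers as doubles would read back 2**53 instead.
    big = 2**53 + 1
    assert float(big) == float(2**53)

    def encode_narrow(offsets):
        # Mirrors _NarrowOffsetsMixin._encode_offsets: plain JSON numbers.
        return [int(x) for x in offsets]

    def encode_large(offsets):
        # Mirrors _LargeOffsetsMixin._encode_offsets: decimal strings, which
        # any JSON implementation preserves exactly.
        return [str(x) for x in offsets]

    offsets = [0, 3, big]
    print(json.dumps(encode_narrow(offsets)))  # [0, 3, 9007199254740993]
    print(json.dumps(encode_large(offsets)))   # ["0", "3", "9007199254740993"]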
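
To poke at the new generators interactively, a usage sketch, assuming
dev/archery is on PYTHONPATH (the field names and sizes here are arbitrary):

    from archery.integration.datagen import (LargeListField, LargeStringField,
                                             get_field)

    # A nullable large_utf8 field, as exercised by
    # generate_primitive_large_offsets_case().
    field = LargeStringField('large_utf8_nullable')
    col = field.generate_column(size=5)

    # get_json() returns the column as an OrderedDict; note that the OFFSET
    # buffer holds decimal strings rather than JSON numbers.
    print(col.get_json())

    # Large lists nest like plain lists, per generate_nested_large_offsets_case().
    nested = LargeListField('large_list', get_field('item', 'int32'))
    print(nested.generate_column(size=3).get_json())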