Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
56f7134
GH-39013: [Go][Integration] Support cABI import/export of StringView …
bkietz Dec 1, 2023
5dd6ea5
amend integration JSON field names
bkietz Dec 1, 2023
9766f85
debugging, maybe
bkietz Dec 1, 2023
a276351
add datagen.py support for list view
bkietz Dec 8, 2023
b89bd69
debugging session
bkietz Dec 11, 2023
8e80f5a
fix failing binaryview test
zeroshade Dec 11, 2023
c656338
add c ABI support for list view
bkietz Dec 12, 2023
0c33184
ensure offsets under null bits are not ignored
bkietz Dec 12, 2023
33a679f
move GetBytes, GetData and other utilities to type_traits.go
bkietz Dec 12, 2023
44d8f73
fmt
bkietz Dec 12, 2023
8e4a09c
GetOffsets
bkietz Dec 12, 2023
2505971
unused import
bkietz Dec 12, 2023
7005868
GetData/GetBytes with empty slices
bkietz Dec 13, 2023
0678158
fmt
bkietz Dec 13, 2023
14fe8ec
ensure slices have full capacity
bkietz Dec 14, 2023
12dd7ea
rename JSON fields
bkietz Dec 14, 2023
062dade
replace null skipping in minOffset/maxEnd
bkietz Dec 14, 2023
623c9f9
remove usage of minOffset/maxEnd from ipc writer
bkietz Dec 14, 2023
6f14220
guard for nil buffers
bkietz Dec 14, 2023
6d3c69b
remove unused function
bkietz Dec 14, 2023
952001f
get list_view's offset buffer size right
bkietz Dec 14, 2023
97edc75
revert autoformat noise
bkietz Dec 14, 2023
03a12f9
randomize offsets
bkietz Dec 15, 2023
b923766
update Integration.rst
bkietz Dec 15, 2023
e10341a
review comments
bkietz Dec 15, 2023
cb6a4ef
avoid use of reflect.SliceHeader
bkietz Dec 15, 2023
297f456
fix listview slicing optimization
zeroshade Dec 15, 2023
e3467e9
Add pre-condition comment back
felipecrv Dec 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 92 additions & 1 deletion dev/archery/archery/integration/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,83 @@ class LargeListColumn(_BaseListColumn, _LargeOffsetsMixin):
pass


class ListViewField(Field):
    """Integration-test field whose JSON type name is ``listview``.

    The field owns a single child field describing the list elements and
    builds columns of the class returned by ``column_class``.
    """

    def __init__(self, name, value_field, *, nullable=True,
                 metadata=None):
        """Remember the element field; the rest is forwarded to ``Field``."""
        super().__init__(name, nullable=nullable, metadata=metadata)
        self.value_field = value_field

    @property
    def column_class(self):
        """Column type instantiated by ``generate_column``."""
        return ListViewColumn

    def _get_type(self):
        """Return the JSON type descriptor, ``{"name": "listview"}``."""
        # NOTE(review): a reviewer pointed out that OrderedDict is not
        # needed anymore, as regular dicts are now ordered by default.
        # It is kept here so the returned type is unchanged.
        type_desc = OrderedDict()
        type_desc['name'] = 'listview'
        return type_desc

    def _get_children(self):
        """Return the JSON description of the single element field."""
        child_json = self.value_field.get_json()
        return [child_json]

    def generate_column(self, size, name=None):
        """Generate a random list-view column with ``size`` entries.

        Offsets and sizes are drawn independently, so the generated views
        may overlap and appear in any order. Each size is at most 4 and
        each offset is at most ``4 * size - 4``, hence every view ends
        within the ``4 * size`` generated child values.

        ``name`` overrides the field's own name when given.
        """
        longest_list = 4
        child_length = size * longest_list

        is_valid = self._make_is_valid(size)

        # np.random.randint excludes its upper bound, hence the "+ 1".
        offset_bound = child_length - longest_list + 1
        offsets = np.random.randint(0, offset_bound, size=size)
        sizes = np.random.randint(0, longest_list + 1, size=size)

        values = self.value_field.generate_column(child_length)

        column_name = self.name if name is None else name
        return self.column_class(
            column_name, size, is_valid, offsets, sizes, values)


class LargeListViewField(ListViewField):
    """``ListViewField`` variant whose JSON type name is ``largelistview``.

    Only the column class and the type descriptor differ from the parent.
    """

    @property
    def column_class(self):
        """Column type instantiated by the inherited ``generate_column``."""
        return LargeListViewColumn

    def _get_type(self):
        """Return the JSON type descriptor, ``{"name": "largelistview"}``."""
        type_desc = OrderedDict()
        type_desc['name'] = 'largelistview'
        return type_desc


class _BaseListViewColumn(Column):
    """Shared implementation for list-view columns.

    Stores validity flags, per-entry offsets and sizes, and the child
    values column. ``_encode_offsets`` is not defined here; the concrete
    subclasses obtain it from an offsets mixin.
    """

    def __init__(self, name, count, is_valid, offsets, sizes, values):
        """Forward ``name`` and ``count`` to ``Column`` and keep the rest."""
        super().__init__(name, count)
        self.is_valid = is_valid
        self.offsets = offsets
        self.sizes = sizes
        self.values = values

    def _get_buffers(self):
        """Return the VALIDITY, OFFSET and SIZE buffers as (name, data)."""
        validity = [int(flag) for flag in self.is_valid]
        # Sizes go through the same encoder as offsets.
        encoded_offsets = self._encode_offsets(self.offsets)
        encoded_sizes = self._encode_offsets(self.sizes)
        return [
            ('VALIDITY', validity),
            ('OFFSET', encoded_offsets),
            ('SIZE', encoded_sizes),
        ]

    def _get_children(self):
        """Return the JSON form of the child values column."""
        child_json = self.values.get_json()
        return [child_json]


class ListViewColumn(_BaseListViewColumn, _NarrowOffsetsMixin):
    """List-view column combining the base column with _NarrowOffsetsMixin."""


class LargeListViewColumn(_BaseListViewColumn, _LargeOffsetsMixin):
    """List-view column combining the base column with _LargeOffsetsMixin."""


class MapField(Field):

def __init__(self, name, key_field, item_field, *, nullable=True,
Expand Down Expand Up @@ -1663,6 +1740,15 @@ def generate_binary_view_case():
return _generate_file("binary_view", fields, batch_sizes)


def generate_list_view_case():
    """Build the "list_view" integration file.

    The file holds one list-view and one large-list-view field, both with
    float32 items, generated for batch sizes 0, 7 and 256.
    """
    item_type = 'float32'
    list_view = ListViewField('lv', get_field('item', item_type))
    large_list_view = LargeListViewField('llv', get_field('item', item_type))
    return _generate_file(
        "list_view", [list_view, large_list_view], [0, 7, 256])


def generate_nested_large_offsets_case():
fields = [
LargeListField('large_list_nullable', get_field('item', 'int32')),
Expand Down Expand Up @@ -1847,7 +1933,12 @@ def _temp_path():

generate_binary_view_case()
.skip_tester('C#')
.skip_tester('Go')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),

generate_list_view_case()
.skip_tester('C#')
.skip_tester('Java')
.skip_tester('JS')
.skip_tester('Rust'),
Expand Down
2 changes: 2 additions & 0 deletions dev/archery/archery/integration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def _run_test_cases(self,
``case_runner`` ran against ``test_cases``
"""
def case_wrapper(test_case):
if serial:
return case_runner(test_case)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this addition deliberate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes: printer.cork() was failing to dump output when the test segfaulted. In any case, corking the printer adds no value when we're running in serial.

with printer.cork():
return case_runner(test_case)

Expand Down
23 changes: 18 additions & 5 deletions docs/source/format/Integration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ considered equivalent to ``[]`` (no metadata). Duplicated keys are not forbidden
**Type**: ::

{
"name" : "null|struct|list|largelist|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map"
"name" : "null|struct|list|largelist|listview|largelistview|fixedsizelist|union|int|floatingpoint|utf8|largeutf8|binary|largebinary|utf8view|binaryview|fixedsizebinary|bool|decimal|date|time|timestamp|interval|duration|map|runendencoded"
}

A ``Type`` will have other fields as defined in
Expand Down Expand Up @@ -446,12 +446,22 @@ or ``DATA``.

``BufferData`` is encoded based on the type of buffer:

* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable
* ``VALIDITY``: a JSON array of 1 (valid) and 0 (null). Data for non-nullable
``Field`` still has a ``VALIDITY`` array, even though all values are 1.
* ``OFFSET``: a JSON array of integers for 32-bit offsets or
string-formatted integers for 64-bit offsets
* ``TYPE_ID``: a JSON array of integers
* ``DATA``: a JSON array of encoded values
string-formatted integers for 64-bit offsets.
* ``TYPE_ID``: a JSON array of integers.
* ``DATA``: a JSON array of encoded values.
* ``VARIADIC_DATA_BUFFERS``: a JSON array of data buffers represented as
hex encoded strings.
* ``VIEWS``: a JSON array of encoded views, which are JSON objects with:
* ``SIZE``: an integer indicating the size of the view,
* ``INLINED``: an encoded value (this field will be present if ``SIZE``
is at most 12, otherwise the next three fields will be present),
* ``PREFIX_HEX``: the first four bytes of the view encoded as hex,
* ``BUFFER_INDEX``: the index in ``VARIADIC_DATA_BUFFERS`` of the buffer
viewed,
* ``OFFSET``: the offset in the buffer viewed.

The value encoding for ``DATA`` is different depending on the logical
type:
Expand Down Expand Up @@ -527,6 +537,9 @@ in ``datagen.py``):
- Signed indices
- Unsigned indices
- Nested dictionaries
* Run end encoded
* Binary view and string view
* List view and large list view
* Extension Types


Expand Down
6 changes: 3 additions & 3 deletions go/arrow/array/encoded.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,19 +150,19 @@ func (r *RunEndEncoded) LogicalRunEndsArray(mem memory.Allocator) arrow.Array {
case *Int16:
for _, v := range e.Int16Values()[physOffset : physOffset+physLength] {
v -= int16(r.data.offset)
v = int16(utils.MinInt(int(v), r.data.length))
v = int16(utils.Min(int(v), r.data.length))
bldr.(*Int16Builder).Append(v)
}
case *Int32:
for _, v := range e.Int32Values()[physOffset : physOffset+physLength] {
v -= int32(r.data.offset)
v = int32(utils.MinInt(int(v), r.data.length))
v = int32(utils.Min(int(v), r.data.length))
bldr.(*Int32Builder).Append(v)
}
case *Int64:
for _, v := range e.Int64Values()[physOffset : physOffset+physLength] {
v -= int64(r.data.offset)
v = int64(utils.MinInt(int(v), r.data.length))
v = int64(utils.Min(int(v), r.data.length))
bldr.(*Int64Builder).Append(v)
}
}
Expand Down
Loading