From d88fc912143608c86e5f2b275326996cddba3a7c Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 19 May 2023 16:36:41 -0400 Subject: [PATCH 01/37] flatbuffer regen and add types --- go/arrow/array/binary.go | 106 ++++++++++++ go/arrow/array/compare.go | 12 ++ go/arrow/array/string.go | 98 +++++++++++ go/arrow/datatype.go | 4 + go/arrow/datatype_binary.go | 156 ++++++++++++++++++ go/arrow/internal/flatbuf/Binary.go | 7 + go/arrow/internal/flatbuf/BodyCompression.go | 7 + go/arrow/internal/flatbuf/Bool.go | 7 + go/arrow/internal/flatbuf/Date.go | 7 + go/arrow/internal/flatbuf/Decimal.go | 7 + go/arrow/internal/flatbuf/DictionaryBatch.go | 7 + .../internal/flatbuf/DictionaryEncoding.go | 7 + go/arrow/internal/flatbuf/Duration.go | 7 + go/arrow/internal/flatbuf/Field.go | 7 + go/arrow/internal/flatbuf/FixedSizeBinary.go | 7 + go/arrow/internal/flatbuf/FixedSizeList.go | 7 + go/arrow/internal/flatbuf/FloatingPoint.go | 7 + go/arrow/internal/flatbuf/Footer.go | 7 + go/arrow/internal/flatbuf/Int.go | 7 + go/arrow/internal/flatbuf/Interval.go | 7 + go/arrow/internal/flatbuf/KeyValue.go | 7 + go/arrow/internal/flatbuf/LargeBinary.go | 7 + go/arrow/internal/flatbuf/LargeList.go | 7 + go/arrow/internal/flatbuf/LargeUtf8.go | 7 + go/arrow/internal/flatbuf/List.go | 7 + go/arrow/internal/flatbuf/Map.go | 7 + go/arrow/internal/flatbuf/Message.go | 7 + go/arrow/internal/flatbuf/Null.go | 7 + go/arrow/internal/flatbuf/RecordBatch.go | 7 + go/arrow/internal/flatbuf/RunEndEncoded.go | 7 + go/arrow/internal/flatbuf/Schema.go | 7 + .../internal/flatbuf/SparseMatrixIndexCSX.go | 7 + go/arrow/internal/flatbuf/SparseTensor.go | 7 + .../internal/flatbuf/SparseTensorIndexCOO.go | 7 + .../internal/flatbuf/SparseTensorIndexCSF.go | 7 + go/arrow/internal/flatbuf/Struct_.go | 7 + go/arrow/internal/flatbuf/Tensor.go | 7 + go/arrow/internal/flatbuf/TensorDim.go | 7 + go/arrow/internal/flatbuf/Time.go | 7 + go/arrow/internal/flatbuf/Timestamp.go | 7 + go/arrow/internal/flatbuf/Union.go | 7 + 
go/arrow/internal/flatbuf/Utf8.go | 7 + go/arrow/type_traits_string_view.go | 63 +++++++ 43 files changed, 698 insertions(+) create mode 100644 go/arrow/type_traits_string_view.go diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index e9e6e66e7e8..ac03e60a88e 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -24,6 +24,7 @@ import ( "unsafe" "github.com/apache/arrow/go/v14/arrow" + "github.com/apache/arrow/go/v14/arrow/memory" "github.com/apache/arrow/go/v14/internal/json" ) @@ -318,6 +319,111 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool { return true } +type BinaryView struct { + array + values []arrow.StringHeader + dataBuffers []*memory.Buffer +} + +func NewBinaryViewData(data arrow.ArrayData) *BinaryView { + a := &BinaryView{} + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +func (a *BinaryView) setData(data *Data) { + if len(data.buffers) < 2 { + panic("len(data.buffers) < 2") + } + a.array.setData(data) + + if valueData := data.buffers[1]; valueData != nil { + a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes()) + } + + a.dataBuffers = data.buffers[2:] +} + +func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return &a.values[a.array.data.offset+i] +} + +func (a *BinaryView) Value(i int) []byte { + s := a.ValueHeader(i) + if s.IsInline() { + return s.InlineBytes() + } + start := s.BufferOffset() + buf := a.dataBuffers[s.BufferIndex()] + return buf.Bytes()[start : start+uint32(s.Len())] +} + +func (a *BinaryView) ValueString(i int) string { + b := a.Value(i) + return *(*string)(unsafe.Pointer(&b)) +} + +func (a *BinaryView) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(NullValueStr) + default: + fmt.Fprintf(&o, "%q", a.ValueString(i)) + } + } + 
o.WriteString("]") + return o.String() +} + +func (a *BinaryView) ValueStr(i int) string { + if a.IsNull(i) { + return NullValueStr + } + return base64.StdEncoding.EncodeToString(a.Value(i)) +} + +func (a *BinaryView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func (a *BinaryView) MarshalJSON() ([]byte, error) { + vals := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + vals[i] = a.GetOneForMarshal(i) + } + // golang marshal standard says that []byte will be marshalled + // as a base64-encoded string + return json.Marshal(vals) +} + +func arrayEqualBinaryView(left, right *BinaryView) bool { + leftBufs, rightBufs := left.dataBuffers, right.dataBuffers + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { + return false + } + } + return true +} + var ( _ arrow.Array = (*Binary)(nil) + _ arrow.Array = (*LargeBinary)(nil) + _ arrow.Array = (*BinaryView)(nil) ) diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go index e70716bee91..389cbe56df5 100644 --- a/go/arrow/array/compare.go +++ b/go/arrow/array/compare.go @@ -232,6 +232,12 @@ func Equal(left, right arrow.Array) bool { case *LargeString: r := right.(*LargeString) return arrayEqualLargeString(l, r) + case *BinaryView: + r := right.(*BinaryView) + return arrayEqualBinaryView(l, r) + case *StringView: + r := right.(*StringView) + return arrayEqualStringView(l, r) case *Int8: r := right.(*Int8) return arrayEqualInt8(l, r) @@ -482,6 +488,12 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool { case *LargeString: r := right.(*LargeString) return arrayEqualLargeString(l, r) + case *BinaryView: + r := right.(*BinaryView) + return arrayEqualBinaryView(l, r) + case *StringView: + r := right.(*StringView) + return arrayEqualStringView(l, r) case *Int8: r := right.(*Int8) return arrayEqualInt8(l, r) diff --git 
a/go/arrow/array/string.go b/go/arrow/array/string.go index 86e27c970cb..4b7f5293183 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -310,6 +310,103 @@ func arrayEqualLargeString(left, right *LargeString) bool { return true } +type StringView struct { + array + values []arrow.StringHeader + dataBuffers []*memory.Buffer +} + +func NewStringViewData(data arrow.ArrayData) *StringView { + a := &StringView{} + a.refCount = 1 + a.setData(data.(*Data)) + return a +} + +func (a *StringView) setData(data *Data) { + if len(data.buffers) < 2 { + panic("len(data.buffers) < 2") + } + a.array.setData(data) + + if valueData := data.buffers[1]; valueData != nil { + a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes()) + } + + a.dataBuffers = data.buffers[2:] +} + +func (a *StringView) ValueHeader(i int) *arrow.StringHeader { + if i < 0 || i >= a.array.data.length { + panic("arrow/array: index out of range") + } + return &a.values[a.array.data.offset+i] +} + +func (a *StringView) Value(i int) string { + s := a.ValueHeader(i) + if s.IsInline() { + return s.InlineData() + } + start := s.BufferOffset() + buf := a.dataBuffers[s.BufferIndex()] + value := buf.Bytes()[start : start+uint32(s.Len())] + return *(*string)(unsafe.Pointer(&value)) +} + +func (a *StringView) String() string { + var o strings.Builder + o.WriteString("[") + for i := 0; i < a.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(NullValueStr) + default: + fmt.Fprintf(&o, "%q", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *StringView) ValueStr(i int) string { + if a.IsNull(i) { + return NullValueStr + } + return a.Value(i) +} + +func (a *StringView) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func (a *StringView) MarshalJSON() ([]byte, error) { + vals := make([]interface{}, a.Len()) + for i := 0; i < a.Len(); i++ { + vals[i] = a.GetOneForMarshal(i) + 
} + return json.Marshal(vals) +} + +func arrayEqualStringView(left, right *StringView) bool { + leftBufs, rightBufs := left.dataBuffers, right.dataBuffers + for i := 0; i < left.Len(); i++ { + if left.IsNull(i) { + continue + } + if !left.ValueHeader(i).Equals(leftBufs, right.ValueHeader(i), rightBufs) { + return false + } + } + return true +} + // A StringBuilder is used to build a String array using the Append methods. type StringBuilder struct { *BinaryBuilder @@ -514,6 +611,7 @@ type StringLikeBuilder interface { var ( _ arrow.Array = (*String)(nil) _ arrow.Array = (*LargeString)(nil) + _ arrow.Array = (*StringView)(nil) _ Builder = (*StringBuilder)(nil) _ Builder = (*LargeStringBuilder)(nil) _ StringLikeBuilder = (*StringBuilder)(nil) diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index f0fb24ec873..2cd18fd53bc 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -272,6 +272,10 @@ func (b BufferSpec) Equals(other BufferSpec) bool { type DataTypeLayout struct { Buffers []BufferSpec HasDict bool + // if this is non-nil, the number of buffers expected is only + // lower-bounded by len(buffers). Buffers beyond this lower bound + // are expected to conform to this variadic spec. 
+ VariadicSpec *BufferSpec } func SpecFixedWidth(w int) BufferSpec { return BufferSpec{KindFixedWidth, w} } diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index a3a85686450..479d6b3b147 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -16,6 +16,16 @@ package arrow +import ( + "bytes" + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v14/arrow/endian" + "github.com/apache/arrow/go/v14/arrow/internal/debug" + "github.com/apache/arrow/go/v14/arrow/memory" +) + // OffsetTraits is a convenient interface over the various type traits // constants such as arrow.Int32Traits allowing types with offsets, like // BinaryType, StringType, LargeBinaryType and LargeStringType to have @@ -83,16 +93,162 @@ func (t *LargeStringType) Layout() DataTypeLayout { func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits } func (LargeStringType) IsUtf8() bool { return true } +const ( + StringHeaderPrefixLen = 4 + stringHeaderInlineSize = 12 +) + +// StringHeader is a variable length string (utf8) or byte slice with +// a 4 byte prefix and inline optimization for small values (12 bytes +// or fewer). This is similar to Go's standard string but limited by +// a length of Uint32Max and up to the first four bytes of the string +// are copied into the struct. This prefix allows failing comparisons +// early and can reduce CPU cache working set when dealing with short +// strings. +// +// There are two situations: +// +// Short string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix remaining in-line portion, zero padded +// +// Long string |----|----|----|----| +// ^ ^ ^ ^ +// | | | | +// size prefix buffer index and offset to out-of-line portion +// +// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. 
+// +// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +type StringHeader struct { + size uint32 + // the first 4 bytes of this are the prefix for the string + // if size <= stringHeaderInlineSize, then the entire string + // is in the data array and is zero padded. + // if size > stringHeaderInlineSize, bytes 4-7 hold the uint32 buffer + // index and bytes 8-11 the uint32 offset in that buffer of the full + // string; both are stored in native byte order after the prefix. + data [stringHeaderInlineSize]byte +} + +func (sh *StringHeader) IsInline() bool { + return sh.size <= uint32(stringHeaderInlineSize) +} + +func (sh *StringHeader) Len() int { return int(sh.size) } +func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte { + return *(*[4]byte)(unsafe.Pointer(&sh.data)) +} + +func (sh *StringHeader) BufferIndex() uint32 { + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:]) +} + +func (sh *StringHeader) BufferOffset() uint32 { + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:]) +} + +func (sh *StringHeader) InlineBytes() (data []byte) { + debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader") + return sh.data[:sh.size] +} + +func (sh *StringHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") + h := (*reflect.StringHeader)(unsafe.Pointer(&data)) + h.Data = uintptr(unsafe.Pointer(&sh.data)) + h.Len = int(sh.size) + return +} + +func (sh *StringHeader) SetBytes(data []byte) int { + sh.size = uint32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *StringHeader) SetString(data string) int { + sh.size = uint32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) { + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex) + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset) +} + +func (sh *StringHeader) Equals(buffers []*memory.Buffer, other
*StringHeader, otherBuffers []*memory.Buffer) bool { + if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() { + return false + } + + if sh.IsInline() { + return sh.inlinedAsInt64() == other.inlinedAsInt64() + } + + data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size] + otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size] + return bytes.Equal(data, otherData) +} + +func (sh *StringHeader) inlinedAsInt64() int64 { + s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[1] +} + +func (sh *StringHeader) sizeAndPrefixAsInt() int64 { + s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[0] +} + +type BinaryViewType struct{} + +func (*BinaryViewType) ID() Type { return BINARY_VIEW } +func (*BinaryViewType) Name() string { return "binary_view" } +func (*BinaryViewType) String() string { return "binary_view" } +func (*BinaryViewType) IsUtf8() bool { return false } +func (*BinaryViewType) binary() {} +func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) } +func (*BinaryViewType) Layout() DataTypeLayout { + variadic := SpecVariableWidth() + return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), + SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic} +} + +type StringViewType struct{} + +func (*StringViewType) ID() Type { return STRING_VIEW } +func (*StringViewType) Name() string { return "string_view" } +func (*StringViewType) String() string { return "string_view" } +func (*StringViewType) IsUtf8() bool { return true } +func (*StringViewType) binary() {} +func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) } +func (*StringViewType) Layout() DataTypeLayout { + variadic := SpecVariableWidth() + return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), + SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic} +} + var ( BinaryTypes = struct { Binary BinaryDataType String BinaryDataType LargeBinary 
BinaryDataType LargeString BinaryDataType + BinaryView BinaryDataType + StringView BinaryDataType }{ Binary: &BinaryType{}, String: &StringType{}, LargeBinary: &LargeBinaryType{}, LargeString: &LargeStringType{}, + BinaryView: &BinaryViewType{}, + StringView: &StringViewType{}, } ) diff --git a/go/arrow/internal/flatbuf/Binary.go b/go/arrow/internal/flatbuf/Binary.go index e8018e74c41..ca16f2d71c3 100644 --- a/go/arrow/internal/flatbuf/Binary.go +++ b/go/arrow/internal/flatbuf/Binary.go @@ -34,6 +34,13 @@ func GetRootAsBinary(buf []byte, offset flatbuffers.UOffsetT) *Binary { return x } +func GetSizePrefixedRootAsBinary(buf []byte, offset flatbuffers.UOffsetT) *Binary { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Binary{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Binary) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/BodyCompression.go b/go/arrow/internal/flatbuf/BodyCompression.go index 6468e231352..036d8d2936a 100644 --- a/go/arrow/internal/flatbuf/BodyCompression.go +++ b/go/arrow/internal/flatbuf/BodyCompression.go @@ -36,6 +36,13 @@ func GetRootAsBodyCompression(buf []byte, offset flatbuffers.UOffsetT) *BodyComp return x } +func GetSizePrefixedRootAsBodyCompression(buf []byte, offset flatbuffers.UOffsetT) *BodyCompression { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &BodyCompression{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *BodyCompression) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Bool.go b/go/arrow/internal/flatbuf/Bool.go index 6a4a9d26867..a4a6596b5b7 100644 --- a/go/arrow/internal/flatbuf/Bool.go +++ b/go/arrow/internal/flatbuf/Bool.go @@ -33,6 +33,13 @@ func GetRootAsBool(buf []byte, offset flatbuffers.UOffsetT) *Bool { return x } +func GetSizePrefixedRootAsBool(buf []byte, 
offset flatbuffers.UOffsetT) *Bool { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Bool{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Bool) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Date.go b/go/arrow/internal/flatbuf/Date.go index 32983ec54cc..ac28583a5d9 100644 --- a/go/arrow/internal/flatbuf/Date.go +++ b/go/arrow/internal/flatbuf/Date.go @@ -39,6 +39,13 @@ func GetRootAsDate(buf []byte, offset flatbuffers.UOffsetT) *Date { return x } +func GetSizePrefixedRootAsDate(buf []byte, offset flatbuffers.UOffsetT) *Date { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Date{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Date) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Decimal.go b/go/arrow/internal/flatbuf/Decimal.go index c9de254d1dc..096ca08a766 100644 --- a/go/arrow/internal/flatbuf/Decimal.go +++ b/go/arrow/internal/flatbuf/Decimal.go @@ -37,6 +37,13 @@ func GetRootAsDecimal(buf []byte, offset flatbuffers.UOffsetT) *Decimal { return x } +func GetSizePrefixedRootAsDecimal(buf []byte, offset flatbuffers.UOffsetT) *Decimal { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Decimal{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Decimal) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/DictionaryBatch.go b/go/arrow/internal/flatbuf/DictionaryBatch.go index 25b5384e46a..ed6f060b2db 100644 --- a/go/arrow/internal/flatbuf/DictionaryBatch.go +++ b/go/arrow/internal/flatbuf/DictionaryBatch.go @@ -39,6 +39,13 @@ func GetRootAsDictionaryBatch(buf []byte, offset flatbuffers.UOffsetT) *Dictiona return x } +func GetSizePrefixedRootAsDictionaryBatch(buf []byte, offset flatbuffers.UOffsetT) 
*DictionaryBatch { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &DictionaryBatch{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *DictionaryBatch) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/DictionaryEncoding.go b/go/arrow/internal/flatbuf/DictionaryEncoding.go index a9b09530b2a..dbfdfad771d 100644 --- a/go/arrow/internal/flatbuf/DictionaryEncoding.go +++ b/go/arrow/internal/flatbuf/DictionaryEncoding.go @@ -33,6 +33,13 @@ func GetRootAsDictionaryEncoding(buf []byte, offset flatbuffers.UOffsetT) *Dicti return x } +func GetSizePrefixedRootAsDictionaryEncoding(buf []byte, offset flatbuffers.UOffsetT) *DictionaryEncoding { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &DictionaryEncoding{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *DictionaryEncoding) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Duration.go b/go/arrow/internal/flatbuf/Duration.go index 57b7b2a037f..efb8b2721ff 100644 --- a/go/arrow/internal/flatbuf/Duration.go +++ b/go/arrow/internal/flatbuf/Duration.go @@ -33,6 +33,13 @@ func GetRootAsDuration(buf []byte, offset flatbuffers.UOffsetT) *Duration { return x } +func GetSizePrefixedRootAsDuration(buf []byte, offset flatbuffers.UOffsetT) *Duration { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Duration{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Duration) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Field.go b/go/arrow/internal/flatbuf/Field.go index c03cf2f878b..8263a219e5f 100644 --- a/go/arrow/internal/flatbuf/Field.go +++ b/go/arrow/internal/flatbuf/Field.go @@ -36,6 +36,13 @@ func GetRootAsField(buf []byte, offset flatbuffers.UOffsetT) *Field { return 
x } +func GetSizePrefixedRootAsField(buf []byte, offset flatbuffers.UOffsetT) *Field { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Field{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Field) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/FixedSizeBinary.go b/go/arrow/internal/flatbuf/FixedSizeBinary.go index 4e660d5077f..b90f6ad9690 100644 --- a/go/arrow/internal/flatbuf/FixedSizeBinary.go +++ b/go/arrow/internal/flatbuf/FixedSizeBinary.go @@ -33,6 +33,13 @@ func GetRootAsFixedSizeBinary(buf []byte, offset flatbuffers.UOffsetT) *FixedSiz return x } +func GetSizePrefixedRootAsFixedSizeBinary(buf []byte, offset flatbuffers.UOffsetT) *FixedSizeBinary { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &FixedSizeBinary{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *FixedSizeBinary) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/FixedSizeList.go b/go/arrow/internal/flatbuf/FixedSizeList.go index dabf5cc8581..8e10f5b2b25 100644 --- a/go/arrow/internal/flatbuf/FixedSizeList.go +++ b/go/arrow/internal/flatbuf/FixedSizeList.go @@ -33,6 +33,13 @@ func GetRootAsFixedSizeList(buf []byte, offset flatbuffers.UOffsetT) *FixedSizeL return x } +func GetSizePrefixedRootAsFixedSizeList(buf []byte, offset flatbuffers.UOffsetT) *FixedSizeList { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &FixedSizeList{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *FixedSizeList) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/FloatingPoint.go b/go/arrow/internal/flatbuf/FloatingPoint.go index 241d448dcf9..dfa8a591a9b 100644 --- a/go/arrow/internal/flatbuf/FloatingPoint.go +++ 
b/go/arrow/internal/flatbuf/FloatingPoint.go @@ -33,6 +33,13 @@ func GetRootAsFloatingPoint(buf []byte, offset flatbuffers.UOffsetT) *FloatingPo return x } +func GetSizePrefixedRootAsFloatingPoint(buf []byte, offset flatbuffers.UOffsetT) *FloatingPoint { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &FloatingPoint{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *FloatingPoint) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Footer.go b/go/arrow/internal/flatbuf/Footer.go index 65b0ff09546..193b9e1f9fb 100644 --- a/go/arrow/internal/flatbuf/Footer.go +++ b/go/arrow/internal/flatbuf/Footer.go @@ -36,6 +36,13 @@ func GetRootAsFooter(buf []byte, offset flatbuffers.UOffsetT) *Footer { return x } +func GetSizePrefixedRootAsFooter(buf []byte, offset flatbuffers.UOffsetT) *Footer { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Footer{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Footer) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Int.go b/go/arrow/internal/flatbuf/Int.go index 9f4b1911705..a3323954af7 100644 --- a/go/arrow/internal/flatbuf/Int.go +++ b/go/arrow/internal/flatbuf/Int.go @@ -33,6 +33,13 @@ func GetRootAsInt(buf []byte, offset flatbuffers.UOffsetT) *Int { return x } +func GetSizePrefixedRootAsInt(buf []byte, offset flatbuffers.UOffsetT) *Int { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Int{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Int) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Interval.go b/go/arrow/internal/flatbuf/Interval.go index 12c56d5c210..501eada3e8b 100644 --- a/go/arrow/internal/flatbuf/Interval.go +++ b/go/arrow/internal/flatbuf/Interval.go @@ 
-33,6 +33,13 @@ func GetRootAsInterval(buf []byte, offset flatbuffers.UOffsetT) *Interval { return x } +func GetSizePrefixedRootAsInterval(buf []byte, offset flatbuffers.UOffsetT) *Interval { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Interval{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Interval) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/KeyValue.go b/go/arrow/internal/flatbuf/KeyValue.go index c1b85318ecd..d41988a3301 100644 --- a/go/arrow/internal/flatbuf/KeyValue.go +++ b/go/arrow/internal/flatbuf/KeyValue.go @@ -36,6 +36,13 @@ func GetRootAsKeyValue(buf []byte, offset flatbuffers.UOffsetT) *KeyValue { return x } +func GetSizePrefixedRootAsKeyValue(buf []byte, offset flatbuffers.UOffsetT) *KeyValue { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &KeyValue{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *KeyValue) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/LargeBinary.go b/go/arrow/internal/flatbuf/LargeBinary.go index 2c3befcc16f..6a563f6c0f3 100644 --- a/go/arrow/internal/flatbuf/LargeBinary.go +++ b/go/arrow/internal/flatbuf/LargeBinary.go @@ -35,6 +35,13 @@ func GetRootAsLargeBinary(buf []byte, offset flatbuffers.UOffsetT) *LargeBinary return x } +func GetSizePrefixedRootAsLargeBinary(buf []byte, offset flatbuffers.UOffsetT) *LargeBinary { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &LargeBinary{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *LargeBinary) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/LargeList.go b/go/arrow/internal/flatbuf/LargeList.go index 92f22845874..da6206d1aef 100644 --- a/go/arrow/internal/flatbuf/LargeList.go +++ 
b/go/arrow/internal/flatbuf/LargeList.go @@ -35,6 +35,13 @@ func GetRootAsLargeList(buf []byte, offset flatbuffers.UOffsetT) *LargeList { return x } +func GetSizePrefixedRootAsLargeList(buf []byte, offset flatbuffers.UOffsetT) *LargeList { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &LargeList{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *LargeList) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/LargeUtf8.go b/go/arrow/internal/flatbuf/LargeUtf8.go index e78b33e1100..a61624c1310 100644 --- a/go/arrow/internal/flatbuf/LargeUtf8.go +++ b/go/arrow/internal/flatbuf/LargeUtf8.go @@ -35,6 +35,13 @@ func GetRootAsLargeUtf8(buf []byte, offset flatbuffers.UOffsetT) *LargeUtf8 { return x } +func GetSizePrefixedRootAsLargeUtf8(buf []byte, offset flatbuffers.UOffsetT) *LargeUtf8 { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &LargeUtf8{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *LargeUtf8) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/List.go b/go/arrow/internal/flatbuf/List.go index ba84319d3f6..bcbaefb38a8 100644 --- a/go/arrow/internal/flatbuf/List.go +++ b/go/arrow/internal/flatbuf/List.go @@ -33,6 +33,13 @@ func GetRootAsList(buf []byte, offset flatbuffers.UOffsetT) *List { return x } +func GetSizePrefixedRootAsList(buf []byte, offset flatbuffers.UOffsetT) *List { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &List{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *List) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Map.go b/go/arrow/internal/flatbuf/Map.go index 8802aba1ebd..a2845170598 100644 --- a/go/arrow/internal/flatbuf/Map.go +++ b/go/arrow/internal/flatbuf/Map.go @@ -58,6 
+58,13 @@ func GetRootAsMap(buf []byte, offset flatbuffers.UOffsetT) *Map { return x } +func GetSizePrefixedRootAsMap(buf []byte, offset flatbuffers.UOffsetT) *Map { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Map{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Map) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Message.go b/go/arrow/internal/flatbuf/Message.go index f4b4a0ff80e..13ac1e4a118 100644 --- a/go/arrow/internal/flatbuf/Message.go +++ b/go/arrow/internal/flatbuf/Message.go @@ -33,6 +33,13 @@ func GetRootAsMessage(buf []byte, offset flatbuffers.UOffsetT) *Message { return x } +func GetSizePrefixedRootAsMessage(buf []byte, offset flatbuffers.UOffsetT) *Message { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Message{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Message) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Null.go b/go/arrow/internal/flatbuf/Null.go index 3c3eb4bda36..0979f312e7d 100644 --- a/go/arrow/internal/flatbuf/Null.go +++ b/go/arrow/internal/flatbuf/Null.go @@ -34,6 +34,13 @@ func GetRootAsNull(buf []byte, offset flatbuffers.UOffsetT) *Null { return x } +func GetSizePrefixedRootAsNull(buf []byte, offset flatbuffers.UOffsetT) *Null { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Null{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Null) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/RecordBatch.go b/go/arrow/internal/flatbuf/RecordBatch.go index c50f4a6e868..41051849eff 100644 --- a/go/arrow/internal/flatbuf/RecordBatch.go +++ b/go/arrow/internal/flatbuf/RecordBatch.go @@ -36,6 +36,13 @@ func GetRootAsRecordBatch(buf []byte, offset 
flatbuffers.UOffsetT) *RecordBatch return x } +func GetSizePrefixedRootAsRecordBatch(buf []byte, offset flatbuffers.UOffsetT) *RecordBatch { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &RecordBatch{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *RecordBatch) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/RunEndEncoded.go b/go/arrow/internal/flatbuf/RunEndEncoded.go index fa414c1bf0e..c4d4f657cff 100644 --- a/go/arrow/internal/flatbuf/RunEndEncoded.go +++ b/go/arrow/internal/flatbuf/RunEndEncoded.go @@ -38,6 +38,13 @@ func GetRootAsRunEndEncoded(buf []byte, offset flatbuffers.UOffsetT) *RunEndEnco return x } +func GetSizePrefixedRootAsRunEndEncoded(buf []byte, offset flatbuffers.UOffsetT) *RunEndEncoded { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &RunEndEncoded{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *RunEndEncoded) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Schema.go b/go/arrow/internal/flatbuf/Schema.go index 4ee5ecc9e5e..e289c57eb36 100644 --- a/go/arrow/internal/flatbuf/Schema.go +++ b/go/arrow/internal/flatbuf/Schema.go @@ -35,6 +35,13 @@ func GetRootAsSchema(buf []byte, offset flatbuffers.UOffsetT) *Schema { return x } +func GetSizePrefixedRootAsSchema(buf []byte, offset flatbuffers.UOffsetT) *Schema { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Schema{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Schema) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/SparseMatrixIndexCSX.go b/go/arrow/internal/flatbuf/SparseMatrixIndexCSX.go index c28cc5d082f..2d86589a225 100644 --- a/go/arrow/internal/flatbuf/SparseMatrixIndexCSX.go +++ 
b/go/arrow/internal/flatbuf/SparseMatrixIndexCSX.go @@ -34,6 +34,13 @@ func GetRootAsSparseMatrixIndexCSX(buf []byte, offset flatbuffers.UOffsetT) *Spa return x } +func GetSizePrefixedRootAsSparseMatrixIndexCSX(buf []byte, offset flatbuffers.UOffsetT) *SparseMatrixIndexCSX { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &SparseMatrixIndexCSX{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *SparseMatrixIndexCSX) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/SparseTensor.go b/go/arrow/internal/flatbuf/SparseTensor.go index 6f3f55797d7..d0febc7e51c 100644 --- a/go/arrow/internal/flatbuf/SparseTensor.go +++ b/go/arrow/internal/flatbuf/SparseTensor.go @@ -33,6 +33,13 @@ func GetRootAsSparseTensor(buf []byte, offset flatbuffers.UOffsetT) *SparseTenso return x } +func GetSizePrefixedRootAsSparseTensor(buf []byte, offset flatbuffers.UOffsetT) *SparseTensor { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &SparseTensor{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *SparseTensor) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/SparseTensorIndexCOO.go b/go/arrow/internal/flatbuf/SparseTensorIndexCOO.go index f8eee99fa69..dcb0ee8f5f2 100644 --- a/go/arrow/internal/flatbuf/SparseTensorIndexCOO.go +++ b/go/arrow/internal/flatbuf/SparseTensorIndexCOO.go @@ -65,6 +65,13 @@ func GetRootAsSparseTensorIndexCOO(buf []byte, offset flatbuffers.UOffsetT) *Spa return x } +func GetSizePrefixedRootAsSparseTensorIndexCOO(buf []byte, offset flatbuffers.UOffsetT) *SparseTensorIndexCOO { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &SparseTensorIndexCOO{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *SparseTensorIndexCOO) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = 
buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/SparseTensorIndexCSF.go b/go/arrow/internal/flatbuf/SparseTensorIndexCSF.go index a824c84ebfe..4bf86fe398b 100644 --- a/go/arrow/internal/flatbuf/SparseTensorIndexCSF.go +++ b/go/arrow/internal/flatbuf/SparseTensorIndexCSF.go @@ -34,6 +34,13 @@ func GetRootAsSparseTensorIndexCSF(buf []byte, offset flatbuffers.UOffsetT) *Spa return x } +func GetSizePrefixedRootAsSparseTensorIndexCSF(buf []byte, offset flatbuffers.UOffsetT) *SparseTensorIndexCSF { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &SparseTensorIndexCSF{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *SparseTensorIndexCSF) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Struct_.go b/go/arrow/internal/flatbuf/Struct_.go index 427e7060382..e2fe1fd83fc 100644 --- a/go/arrow/internal/flatbuf/Struct_.go +++ b/go/arrow/internal/flatbuf/Struct_.go @@ -36,6 +36,13 @@ func GetRootAsStruct_(buf []byte, offset flatbuffers.UOffsetT) *Struct_ { return x } +func GetSizePrefixedRootAsStruct_(buf []byte, offset flatbuffers.UOffsetT) *Struct_ { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Struct_{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Struct_) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Tensor.go b/go/arrow/internal/flatbuf/Tensor.go index 39d70e351e3..d36995a5f9c 100644 --- a/go/arrow/internal/flatbuf/Tensor.go +++ b/go/arrow/internal/flatbuf/Tensor.go @@ -33,6 +33,13 @@ func GetRootAsTensor(buf []byte, offset flatbuffers.UOffsetT) *Tensor { return x } +func GetSizePrefixedRootAsTensor(buf []byte, offset flatbuffers.UOffsetT) *Tensor { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Tensor{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv 
*Tensor) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/TensorDim.go b/go/arrow/internal/flatbuf/TensorDim.go index 14b82120887..5e1ac01e1ce 100644 --- a/go/arrow/internal/flatbuf/TensorDim.go +++ b/go/arrow/internal/flatbuf/TensorDim.go @@ -36,6 +36,13 @@ func GetRootAsTensorDim(buf []byte, offset flatbuffers.UOffsetT) *TensorDim { return x } +func GetSizePrefixedRootAsTensorDim(buf []byte, offset flatbuffers.UOffsetT) *TensorDim { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &TensorDim{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *TensorDim) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Time.go b/go/arrow/internal/flatbuf/Time.go index 2fb6e4c110e..4c7c221dfd8 100644 --- a/go/arrow/internal/flatbuf/Time.go +++ b/go/arrow/internal/flatbuf/Time.go @@ -47,6 +47,13 @@ func GetRootAsTime(buf []byte, offset flatbuffers.UOffsetT) *Time { return x } +func GetSizePrefixedRootAsTime(buf []byte, offset flatbuffers.UOffsetT) *Time { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Time{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Time) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Timestamp.go b/go/arrow/internal/flatbuf/Timestamp.go index f53211455c0..9305e1f9681 100644 --- a/go/arrow/internal/flatbuf/Timestamp.go +++ b/go/arrow/internal/flatbuf/Timestamp.go @@ -138,6 +138,13 @@ func GetRootAsTimestamp(buf []byte, offset flatbuffers.UOffsetT) *Timestamp { return x } +func GetSizePrefixedRootAsTimestamp(buf []byte, offset flatbuffers.UOffsetT) *Timestamp { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Timestamp{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Timestamp) Init(buf []byte, 
i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Union.go b/go/arrow/internal/flatbuf/Union.go index e34121d4757..e17c95516ea 100644 --- a/go/arrow/internal/flatbuf/Union.go +++ b/go/arrow/internal/flatbuf/Union.go @@ -37,6 +37,13 @@ func GetRootAsUnion(buf []byte, offset flatbuffers.UOffsetT) *Union { return x } +func GetSizePrefixedRootAsUnion(buf []byte, offset flatbuffers.UOffsetT) *Union { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Union{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Union) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/internal/flatbuf/Utf8.go b/go/arrow/internal/flatbuf/Utf8.go index 4ff365a3750..f62b5ad59b0 100644 --- a/go/arrow/internal/flatbuf/Utf8.go +++ b/go/arrow/internal/flatbuf/Utf8.go @@ -34,6 +34,13 @@ func GetRootAsUtf8(buf []byte, offset flatbuffers.UOffsetT) *Utf8 { return x } +func GetSizePrefixedRootAsUtf8(buf []byte, offset flatbuffers.UOffsetT) *Utf8 { + n := flatbuffers.GetUOffsetT(buf[offset+flatbuffers.SizeUint32:]) + x := &Utf8{} + x.Init(buf, n+offset+flatbuffers.SizeUint32) + return x +} + func (rcv *Utf8) Init(buf []byte, i flatbuffers.UOffsetT) { rcv._tab.Bytes = buf rcv._tab.Pos = i diff --git a/go/arrow/type_traits_string_view.go b/go/arrow/type_traits_string_view.go new file mode 100644 index 00000000000..8df79fa5334 --- /dev/null +++ b/go/arrow/type_traits_string_view.go @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v13/arrow/endian" +) + +var StringHeaderTraits stringHeaderTraits + +const ( + StringHeaderSizeBytes = int(unsafe.Sizeof(StringHeader{})) +) + +type stringHeaderTraits struct{} + +func (stringHeaderTraits) BytesRequired(n int) int { return StringHeaderSizeBytes * n } + +func (stringHeaderTraits) PutValue(b []byte, v StringHeader) { + endian.Native.PutUint32(b, v.size) + copy(b[4:], v.data[:]) +} + +func (stringHeaderTraits) CastFromBytes(b []byte) (res []StringHeader) { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len / StringHeaderSizeBytes + s.Cap = h.Cap / StringHeaderSizeBytes + + return +} + +func (stringHeaderTraits) CastToBytes(b []StringHeader) (res []byte) { + h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) + + s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) + s.Data = h.Data + s.Len = h.Len * StringHeaderSizeBytes + s.Cap = h.Cap * StringHeaderSizeBytes + + return +} + +func (stringHeaderTraits) Copy(dst, src []StringHeader) { copy(dst, src) } From a0eb73684ff98ba9c07ac1b3221078134df6d332 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 22 May 2023 16:40:47 -0400 Subject: [PATCH 02/37] initial impl of binary/string views --- go/arrow/array/array.go | 3 +- go/arrow/array/binarybuilder.go | 288 +++++++++++++++++++++++++++++++ go/arrow/array/bufferbuilder.go | 114 ++++++++++++ go/arrow/array/builder.go | 4 + go/arrow/array/string.go | 98 ++++++++++- 
go/arrow/array/string_test.go | 173 +++++++++++++++++++ go/arrow/datatype_binary.go | 12 +- go/arrow/datatype_binary_test.go | 30 ++++ 8 files changed, 709 insertions(+), 13 deletions(-) diff --git a/go/arrow/array/array.go b/go/arrow/array/array.go index 1ee04c7aa2b..68a981aaf83 100644 --- a/go/arrow/array/array.go +++ b/go/arrow/array/array.go @@ -178,7 +178,8 @@ func init() { arrow.RUN_END_ENCODED: func(data arrow.ArrayData) arrow.Array { return NewRunEndEncodedData(data) }, arrow.LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewListViewData(data) }, arrow.LARGE_LIST_VIEW: func(data arrow.ArrayData) arrow.Array { return NewLargeListViewData(data) }, - + arrow.BINARY_VIEW: func(data arrow.ArrayData) arrow.Array { return NewBinaryViewData(data) }, + arrow.STRING_VIEW: func(data arrow.ArrayData) arrow.Array { return NewStringViewData(data) }, // invalid data types to fill out array to size 2^6 - 1 63: invalidDataType, } diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 3cb709b45b7..276fe55628f 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -23,6 +23,7 @@ import ( "math" "reflect" "sync/atomic" + "unsafe" "github.com/apache/arrow/go/v14/arrow" "github.com/apache/arrow/go/v14/arrow/internal/debug" @@ -370,6 +371,293 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +const ( + dfltBlockSize = 1 << 20 // 1 MB + viewValueSizeLimit = math.MaxUint32 +) + +type BinaryViewBuilder struct { + builder + dtype arrow.BinaryDataType + + data *memory.Buffer + rawData []arrow.StringHeader + + blockBuilder multiBufferBuilder +} + +func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder { + return &BinaryViewBuilder{ + dtype: arrow.BinaryTypes.BinaryView, + builder: builder{ + refCount: 1, + mem: mem, + }, + blockBuilder: multiBufferBuilder{ + refCount: 1, + blockSize: dfltBlockSize, + mem: mem, + }, + } +} + +func (b *BinaryViewBuilder) Type() 
arrow.DataType { return b.dtype } + +func (b *BinaryViewBuilder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + } + } +} + +func (b *BinaryViewBuilder) init(capacity int) { + b.builder.init(capacity) + b.data = memory.NewResizableBuffer(b.mem) + bytesN := arrow.StringHeaderTraits.BytesRequired(capacity) + b.data.Resize(bytesN) + b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) +} + +func (b *BinaryViewBuilder) Resize(n int) { + nbuild := n + if n < minBuilderCapacity { + n = minBuilderCapacity + } + + if b.capacity == 0 { + b.init(n) + } else { + b.builder.resize(nbuild, b.init) + b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n)) + b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) + } +} + +func (b *BinaryViewBuilder) ReserveData(length int) { + if length > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", + arrow.ErrInvalid)) + } + b.blockBuilder.Reserve(int(length)) +} + +func (b *BinaryViewBuilder) Reserve(n int) { + b.builder.reserve(n, b.Resize) +} + +func (b *BinaryViewBuilder) Append(v []byte) { + if len(v) > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid)) + } + + if !arrow.IsStringHeaderInline(len(v)) { + b.ReserveData(len(v)) + } + + b.Reserve(1) + b.UnsafeAppend(v) +} + +func (b *BinaryViewBuilder) AppendString(v string) { + // create a []byte without copying the bytes + // in go1.20 this would be unsafe.StringData + val := *(*[]byte)(unsafe.Pointer(&struct { + string + int + }{v, len(v)})) + b.Append(val) +} + +func (b *BinaryViewBuilder) AppendNull() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(false) +} + +func (b 
*BinaryViewBuilder) AppendEmptyValue() { + b.Reserve(1) + b.UnsafeAppendBoolToBitmap(true) +} + +func (b *BinaryViewBuilder) UnsafeAppend(v []byte) { + hdr := &b.rawData[b.length] + hdr.SetBytes(v) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppend(hdr, v) + } + b.UnsafeAppendBoolToBitmap(true) +} + +func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + if len(v) == 0 { + return + } + + b.Reserve(len(v)) + outOfLineTotal := 0 + for _, vv := range v { + if !arrow.IsStringHeaderInline(len(vv)) { + outOfLineTotal += len(vv) + } + } + + b.ReserveData(outOfLineTotal) + for i, vv := range v { + hdr := &b.rawData[b.length+i] + hdr.SetBytes(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppend(hdr, vv) + } + } + + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) { + if len(v) != len(valid) && len(valid) != 0 { + panic("len(v) != len(valid) && len(valid) != 0") + } + + if len(v) == 0 { + return + } + + b.Reserve(len(v)) + outOfLineTotal := 0 + for _, vv := range v { + if !arrow.IsStringHeaderInline(len(vv)) { + outOfLineTotal += len(vv) + } + } + + b.ReserveData(outOfLineTotal) + for i, vv := range v { + hdr := &b.rawData[b.length+i] + hdr.SetString(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppendString(hdr, vv) + } + } + + b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) +} + +func (b *BinaryViewBuilder) AppendValueFromString(s string) error { + if s == NullValueStr { + b.AppendNull() + return nil + } + + if b.dtype.IsUtf8() { + b.Append([]byte(s)) + return nil + } + + decodedVal, err := base64.StdEncoding.DecodeString(s) + if err != nil { + return fmt.Errorf("could not decode base64 string: %w", err) + } + b.Append(decodedVal) + return nil +} + +func (b *BinaryViewBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + 
return err + } + + switch v := t.(type) { + case string: + data, err := base64.StdEncoding.DecodeString(v) + if err != nil { + return err + } + b.Append(data) + case []byte: + b.Append(v) + case nil: + b.AppendNull() + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + } + } + return nil +} + +func (b *BinaryViewBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("binary view builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +func (b *BinaryViewBuilder) newData() (data *Data) { + bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length) + if bytesRequired > 0 && bytesRequired < b.data.Len() { + // trim buffers + b.data.Resize(bytesRequired) + } + + dataBuffers := b.blockBuilder.Finish() + data = NewData(b.dtype, b.length, append([]*memory.Buffer{ + b.nullBitmap, b.data}, dataBuffers...), nil, b.nulls, 0) + b.reset() + + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil + for _, buf := range dataBuffers { + buf.Release() + } + } + return +} + +func (b *BinaryViewBuilder) NewBinaryViewArray() (a *BinaryView) { + data := b.newData() + a = NewBinaryViewData(data) + data.Release() + return +} + +func (b *BinaryViewBuilder) NewArray() arrow.Array { + return b.NewBinaryViewArray() +} + var ( _ Builder = (*BinaryBuilder)(nil) + _ Builder = (*BinaryViewBuilder)(nil) ) diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index e023b0d9074..61731dc3a5e 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -18,7 +18,9 @@ package 
array import ( "sync/atomic" + "unsafe" + "github.com/apache/arrow/go/v14/arrow" "github.com/apache/arrow/go/v14/arrow/bitutil" "github.com/apache/arrow/go/v14/arrow/internal/debug" "github.com/apache/arrow/go/v14/arrow/memory" @@ -151,3 +153,115 @@ func (b *bufferBuilder) unsafeAppend(data []byte) { copy(b.bytes[b.length:], data) b.length += len(data) } + +type multiBufferBuilder struct { + refCount int64 + blockSize int + + mem memory.Allocator + blocks []*memory.Buffer + currentOutBuffer int +} + +// Retain increases the reference count by 1. +// Retain may be called simultaneously from multiple goroutines. +func (b *multiBufferBuilder) Retain() { + atomic.AddInt64(&b.refCount, 1) +} + +// Release decreases the reference count by 1. +// When the reference count goes to zero, the memory is freed. +// Release may be called simultaneously from multiple goroutines. +func (b *multiBufferBuilder) Release() { + debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") + + if atomic.AddInt64(&b.refCount, -1) == 0 { + for i, buf := range b.blocks { + buf.Release() + b.blocks[i] = nil + } + } +} + +func (b *multiBufferBuilder) Reserve(nbytes int) { + if len(b.blocks) == 0 { + out := memory.NewResizableBuffer(b.mem) + if nbytes < b.blockSize { + nbytes = b.blockSize + } + out.Reserve(nbytes) + b.currentOutBuffer = 0 + b.blocks = []*memory.Buffer{out} + return + } + + curBuf := b.blocks[b.currentOutBuffer] + remain := curBuf.Cap() - curBuf.Len() + if nbytes <= remain { + return + } + + // search for underfull block that has enough bytes + for i, block := range b.blocks { + remaining := block.Cap() - block.Len() + if nbytes <= remaining { + b.currentOutBuffer = i + return + } + } + + // current buffer doesn't have enough space, no underfull buffers + // make new buffer and set that as our current. 
+ newBuf := memory.NewResizableBuffer(b.mem) + if nbytes < b.blockSize { + nbytes = b.blockSize + } + + newBuf.Reserve(nbytes) + b.currentOutBuffer = len(b.blocks) + b.blocks = append(b.blocks, newBuf) +} + +func (b *multiBufferBuilder) RemainingBytes() int { + if len(b.blocks) == 0 { + return 0 + } + + buf := b.blocks[b.currentOutBuffer] + return buf.Cap() - buf.Len() +} + +func (b *multiBufferBuilder) Reset() { + b.currentOutBuffer = 0 + for i, block := range b.blocks { + block.Release() + b.blocks[i] = nil + } + b.blocks = nil +} + +func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.StringHeader, val []byte) { + buf := b.blocks[b.currentOutBuffer] + idx, offset := b.currentOutBuffer, buf.Len() + hdr.SetIndexOffset(uint32(idx), uint32(offset)) + + n := copy(buf.Buf()[offset:], val) + buf.ResizeNoShrink(offset + n) +} + +func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.StringHeader, val string) { + // create a byte slice with zero-copies + // in go1.20 this would be equivalent to unsafe.StringData + v := *(*[]byte)(unsafe.Pointer(&struct { + string + int + }{val, len(val)})) + b.UnsafeAppend(hdr, v) +} + +func (b *multiBufferBuilder) Finish() (out []*memory.Buffer) { + b.currentOutBuffer = 0 + out = b.blocks + b.blocks = nil + return +} diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index 2f15ac965e0..05da0bd2958 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -364,6 +364,10 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { case arrow.RUN_END_ENCODED: typ := dtype.(*arrow.RunEndEncodedType) return NewRunEndEncodedBuilder(mem, typ.RunEnds(), typ.Encoded()) + case arrow.BINARY_VIEW: + return NewBinaryViewBuilder(mem) + case arrow.STRING_VIEW: + return NewStringViewBuilder(mem) } panic(fmt.Errorf("arrow/array: unsupported builder for %T", dtype)) } diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 4b7f5293183..3339a67f4f8 100644 --- a/go/arrow/array/string.go +++ 
b/go/arrow/array/string.go @@ -28,6 +28,11 @@ import ( "github.com/apache/arrow/go/v14/internal/json" ) +type StringLike interface { + arrow.Array + Value(int) string +} + // String represents an immutable sequence of variable-length UTF-8 strings. type String struct { array @@ -323,6 +328,11 @@ func NewStringViewData(data arrow.ArrayData) *StringView { return a } +// Reset resets the String with a different set of Data. +func (a *StringView) Reset(data arrow.ArrayData) { + a.setData(data.(*Data)) +} + func (a *StringView) setData(data *Data) { if len(data.buffers) < 2 { panic("len(data.buffers) < 2") @@ -441,10 +451,6 @@ func (b *StringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } -// func (b *StringBuilder) UnsafeAppend(v string) { -// b.BinaryBuilder.UnsafeAppend([]byte(v)) -// } - // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. func (b *StringBuilder) NewArray() arrow.Array { @@ -538,10 +544,6 @@ func (b *LargeStringBuilder) Value(i int) string { return string(b.BinaryBuilder.Value(i)) } -// func (b *LargeStringBuilder) UnsafeAppend(v string) { -// b.BinaryBuilder.UnsafeAppend([]byte(v)) -// } - // NewArray creates a String array from the memory buffers used by the builder and resets the StringBuilder // so it can be used to build a new array. 
func (b *LargeStringBuilder) NewArray() arrow.Array { @@ -601,9 +603,87 @@ func (b *LargeStringBuilder) UnmarshalJSON(data []byte) error { return b.Unmarshal(dec) } +type StringViewBuilder struct { + *BinaryViewBuilder +} + +func NewStringViewBuilder(mem memory.Allocator) *StringViewBuilder { + bldr := &StringViewBuilder{ + BinaryViewBuilder: NewBinaryViewBuilder(mem), + } + bldr.dtype = arrow.BinaryTypes.StringView + return bldr +} + +func (b *StringViewBuilder) Append(v string) { + b.BinaryViewBuilder.AppendString(v) +} + +func (b *StringViewBuilder) AppendValues(v []string, valid []bool) { + b.BinaryViewBuilder.AppendStringValues(v, valid) +} + +func (b *StringViewBuilder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case string: + b.Append(v) + case []byte: + b.BinaryViewBuilder.Append(v) + case nil: + b.AppendNull() + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + } + } + return nil +} + +func (b *StringViewBuilder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +func (b *StringViewBuilder) UnmarshalJSON(data []byte) error { + dec := json.NewDecoder(bytes.NewReader(data)) + t, err := dec.Token() + if err != nil { + return err + } + + if delim, ok := t.(json.Delim); !ok || delim != '[' { + return fmt.Errorf("binary view builder must unpack from json array, found %s", delim) + } + + return b.Unmarshal(dec) +} + +func (b *StringViewBuilder) NewArray() arrow.Array { + return b.NewStringViewArray() +} + +func (b *StringViewBuilder) NewStringViewArray() (a *StringView) { + data := b.newData() + a = NewStringViewData(data) + data.Release() + return +} + type StringLikeBuilder interface { Builder Append(string) + AppendValues([]string, []bool) UnsafeAppend([]byte) ReserveData(int) } @@ -614,6 +694,8 @@ var ( _ 
arrow.Array = (*StringView)(nil) _ Builder = (*StringBuilder)(nil) _ Builder = (*LargeStringBuilder)(nil) + _ Builder = (*StringViewBuilder)(nil) _ StringLikeBuilder = (*StringBuilder)(nil) _ StringLikeBuilder = (*LargeStringBuilder)(nil) + _ StringLikeBuilder = (*StringViewBuilder)(nil) ) diff --git a/go/arrow/array/string_test.go b/go/arrow/array/string_test.go index fbc106b0983..836fb9adb1f 100644 --- a/go/arrow/array/string_test.go +++ b/go/arrow/array/string_test.go @@ -619,3 +619,176 @@ func TestStringValueLen(t *testing.T) { assert.Equal(t, len(v), slice.ValueLen(i)) } } +func TestStringViewArray(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + var ( + // only the last string is long enough to not get inlined + want = []string{"hello", "世界", "", "say goodbye daffy"} + valids = []bool{true, true, false, true} + ) + + sb := array.NewStringViewBuilder(mem) + defer sb.Release() + + sb.Retain() + sb.Release() + + assert.NoError(t, sb.AppendValueFromString(want[0])) + sb.AppendValues(want[1:2], nil) + + sb.AppendNull() + sb.Append(want[3]) + + if got, want := sb.Len(), len(want); got != want { + t.Fatalf("invalid len: got=%d, want=%d", got, want) + } + + if got, want := sb.NullN(), 1; got != want { + t.Fatalf("invalid nulls: got=%d, want=%d", got, want) + } + + arr := sb.NewStringViewArray() + defer arr.Release() + + arr.Retain() + arr.Release() + + assert.Equal(t, "hello", arr.ValueStr(0)) + + if got, want := arr.Len(), len(want); got != want { + t.Fatalf("invalid len: got=%d, want=%d", got, want) + } + + if got, want := arr.NullN(), 1; got != want { + t.Fatalf("invalid nulls: got=%d, want=%d", got, want) + } + + for i := range want { + if arr.IsNull(i) != !valids[i] { + t.Fatalf("arr[%d]-validity: got=%v want=%v", i, !arr.IsNull(i), valids[i]) + } + switch { + case arr.IsNull(i): + default: + got := arr.Value(i) + if got != want[i] { + t.Fatalf("arr[%d]: got=%q, want=%q", i, got, want[i]) + } + } + } + 
+ sub := array.MakeFromData(arr.Data()) + defer sub.Release() + + if sub.DataType().ID() != arrow.STRING_VIEW { + t.Fatalf("invalid type: got=%q, want=string view", sub.DataType().Name()) + } + + if _, ok := sub.(*array.StringView); !ok { + t.Fatalf("could not type-assert to array.String") + } + + if got, want := arr.String(), `["hello" "世界" (null) "say goodbye daffy"]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + + // only the last string gets stuck into a buffer the rest are inlined + // in the headers. + if !bytes.Equal([]byte(`say goodbye daffy`), arr.Data().Buffers()[2].Bytes()) { + t.Fatalf("got=%q, want=%q", string(arr.Data().Buffers()[2].Bytes()), `say goodbye daffy`) + } + + // check the prefix for the non-inlined value + if [4]byte{'s', 'a', 'y', ' '} != arr.ValueHeader(3).Prefix() { + t.Fatalf("got=%q, want=%q", arr.ValueHeader(3).Prefix(), `say `) + } + + slice := array.NewSliceData(arr.Data(), 2, 4) + defer slice.Release() + + sub1 := array.MakeFromData(slice) + defer sub1.Release() + + v, ok := sub1.(*array.StringView) + if !ok { + t.Fatalf("could not type-assert to array.StringView") + } + + if got, want := v.String(), `[(null) "say goodbye daffy"]`; got != want { + t.Fatalf("got=%q, want=%q", got, want) + } + + if !bytes.Equal([]byte(`say goodbye daffy`), v.Data().Buffers()[2].Bytes()) { + t.Fatalf("got=%q, want=%q", string(v.Data().Buffers()[2].Bytes()), `say goodbye daffy`) + } + + // check the prefix for the non-inlined value + if [4]byte{'s', 'a', 'y', ' '} != v.ValueHeader(1).Prefix() { + t.Fatalf("got=%q, want=%q", v.ValueHeader(1).Prefix(), `say `) + } +} + +func TestStringViewBuilder_Empty(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + want := []string{"hello", "世界", "", "say goodbye daffy"} + + ab := array.NewStringViewBuilder(mem) + defer ab.Release() + + stringValues := func(a *array.StringView) []string { + vs := make([]string, a.Len()) + for i := range vs { 
+ vs[i] = a.Value(i) + } + return vs + } + + ab.AppendValues([]string{}, nil) + a := ab.NewStringViewArray() + assert.Zero(t, a.Len()) + a.Release() + + ab.AppendValues(nil, nil) + a = ab.NewStringViewArray() + assert.Zero(t, a.Len()) + a.Release() + + ab.AppendValues([]string{}, nil) + ab.AppendValues(want, nil) + a = ab.NewStringViewArray() + assert.Equal(t, want, stringValues(a)) + a.Release() + + ab.AppendValues(want, nil) + ab.AppendValues([]string{}, nil) + a = ab.NewStringViewArray() + assert.Equal(t, want, stringValues(a)) + a.Release() +} + +// TestStringReset tests the Reset() method on the String type by creating two different Strings and then +// reseting the contents of string2 with the values from string1. +func TestStringViewReset(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + sb1 := array.NewStringViewBuilder(mem) + sb2 := array.NewStringViewBuilder(mem) + defer sb1.Release() + defer sb2.Release() + + sb1.Append("string1") + sb1.AppendNull() + + var ( + string1 = sb1.NewStringViewArray() + string2 = sb2.NewStringViewArray() + + string1Data = string1.Data() + ) + string2.Reset(string1Data) + + assert.Equal(t, "string1", string2.Value(0)) +} diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index 479d6b3b147..c30a1768072 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -98,6 +98,10 @@ const ( stringHeaderInlineSize = 12 ) +func IsStringHeaderInline(length int) bool { + return length < stringHeaderInlineSize +} + // StringHeader is a variable length string (utf8) or byte slice with // a 4 byte prefix and inline optimization for small values (12 bytes // or fewer). 
This is similar to Go's standard string but limited by @@ -142,11 +146,11 @@ func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte { } func (sh *StringHeader) BufferIndex() uint32 { - return endian.Native.Uint32(sh.data[:]) + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:]) } func (sh *StringHeader) BufferOffset() uint32 { - return endian.Native.Uint32(sh.data[4:]) + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:]) } func (sh *StringHeader) InlineBytes() (data []byte) { @@ -179,8 +183,8 @@ func (sh *StringHeader) SetString(data string) int { } func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) { - endian.Native.PutUint32(sh.data[:], bufferIndex) - endian.Native.PutUint32(sh.data[4:], offset) + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex) + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset) } func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool { diff --git a/go/arrow/datatype_binary_test.go b/go/arrow/datatype_binary_test.go index c47df3da1db..9d2de48c077 100644 --- a/go/arrow/datatype_binary_test.go +++ b/go/arrow/datatype_binary_test.go @@ -81,3 +81,33 @@ func TestLargeStringType(t *testing.T) { t.Fatalf("invalid string type stringer. got=%v, want=%v", got, want) } } + +func TestBinaryViewType(t *testing.T) { + var nt *arrow.BinaryViewType + if got, want := nt.ID(), arrow.BINARY_VIEW; got != want { + t.Fatalf("invalid string type id. got=%v, want=%v", got, want) + } + + if got, want := nt.Name(), "binary_view"; got != want { + t.Fatalf("invalid string type name. got=%v, want=%v", got, want) + } + + if got, want := nt.String(), "binary_view"; got != want { + t.Fatalf("invalid string type stringer. got=%v, want=%v", got, want) + } +} + +func TestStringViewType(t *testing.T) { + var nt *arrow.StringViewType + if got, want := nt.ID(), arrow.STRING_VIEW; got != want { + t.Fatalf("invalid string type id. 
got=%v, want=%v", got, want) + } + + if got, want := nt.Name(), "string_view"; got != want { + t.Fatalf("invalid string type name. got=%v, want=%v", got, want) + } + + if got, want := nt.String(), "string_view"; got != want { + t.Fatalf("invalid string type stringer. got=%v, want=%v", got, want) + } +} From daf179692d514fae43690fad5ef20e97d1f2dc2a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Tue, 23 May 2023 11:38:48 -0400 Subject: [PATCH 03/37] implement concat --- go/arrow/array/concat.go | 30 +++++++++++++++++++++++++++++- go/arrow/datatype.go | 5 +++++ go/arrow/datatype_binary.go | 2 ++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 9d815023c4b..28ff9ef83a6 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -600,6 +600,35 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, } case arrow.FixedWidthDataType: out.buffers[1] = concatBuffers(gatherBuffersFixedWidthType(data, 1, dt), mem) + case arrow.BinaryViewDataType: + out.buffers = out.buffers[:2] + for _, d := range data { + for _, buf := range d.Buffers()[2:] { + buf.Retain() + out.buffers = append(out.buffers, buf) + } + } + + out.buffers[1] = concatBuffers(gatherFixedBuffers(data, 1, arrow.StringHeaderSizeBytes), mem) + + var ( + s = arrow.StringHeaderTraits.CastFromBytes(out.buffers[1].Bytes()) + i = data[0].Len() + precedingBufsCount int + ) + + for idx := 1; idx < len(data); idx++ { + precedingBufsCount += len(data[idx-1].Buffers()) - 2 + + for end := i + data[idx].Len(); i < end; i++ { + if s[i].IsInline() { + continue + } + + bufIndex := s[i].BufferIndex() + uint32(precedingBufsCount) + s[i].SetIndexOffset(bufIndex, s[i].BufferOffset()) + } + } case arrow.BinaryDataType: offsetWidth := dt.Layout().Buffers[1].ByteWidth offsetBuffer, valueRanges, err := concatOffsets(gatherFixedBuffers(data, 1, offsetWidth), offsetWidth, mem) @@ -739,7 +768,6 @@ func concat(data []arrow.ArrayData, 
mem memory.Allocator) (arr arrow.ArrayData, out.childData[0].Release() return nil, err } - default: return nil, fmt.Errorf("concatenate not implemented for type %s", dt) } diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 2cd18fd53bc..4c57e9fe595 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -210,6 +210,11 @@ type BinaryDataType interface { binary() } +type BinaryViewDataType interface { + BinaryDataType + view() +} + type OffsetsDataType interface { DataType OffsetTypeTraits() OffsetTraits diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index c30a1768072..2dce18e9795 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -218,6 +218,7 @@ func (*BinaryViewType) Name() string { return "binary_view" } func (*BinaryViewType) String() string { return "binary_view" } func (*BinaryViewType) IsUtf8() bool { return false } func (*BinaryViewType) binary() {} +func (*BinaryViewType) view() {} func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) } func (*BinaryViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() @@ -232,6 +233,7 @@ func (*StringViewType) Name() string { return "string_view" } func (*StringViewType) String() string { return "string_view" } func (*StringViewType) IsUtf8() bool { return true } func (*StringViewType) binary() {} +func (*StringViewType) view() {} func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) } func (*StringViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() From 0491fd283b7f93fe64a30afcc72f339925a119b0 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 24 May 2023 11:04:41 -0400 Subject: [PATCH 04/37] initial IPC stuff --- format/Schema.fbs | 13 ++++++- go/arrow/array/concat_test.go | 3 ++ go/arrow/internal/flatbuf/MetadataVersion.go | 14 +++++++- .../internal/testing/gen/random_array_gen.go | 34 ++++++++++++++++++ go/arrow/ipc/message.go | 1 + go/arrow/ipc/metadata.go | 35 
+++++++++++++++---- go/arrow/ipc/writer.go | 9 ++--- 7 files changed, 97 insertions(+), 12 deletions(-) diff --git a/format/Schema.fbs b/format/Schema.fbs index 6adbcb115cd..63c87722549 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -40,7 +40,7 @@ enum MetadataVersion:short { /// >= 0.8.0 (December 2017). Non-backwards compatible with V3. V4, - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// @@ -48,6 +48,17 @@ enum MetadataVersion:short { /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. V5, + + /// >= 13.0.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 + /// metadata and IPC messages). Implementations are recommended to provide a + /// V5 compatibility mode with V6 format changes disabled. + /// + /// Incompatible changes between V5 and V6: + /// - variadicCounts field added to RecordBatch table in order to support + /// the variable number of buffers possible with Utf8View and BinaryView + /// columns. Those types can't be safely read by a = 0.8.0 (December 2017). Non-backwards compatible with V3. MetadataVersionV4 MetadataVersion = 3 - /// >= 1.0.0 (July 2020. Backwards compatible with V4 (V5 readers can read V4 + /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 /// metadata and IPC messages). Implementations are recommended to provide a /// V4 compatibility mode with V5 format changes disabled. /// @@ -39,6 +39,16 @@ const ( /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. MetadataVersionV5 MetadataVersion = 4 + /// >= 13.0.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 + /// metadata and IPC messages). 
Implementations are recommended to provide a + /// V5 compatibility mode with V6 format changes disabled. + /// + /// Incompatible changes between V5 and V6: + /// - variadicCounts field added to RecordBatch table in order to support + /// the variable number of buffers possible with Utf8View and BinaryView + /// columns. Those types can't be safely read by a = Arrow-0.8.0 MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow-1.0.0, backward compatible with v4 + MetadataV6 = MetadataVersion(flatbuf.MetadataVersionV6) // version for >= Arrow-13.0.0, backwards compatible with v5 ) func (m MetadataVersion) String() string { diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index 9bab47d6fa0..84a8f74a2d9 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -35,7 +35,7 @@ import ( var Magic = []byte("ARROW1") const ( - currentMetadataVersion = MetadataV5 + currentMetadataVersion = MetadataV6 minMetadataVersion = MetadataV4 // constants for the extension type metadata keys for the type name and @@ -323,6 +323,16 @@ func (fv *fieldVisitor) visit(field arrow.Field) { flatbuf.LargeUtf8Start(fv.b) fv.offset = flatbuf.LargeUtf8End(fv.b) + case *arrow.BinaryViewType: + fv.dtype = flatbuf.TypeBinaryView + flatbuf.BinaryViewStart(fv.b) + fv.offset = flatbuf.BinaryViewEnd(fv.b) + + case *arrow.StringViewType: + fv.dtype = flatbuf.TypeUtf8View + flatbuf.Utf8ViewStart(fv.b) + fv.offset = flatbuf.Utf8ViewEnd(fv.b) + case *arrow.Date32Type: fv.dtype = flatbuf.TypeDate flatbuf.DateStart(fv.b) @@ -713,6 +723,12 @@ func concreteTypeFromFB(typ flatbuf.Type, data flatbuffers.Table, children []arr case flatbuf.TypeLargeUtf8: return arrow.BinaryTypes.LargeString, nil + case flatbuf.TypeUtf8View: + return arrow.BinaryTypes.StringView, nil + + case flatbuf.TypeBinaryView: + return arrow.BinaryTypes.BinaryView, nil + case flatbuf.TypeBool: return arrow.FixedWidthTypes.Boolean, nil @@ -1168,15 +1184,15 @@ func writeFileFooter(schema 
*arrow.Schema, dicts, recs []fileBlock, w io.Writer) return err } -func writeRecordMessage(mem memory.Allocator, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) *memory.Buffer { +func writeRecordMessage(mem memory.Allocator, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) *memory.Buffer { b := flatbuffers.NewBuilder(0) - recFB := recordToFB(b, size, bodyLength, fields, meta, codec) + recFB := recordToFB(b, size, bodyLength, fields, meta, codec, variadicCounts) return writeMessageFB(b, mem, flatbuf.MessageHeaderRecordBatch, recFB, bodyLength) } -func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) *memory.Buffer { +func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) *memory.Buffer { b := flatbuffers.NewBuilder(0) - recFB := recordToFB(b, size, bodyLength, fields, meta, codec) + recFB := recordToFB(b, size, bodyLength, fields, meta, codec, variadicCounts) flatbuf.DictionaryBatchStart(b) flatbuf.DictionaryBatchAddId(b, id) @@ -1186,7 +1202,7 @@ func writeDictionaryMessage(mem memory.Allocator, id int64, isDelta bool, size, return writeMessageFB(b, mem, flatbuf.MessageHeaderDictionaryBatch, dictFB, bodyLength) } -func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType) flatbuffers.UOffsetT { +func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMetadata, meta []bufferMetadata, codec flatbuf.CompressionType, variadicCounts []int64) flatbuffers.UOffsetT { fieldsFB := writeFieldNodes(b, fields, flatbuf.RecordBatchStartNodesVector) metaFB := writeBuffers(b, meta, 
flatbuf.RecordBatchStartBuffersVector) var bodyCompressFB flatbuffers.UOffsetT @@ -1194,10 +1210,17 @@ func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMe bodyCompressFB = writeBodyCompression(b, codec) } + flatbuf.RecordBatchStartVariadicCountsVector(b, len(variadicCounts)) + for i := len(variadicCounts) - 1; i >= 0; i-- { + b.PrependInt64(variadicCounts[i]) + } + vcFB := b.EndVector(len(variadicCounts)) + flatbuf.RecordBatchStart(b) flatbuf.RecordBatchAddLength(b, size) flatbuf.RecordBatchAddNodes(b, fieldsFB) flatbuf.RecordBatchAddBuffers(b, metaFB) + flatbuf.RecordBatchAddVariadicCounts(b, vcFB) if codec != -1 { flatbuf.RecordBatchAddCompression(b, bodyCompressFB) } diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index a97f47ef4aa..f173a386bd0 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -277,7 +277,7 @@ type dictEncoder struct { } func (d *dictEncoder) encodeMetadata(p *Payload, isDelta bool, id, nrows int64) error { - p.meta = writeDictionaryMessage(d.mem, id, isDelta, nrows, p.size, d.fields, d.meta, d.codec) + p.meta = writeDictionaryMessage(d.mem, id, isDelta, nrows, p.size, d.fields, d.meta, d.codec, d.variadicCounts) return nil } @@ -300,8 +300,9 @@ func (d *dictEncoder) Encode(p *Payload, id int64, isDelta bool, dict arrow.Arra type recordEncoder struct { mem memory.Allocator - fields []fieldMetadata - meta []bufferMetadata + fields []fieldMetadata + meta []bufferMetadata + variadicCounts []int64 depth int64 start int64 @@ -946,7 +947,7 @@ func (w *recordEncoder) Encode(p *Payload, rec arrow.Record) error { } func (w *recordEncoder) encodeMetadata(p *Payload, nrows int64) error { - p.meta = writeRecordMessage(w.mem, nrows, p.size, w.fields, w.meta, w.codec) + p.meta = writeRecordMessage(w.mem, nrows, p.size, w.fields, w.meta, w.codec, w.variadicCounts) return nil } From 0ba9d5648c44e6adbcf7656eaaa8b40c1c0901f6 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 25 May 2023 13:54:10 -0400 
Subject: [PATCH 05/37] ipc integration tests --- docs/source/status.rst | 4 + format/Schema.fbs | 2 +- go/arrow/array/binary.go | 5 + go/arrow/array/binarybuilder.go | 36 +-- go/arrow/internal/arrdata/arrdata.go | 81 +++++++ go/arrow/internal/arrjson/arrjson.go | 161 ++++++++++++++ go/arrow/internal/arrjson/arrjson_test.go | 259 ++++++++++++++++++++++ go/arrow/ipc/file_reader.go | 41 +++- go/arrow/ipc/message.go | 12 +- go/arrow/ipc/writer.go | 27 +++ 10 files changed, 601 insertions(+), 27 deletions(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index c8c0e6dfc1d..c059ab3cef9 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -68,6 +68,10 @@ Data Types +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Large Utf8 | ✓ | ✓ | ✓ | | | ✓ | ✓ | | +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| Binary View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +| String View | ✓ | | ✓ | | | | | | ++-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ +-------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Data type | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | diff --git a/format/Schema.fbs b/format/Schema.fbs index 63c87722549..1b70e02fdda 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -49,7 +49,7 @@ enum MetadataVersion:short { /// bitmap buffer. V5, - /// >= 13.0.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 + /// >= 1.4.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 /// metadata and IPC messages). Implementations are recommended to provide a /// V5 compatibility mode with V6 format changes disabled. 
/// diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index ac03e60a88e..460a9eee220 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -319,6 +319,11 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool { return true } +type ViewLike interface { + arrow.Array + ValueHeader(int) *arrow.StringHeader +} + type BinaryView struct { array values []arrow.StringHeader diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 276fe55628f..92d9bdcebcd 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -507,18 +507,22 @@ func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) { b.Reserve(len(v)) outOfLineTotal := 0 - for _, vv := range v { - if !arrow.IsStringHeaderInline(len(vv)) { - outOfLineTotal += len(vv) + for i, vv := range v { + if len(valid) == 0 || valid[i] { + if !arrow.IsStringHeaderInline(len(vv)) { + outOfLineTotal += len(vv) + } } } b.ReserveData(outOfLineTotal) for i, vv := range v { - hdr := &b.rawData[b.length+i] - hdr.SetBytes(vv) - if !hdr.IsInline() { - b.blockBuilder.UnsafeAppend(hdr, vv) + if len(valid) == 0 || valid[i] { + hdr := &b.rawData[b.length+i] + hdr.SetBytes(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppend(hdr, vv) + } } } @@ -536,18 +540,22 @@ func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) { b.Reserve(len(v)) outOfLineTotal := 0 - for _, vv := range v { - if !arrow.IsStringHeaderInline(len(vv)) { - outOfLineTotal += len(vv) + for i, vv := range v { + if len(valid) == 0 || valid[i] { + if !arrow.IsStringHeaderInline(len(vv)) { + outOfLineTotal += len(vv) + } } } b.ReserveData(outOfLineTotal) for i, vv := range v { - hdr := &b.rawData[b.length+i] - hdr.SetString(vv) - if !hdr.IsInline() { - b.blockBuilder.UnsafeAppendString(hdr, vv) + if len(valid) == 0 || valid[i] { + hdr := &b.rawData[b.length+i] + hdr.SetString(vv) + if !hdr.IsInline() { + b.blockBuilder.UnsafeAppendString(hdr, vv) + } } } 
diff --git a/go/arrow/internal/arrdata/arrdata.go b/go/arrow/internal/arrdata/arrdata.go index 0851bff0fe0..b680fbbe64e 100644 --- a/go/arrow/internal/arrdata/arrdata.go +++ b/go/arrow/internal/arrdata/arrdata.go @@ -54,6 +54,7 @@ func init() { Records["extension"] = makeExtensionRecords() Records["union"] = makeUnionRecords() Records["run_end_encoded"] = makeRunEndEncodedRecords() + Records["view_types"] = makeStringViewRecords() for k := range Records { RecordNames = append(RecordNames, k) @@ -1155,6 +1156,65 @@ func makeRunEndEncodedRecords() []arrow.Record { return recs } +func makeStringViewRecords() []arrow.Record { + mem := memory.NewGoAllocator() + schema := arrow.NewSchema([]arrow.Field{ + {Name: "binary_view", Type: arrow.BinaryTypes.BinaryView, Nullable: true}, + {Name: "string_view", Type: arrow.BinaryTypes.StringView, Nullable: true}, + }, nil) + + mask := []bool{true, false, false, true, true} + chunks := [][]arrow.Array{ + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é"), []byte("2"), []byte("3"), []byte("4"), []byte("5")}, mask), + viewTypeArrayOf(mem, []string{"1é", "2", "3", "4", "5"}, mask), + }, + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é"), []byte("22222222222222"), []byte("33333333333333"), []byte("4444"), []byte("5555")}, mask), + viewTypeArrayOf(mem, []string{"1é", "22222222222222", "33333333333333", "4444", "5555"}, nil), + }, + { + viewTypeArrayOf(mem, [][]byte{[]byte("1é1é"), []byte("22222222222222"), []byte("33333333333333"), []byte("44"), []byte("55")}, nil), + viewTypeArrayOf(mem, []string{"1é1é", "22222222222222", "33333333333333", "44", "55"}, mask), + }, + } + + defer func() { + for _, chunk := range chunks { + for _, col := range chunk { + col.Release() + } + } + }() + + recs := make([]arrow.Record, len(chunks)) + for i, chunk := range chunks { + recs[i] = array.NewRecord(schema, chunk, -1) + } + + return recs +} + +func viewTypeArrayOf(mem memory.Allocator, a interface{}, valids []bool) arrow.Array { + if mem == nil { + mem = 
memory.NewGoAllocator() + } + + switch a := a.(type) { + case []string: + bldr := array.NewStringViewBuilder(mem) + defer bldr.Release() + bldr.AppendValues(a, valids) + return bldr.NewArray() + case [][]byte: + bldr := array.NewBinaryViewBuilder(mem) + defer bldr.Release() + bldr.AppendValues(a, valids) + return bldr.NewArray() + } + return nil +} + func extArray(mem memory.Allocator, dt arrow.ExtensionType, a interface{}, valids []bool) arrow.Array { var storage arrow.Array switch st := dt.StorageType().(type) { @@ -1750,5 +1810,26 @@ func buildArray(bldr array.Builder, data arrow.Array) { bldr.AppendNull() } } + + case *array.BinaryViewBuilder: + data := data.(*array.BinaryView) + for i := 0; i < data.Len(); i++ { + switch { + case data.IsValid(i): + bldr.Append(data.Value(i)) + default: + bldr.AppendNull() + } + } + case *array.StringViewBuilder: + data := data.(*array.StringView) + for i := 0; i < data.Len(); i++ { + switch { + case data.IsValid(i): + bldr.Append(data.Value(i)) + default: + bldr.AppendNull() + } + } } } diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index ad87b73fc4d..48162d2ae98 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -158,6 +158,10 @@ func typeToJSON(arrowType arrow.DataType) (json.RawMessage, error) { typ = nameJSON{"utf8"} case *arrow.LargeStringType: typ = nameJSON{"largeutf8"} + case *arrow.BinaryViewType: + typ = nameJSON{"binary_view"} + case *arrow.StringViewType: + typ = nameJSON{"utf8_view"} case *arrow.Date32Type: typ = unitZoneJSON{Name: "date", Unit: "DAY"} case *arrow.Date64Type: @@ -342,6 +346,10 @@ func typeFromJSON(typ json.RawMessage, children []FieldWrapper) (arrowType arrow arrowType = arrow.BinaryTypes.String case "largeutf8": arrowType = arrow.BinaryTypes.LargeString + case "binary_view": + arrowType = arrow.BinaryTypes.BinaryView + case "utf8_view": + arrowType = arrow.BinaryTypes.StringView case "date": t := unitZoneJSON{} 
if err = json.Unmarshal(typ, &t); err != nil { @@ -818,6 +826,7 @@ type Array struct { Offset interface{} `json:"OFFSET,omitempty"` Size interface{} `json:"SIZE,omitempty"` Children []Array `json:"children,omitempty"` + Variadic []string `json:"VARIADIC_BUFFERS,omitempty"` } func (a *Array) MarshalJSON() ([]byte, error) { @@ -1078,6 +1087,30 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr bldr.AppendValues(data, valids) return returnNewArrayData(bldr) + case *arrow.BinaryViewType: + valids := validsToBitmap(validsFromJSON(arr.Valids), mem) + nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) + headers := stringHeadersFromJSON(mem, true, arr.Data) + extraBufs := variadicBuffersFromJSON(arr.Variadic) + defer valids.Release() + defer headers.Release() + + return array.NewData(dt, arr.Count, + append([]*memory.Buffer{valids, headers}, extraBufs...), + nil, nulls, 0) + + case *arrow.StringViewType: + valids := validsToBitmap(validsFromJSON(arr.Valids), mem) + nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) + headers := stringHeadersFromJSON(mem, false, arr.Data) + extraBufs := variadicBuffersFromJSON(arr.Variadic) + defer valids.Release() + defer headers.Release() + + return array.NewData(dt, arr.Count, + append([]*memory.Buffer{valids, headers}, extraBufs...), + nil, nulls, 0) + case *arrow.ListType: valids := validsFromJSON(arr.Valids) elems := arrayFromJSON(mem, dt.Elem(), arr.Children[0]) @@ -1486,6 +1519,24 @@ func arrayToJSON(field arrow.Field, arr arrow.Array) Array { Offset: strOffsets, } + case *array.StringView: + variadic := variadicBuffersToJSON(arr.Data().Buffers()[2:]) + return Array{ + Name: field.Name, + Count: arr.Len(), + Valids: validsToJSON(arr), + Data: stringHeadersToJSON(arr, false), + Variadic: variadic, + } + case *array.BinaryView: + variadic := variadicBuffersToJSON(arr.Data().Buffers()[2:]) + return Array{ + Name: field.Name, + Count: arr.Len(), + Valids: 
validsToJSON(arr), + Data: stringHeadersToJSON(arr, true), + Variadic: variadic, + } case *array.List: o := Array{ Name: field.Name, @@ -2309,3 +2360,113 @@ func durationToJSON(arr *array.Duration) []interface{} { } return o } + +func variadicBuffersFromJSON(bufs []string) []*memory.Buffer { + out := make([]*memory.Buffer, len(bufs)) + for i, data := range bufs { + rawData, err := hex.DecodeString(data) + if err != nil { + panic(err) + } + + out[i] = memory.NewBufferBytes(rawData) + } + return out +} + +func variadicBuffersToJSON(bufs []*memory.Buffer) []string { + out := make([]string, len(bufs)) + for i, data := range bufs { + out[i] = strings.ToUpper(hex.EncodeToString(data.Bytes())) + } + return out +} + +func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer { + buf := memory.NewResizableBuffer(mem) + buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data))) + + values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes()) + + for i, d := range data { + switch v := d.(type) { + case nil: + continue + case map[string]interface{}: + if inlined, ok := v["INLINED"]; ok { + if isBinary { + val, err := hex.DecodeString(inlined.(string)) + if err != nil { + panic(fmt.Errorf("could not decode %v: %v", inlined, err)) + } + values[i].SetBytes(val) + } else { + values[i].SetString(inlined.(string)) + } + continue + } + + idx, offset := v["BUFFER_INDEX"].(json.Number), v["OFFSET"].(json.Number) + bufIdx, err := idx.Int64() + if err != nil { + panic(err) + } + + bufOffset, err := offset.Int64() + if err != nil { + panic(err) + } + + values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset)) + prefix, err := hex.DecodeString(v["PREFIX"].(string)) + if err != nil { + panic(err) + } + sz, err := v["SIZE"].(json.Number).Int64() + if err != nil { + panic(err) + } + + rawData := make([]byte, sz) + copy(rawData, prefix) + values[i].SetBytes(rawData) + } + } + return buf +} + +func stringHeadersToJSON(arr array.ViewLike, isBinary 
bool) []interface{} { + type StringHeader struct { + Size int `json:"SIZE"` + Prefix *string `json:"PREFIX,omitempty"` + BufferIdx *int `json:"BUFFER_INDEX,omitempty"` + BufferOff *int `json:"OFFSET,omitempty"` + Inlined *string `json:"INLINED,omitempty"` + } + + o := make([]interface{}, arr.Len()) + for i := range o { + hdr := arr.ValueHeader(i) + if hdr.IsInline() { + data := hdr.InlineData() + if isBinary { + data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes())) + } + o[i] = StringHeader{ + Size: hdr.Len(), + Inlined: &data, + } + } else { + idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset()) + prefix := hdr.Prefix() + encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:])) + o[i] = StringHeader{ + Size: hdr.Len(), + Prefix: &encodedPrefix, + BufferIdx: &idx, + BufferOff: &off, + } + } + } + return o +} diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index ee85d431805..e423c7818fe 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -48,6 +48,7 @@ func TestReadWrite(t *testing.T) { wantJSONs["dictionary"] = makeDictionaryWantJSONs() wantJSONs["union"] = makeUnionWantJSONs() wantJSONs["run_end_encoded"] = makeRunEndEncodedWantJSONs() + wantJSONs["view_types"] = makeViewTypesWantJSONs() tempDir := t.TempDir() for name, recs := range arrdata.Records { @@ -6127,3 +6128,261 @@ func makeRunEndEncodedWantJSONs() string { ] }` } + +func makeViewTypesWantJSONs() string { + return `{ + "schema": { + "fields": [ + { + "name": "binary_view", + "type": { + "name": "binary_view" + }, + "nullable": true, + "children": [] + }, + { + "name": "string_view", + "type": { + "name": "utf8_view" + }, + "nullable": true, + "children": [] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "31C3A9" + }, + { + "SIZE": 0, + 
"INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 1, + "INLINED": "34" + }, + { + "SIZE": 1, + "INLINED": "35" + } + ], + "VARIADIC_BUFFERS": [""] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "1é" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 1, + "INLINED": "4" + }, + { + "SIZE": 1, + "INLINED": "5" + } + ], + "VARIADIC_BUFFERS": [""] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "31C3A9" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 4, + "INLINED": "34343434" + }, + { + "SIZE": 4, + "INLINED": "35353535" + } + ], + "VARIADIC_BUFFERS": [""] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 3, + "INLINED": "1é" + }, + { + "SIZE": 14, + "PREFIX": "32323232", + "BUFFER_INDEX": 0, + "OFFSET": 0 + }, + { + "SIZE": 14, + "PREFIX": "33333333", + "BUFFER_INDEX": 0, + "OFFSET": 14 + }, + { + "SIZE": 4, + "INLINED": "4444" + }, + { + "SIZE": 4, + "INLINED": "5555" + } + ], + "VARIADIC_BUFFERS": [ + "32323232323232323232323232323333333333333333333333333333" + ] + } + ] + }, + { + "count": 5, + "columns": [ + { + "name": "binary_view", + "count": 5, + "VALIDITY": [ + 1, + 1, + 1, + 1, + 1 + ], + "DATA": [ + { + "SIZE": 6, + "INLINED": "31C3A931C3A9" + }, + { + "SIZE": 14, + "PREFIX": "32323232", + "BUFFER_INDEX": 0, + "OFFSET": 0 + }, + { + "SIZE": 14, + "PREFIX": "33333333", + "BUFFER_INDEX": 0, + "OFFSET": 14 + }, + { + "SIZE": 2, + "INLINED": "3434" + }, + { + "SIZE": 2, + "INLINED": "3535" + } + ], + "VARIADIC_BUFFERS": [ + "32323232323232323232323232323333333333333333333333333333" + ] + }, + { + "name": "string_view", + "count": 5, + "VALIDITY": [ + 1, + 0, + 0, + 1, + 1 
+ ], + "DATA": [ + { + "SIZE": 6, + "INLINED": "1é1é" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 0, + "INLINED": "" + }, + { + "SIZE": 2, + "INLINED": "44" + }, + { + "SIZE": 2, + "INLINED": "55" + } + ], + "VARIADIC_BUFFERS": [""] + } + ] + } + ] +}` +} diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 10cb2cae764..67c51f29a29 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -430,13 +430,18 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode { return &node } +func (src *ipcSource) variadicCount(i int) int64 { + return src.meta.VariadicCounts(i) +} + type arrayLoaderContext struct { - src ipcSource - ifield int - ibuffer int - max int - memo *dictutils.Memo - version MetadataVersion + src ipcSource + ifield int + ibuffer int + ivariadic int + max int + memo *dictutils.Memo + version MetadataVersion } func (ctx *arrayLoaderContext) field() *flatbuf.FieldNode { @@ -451,6 +456,12 @@ func (ctx *arrayLoaderContext) buffer() *memory.Buffer { return buf } +func (ctx *arrayLoaderContext) variadic() int64 { + v := ctx.src.variadicCount(ctx.ivariadic) + ctx.ivariadic++ + return v +} + func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { switch dt := dt.(type) { case *arrow.NullType: @@ -476,6 +487,12 @@ func (ctx *arrayLoaderContext) loadArray(dt arrow.DataType) arrow.ArrayData { case *arrow.BinaryType, *arrow.StringType, *arrow.LargeStringType, *arrow.LargeBinaryType: return ctx.loadBinary(dt) + case arrow.BinaryViewDataType: + if ctx.version < MetadataV6 { + panic("arrow/ipc: cannot safely read BinaryView/StringView columns with metadata version = Arrow-0.8.0 - MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow-1.0.0, backward compatible with v4 - MetadataV6 = MetadataVersion(flatbuf.MetadataVersionV6) // version for >= Arrow-13.0.0, backwards compatible with v5 + MetadataV1 = MetadataVersion(flatbuf.MetadataVersionV1) // version for 
Arrow Format-0.1.0 + MetadataV2 = MetadataVersion(flatbuf.MetadataVersionV2) // version for Arrow Format-0.2.0 + MetadataV3 = MetadataVersion(flatbuf.MetadataVersionV3) // version for Arrow Format-0.3.0 to 0.7.1 + MetadataV4 = MetadataVersion(flatbuf.MetadataVersionV4) // version for >= Arrow Format-0.8.0 + MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow Format-1.0.0, backward compatible with v4 + MetadataV6 = MetadataVersion(flatbuf.MetadataVersionV6) // version for >= Arrow Format-1.4.0, backwards compatible with v5 ) func (m MetadataVersion) String() string { diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index f173a386bd0..f365f983ad8 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -603,6 +603,33 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { p.body = append(p.body, voffsets) p.body = append(p.body, values) + case arrow.BinaryViewDataType: + data := arr.Data() + values := data.Buffers()[1] + arrLen := int64(arr.Len()) + typeWidth := int64(arrow.StringHeaderSizeBytes) + minLength := paddedLength(arrLen*typeWidth, kArrowAlignment) + + switch { + case needTruncate(int64(data.Offset()), values, minLength): + // non-zero offset: slice the buffer + offset := int64(data.Offset()) * typeWidth + // send padding if available + len := minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len())-offset) + values = memory.NewBufferBytes(values.Bytes()[offset : offset+len]) + default: + if values != nil { + values.Retain() + } + } + p.body = append(p.body, values) + + w.variadicCounts = append(w.variadicCounts, int64(len(data.Buffers())-2)) + for _, b := range data.Buffers()[2:] { + b.Retain() + p.body = append(p.body, b) + } + case *arrow.StructType: w.depth-- arr := arr.(*array.Struct) From 46e7034f7d04f56357714a968ea5369ed8e7c87a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 25 May 2023 14:07:54 -0400 Subject: [PATCH 06/37] don't bump to v6 metadata yet --- 
format/Schema.fbs | 11 ----------- go/arrow/internal/flatbuf/Binary.go | 7 ------- go/arrow/internal/flatbuf/BodyCompression.go | 7 ------- go/arrow/internal/flatbuf/Bool.go | 7 ------- go/arrow/internal/flatbuf/Date.go | 7 ------- go/arrow/internal/flatbuf/Decimal.go | 7 ------- go/arrow/internal/flatbuf/DictionaryBatch.go | 7 ------- go/arrow/internal/flatbuf/DictionaryEncoding.go | 7 ------- go/arrow/internal/flatbuf/Duration.go | 7 ------- go/arrow/internal/flatbuf/Field.go | 7 ------- go/arrow/internal/flatbuf/FixedSizeBinary.go | 7 ------- go/arrow/internal/flatbuf/FixedSizeList.go | 7 ------- go/arrow/internal/flatbuf/FloatingPoint.go | 7 ------- go/arrow/internal/flatbuf/Footer.go | 7 ------- go/arrow/internal/flatbuf/Int.go | 7 ------- go/arrow/internal/flatbuf/Interval.go | 7 ------- go/arrow/internal/flatbuf/KeyValue.go | 7 ------- go/arrow/internal/flatbuf/LargeBinary.go | 7 ------- go/arrow/internal/flatbuf/LargeList.go | 7 ------- go/arrow/internal/flatbuf/LargeUtf8.go | 7 ------- go/arrow/internal/flatbuf/List.go | 7 ------- go/arrow/internal/flatbuf/Map.go | 7 ------- go/arrow/internal/flatbuf/Message.go | 7 ------- go/arrow/internal/flatbuf/MetadataVersion.go | 12 ------------ go/arrow/internal/flatbuf/Null.go | 7 ------- go/arrow/internal/flatbuf/RecordBatch.go | 7 ------- go/arrow/internal/flatbuf/RunEndEncoded.go | 7 ------- go/arrow/internal/flatbuf/Schema.go | 7 ------- go/arrow/internal/flatbuf/SparseMatrixIndexCSX.go | 7 ------- go/arrow/internal/flatbuf/SparseTensor.go | 7 ------- go/arrow/internal/flatbuf/SparseTensorIndexCOO.go | 7 ------- go/arrow/internal/flatbuf/SparseTensorIndexCSF.go | 7 ------- go/arrow/internal/flatbuf/Struct_.go | 7 ------- go/arrow/internal/flatbuf/Tensor.go | 7 ------- go/arrow/internal/flatbuf/TensorDim.go | 7 ------- go/arrow/internal/flatbuf/Time.go | 7 ------- go/arrow/internal/flatbuf/Timestamp.go | 7 ------- go/arrow/internal/flatbuf/Union.go | 7 ------- go/arrow/internal/flatbuf/Utf8.go | 7 ------- 
go/arrow/ipc/file_reader.go | 3 --- go/arrow/ipc/message.go | 1 - go/arrow/ipc/metadata.go | 2 +- 42 files changed, 1 insertion(+), 287 deletions(-) diff --git a/format/Schema.fbs b/format/Schema.fbs index 1b70e02fdda..dbf482e6cc7 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -48,17 +48,6 @@ enum MetadataVersion:short { /// - Union buffer layout has changed. In V5, Unions don't have a validity /// bitmap buffer. V5, - - /// >= 1.4.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 - /// metadata and IPC messages). Implementations are recommended to provide a - /// V5 compatibility mode with V6 format changes disabled. - /// - /// Incompatible changes between V5 and V6: - /// - variadicCounts field added to RecordBatch table in order to support - /// the variable number of buffers possible with Utf8View and BinaryView - /// columns. Those types can't be safely read by a = 13.0.0 (May 2023). Backwards compatible with V5 (V6 readers can read V5 - /// metadata and IPC messages). Implementations are recommended to provide a - /// V5 compatibility mode with V6 format changes disabled. - /// - /// Incompatible changes between V5 and V6: - /// - variadicCounts field added to RecordBatch table in order to support - /// the variable number of buffers possible with Utf8View and BinaryView - /// columns. 
Those types can't be safely read by a = Arrow Format-0.8.0 MetadataV5 = MetadataVersion(flatbuf.MetadataVersionV5) // version for >= Arrow Format-1.0.0, backward compatible with v4 - MetadataV6 = MetadataVersion(flatbuf.MetadataVersionV6) // version for >= Arrow Format-1.4.0, backwards compatible with v5 ) func (m MetadataVersion) String() string { diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index 84a8f74a2d9..044681f081d 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -35,7 +35,7 @@ import ( var Magic = []byte("ARROW1") const ( - currentMetadataVersion = MetadataV6 + currentMetadataVersion = MetadataV5 minMetadataVersion = MetadataV4 // constants for the extension type metadata keys for the type name and From 2a85125aeacb1c959e6f8dfc3ca4c357697c865f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 25 May 2023 16:28:27 -0400 Subject: [PATCH 07/37] update datagen with fix --- dev/archery/archery/integration/datagen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 1ce2775c160..a0bffe83b64 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -826,6 +826,7 @@ def _get_buffers(self): ] + class StringViewColumn(BinaryViewColumn): def _encode_value(self, x): From 0b141d823b9680b4ddd1b5c748e5982350ac927d Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 26 May 2023 11:55:34 -0400 Subject: [PATCH 08/37] fix lint and integration issue --- dev/archery/archery/integration/datagen.py | 1 - go/arrow/compute/executor.go | 2 ++ go/arrow/internal/arrjson/arrjson.go | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index a0bffe83b64..1ce2775c160 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -826,7 +826,6 @@ 
def _get_buffers(self): ] - class StringViewColumn(BinaryViewColumn): def _encode_value(self, x): diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index 6da7ed12930..5085bc179c5 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -171,6 +171,8 @@ func addComputeDataPrealloc(dt arrow.DataType, widths []bufferPrealloc) []buffer return append(widths, bufferPrealloc{bitWidth: 32, addLen: 1}) case arrow.LARGE_BINARY, arrow.LARGE_STRING, arrow.LARGE_LIST: return append(widths, bufferPrealloc{bitWidth: 64, addLen: 1}) + case arrow.STRING_VIEW, arrow.BINARY_VIEW: + return append(widths, bufferPrealloc{bitWidth: arrow.StringHeaderSizeBytes * 8}) } return widths } diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 48162d2ae98..45695fb065d 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -159,9 +159,9 @@ func typeToJSON(arrowType arrow.DataType) (json.RawMessage, error) { case *arrow.LargeStringType: typ = nameJSON{"largeutf8"} case *arrow.BinaryViewType: - typ = nameJSON{"binary_view"} + typ = nameJSON{"binaryview"} case *arrow.StringViewType: - typ = nameJSON{"utf8_view"} + typ = nameJSON{"utf8view"} case *arrow.Date32Type: typ = unitZoneJSON{Name: "date", Unit: "DAY"} case *arrow.Date64Type: @@ -346,9 +346,9 @@ func typeFromJSON(typ json.RawMessage, children []FieldWrapper) (arrowType arrow arrowType = arrow.BinaryTypes.String case "largeutf8": arrowType = arrow.BinaryTypes.LargeString - case "binary_view": + case "binaryview": arrowType = arrow.BinaryTypes.BinaryView - case "utf8_view": + case "utf8view": arrowType = arrow.BinaryTypes.StringView case "date": t := unitZoneJSON{} From 7e19d390d14dfd5315d3973a224b36bf515e75eb Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 26 May 2023 12:02:13 -0400 Subject: [PATCH 09/37] forgot to update the test --- go/arrow/internal/arrjson/arrjson_test.go | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/go/arrow/internal/arrjson/arrjson_test.go b/go/arrow/internal/arrjson/arrjson_test.go index e423c7818fe..faab5e94ecb 100644 --- a/go/arrow/internal/arrjson/arrjson_test.go +++ b/go/arrow/internal/arrjson/arrjson_test.go @@ -6136,7 +6136,7 @@ func makeViewTypesWantJSONs() string { { "name": "binary_view", "type": { - "name": "binary_view" + "name": "binaryview" }, "nullable": true, "children": [] @@ -6144,7 +6144,7 @@ func makeViewTypesWantJSONs() string { { "name": "string_view", "type": { - "name": "utf8_view" + "name": "utf8view" }, "nullable": true, "children": [] From 15888ac40e50822798fc1597f354693f5a3b3974 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 15 Jun 2023 12:57:39 -0400 Subject: [PATCH 10/37] fix 32bit and tinygo compat --- go/arrow/array/binarybuilder.go | 8 ++++---- go/arrow/type_traits_string_view.go | 14 ++------------ 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 92d9bdcebcd..0c798f5bd4e 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -372,8 +372,8 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error { } const ( - dfltBlockSize = 1 << 20 // 1 MB - viewValueSizeLimit = math.MaxUint32 + dfltBlockSize = 1 << 20 // 1 MB + viewValueSizeLimit uint32 = math.MaxUint32 ) type BinaryViewBuilder struct { @@ -443,7 +443,7 @@ func (b *BinaryViewBuilder) Resize(n int) { } func (b *BinaryViewBuilder) ReserveData(length int) { - if length > viewValueSizeLimit { + if uint32(length) > viewValueSizeLimit { panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid)) } @@ -455,7 +455,7 @@ func (b *BinaryViewBuilder) Reserve(n int) { } func (b *BinaryViewBuilder) Append(v []byte) { - if len(v) > viewValueSizeLimit { + if uint32(len(v)) > viewValueSizeLimit { panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference 
strings larger than 4GB", arrow.ErrInvalid)) } diff --git a/go/arrow/type_traits_string_view.go b/go/arrow/type_traits_string_view.go index 8df79fa5334..1006dfb2eb0 100644 --- a/go/arrow/type_traits_string_view.go +++ b/go/arrow/type_traits_string_view.go @@ -41,23 +41,13 @@ func (stringHeaderTraits) PutValue(b []byte, v StringHeader) { func (stringHeaderTraits) CastFromBytes(b []byte) (res []StringHeader) { h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len / StringHeaderSizeBytes - s.Cap = h.Cap / StringHeaderSizeBytes - - return + return unsafe.Slice((*StringHeader)(unsafe.Pointer(h.Data)), cap(b)/StringHeaderSizeBytes)[:len(b)/StringHeaderSizeBytes] } func (stringHeaderTraits) CastToBytes(b []StringHeader) (res []byte) { h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - s := (*reflect.SliceHeader)(unsafe.Pointer(&res)) - s.Data = h.Data - s.Len = h.Len * StringHeaderSizeBytes - s.Cap = h.Cap * StringHeaderSizeBytes - - return + return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*StringHeaderSizeBytes)[:len(b)*StringHeaderSizeBytes] } func (stringHeaderTraits) Copy(dst, src []StringHeader) { copy(dst, src) } From 76e9bbc71d9868de4a3d40035065184dabe7e16f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 15 Jun 2023 13:23:23 -0400 Subject: [PATCH 11/37] update inlinedata for compatibility with go1.20 and tinygo --- go/arrow/datatype_binary.go | 128 ----------------- go/arrow/datatype_stringheader.go | 136 ++++++++++++++++++ go/arrow/datatype_stringheader_inline.go | 31 ++++ .../datatype_stringheader_inline_go1.19.go | 35 +++++ .../datatype_stringheader_inline_tinygo.go | 35 +++++ 5 files changed, 237 insertions(+), 128 deletions(-) create mode 100644 go/arrow/datatype_stringheader.go create mode 100644 go/arrow/datatype_stringheader_inline.go create mode 100644 go/arrow/datatype_stringheader_inline_go1.19.go create mode 100644 
go/arrow/datatype_stringheader_inline_tinygo.go diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index 2dce18e9795..d49506e6a7b 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -16,16 +16,6 @@ package arrow -import ( - "bytes" - "reflect" - "unsafe" - - "github.com/apache/arrow/go/v13/arrow/endian" - "github.com/apache/arrow/go/v13/arrow/internal/debug" - "github.com/apache/arrow/go/v13/arrow/memory" -) - // OffsetTraits is a convenient interface over the various type traits // constants such as arrow.Int32Traits allowing types with offsets, like // BinaryType, StringType, LargeBinaryType and LargeStringType to have @@ -93,124 +83,6 @@ func (t *LargeStringType) Layout() DataTypeLayout { func (t *LargeStringType) OffsetTypeTraits() OffsetTraits { return Int64Traits } func (LargeStringType) IsUtf8() bool { return true } -const ( - StringHeaderPrefixLen = 4 - stringHeaderInlineSize = 12 -) - -func IsStringHeaderInline(length int) bool { - return length < stringHeaderInlineSize -} - -// StringHeader is a variable length string (utf8) or byte slice with -// a 4 byte prefix and inline optimization for small values (12 bytes -// or fewer). This is similar to Go's standard string but limited by -// a length of Uint32Max and up to the first four bytes of the string -// are copied into the struct. This prefix allows failing comparisons -// early and can reduce CPU cache working set when dealing with short -// strings. -// -// There are two situations: -// -// Short string |----|----|--------| -// ^ ^ ^ -// | | | -// size prefix remaining in-line portion, zero padded -// -// IO Long String |----|----|----|----| -// ^ ^ ^ ^ -// | | | | -// size prefix buffer index and offset to out-of-line portion -// -// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. 
-// -// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf -type StringHeader struct { - size uint32 - // the first 4 bytes of this are the prefix for the string - // if size <= StringHeaderInlineSize, then the entire string - // is in the data array and is zero padded. - // if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32 - // values which are the buffer index and offset in that buffer - // containing the full string. - data [stringHeaderInlineSize]byte -} - -func (sh *StringHeader) IsInline() bool { - return sh.size <= uint32(stringHeaderInlineSize) -} - -func (sh *StringHeader) Len() int { return int(sh.size) } -func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte { - return *(*[4]byte)(unsafe.Pointer(&sh.data)) -} - -func (sh *StringHeader) BufferIndex() uint32 { - return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:]) -} - -func (sh *StringHeader) BufferOffset() uint32 { - return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:]) -} - -func (sh *StringHeader) InlineBytes() (data []byte) { - debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader") - return sh.data[:sh.size] -} - -func (sh *StringHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") - h := (*reflect.StringHeader)(unsafe.Pointer(&data)) - h.Data = uintptr(unsafe.Pointer(&sh.data)) - h.Len = int(sh.size) - return -} - -func (sh *StringHeader) SetBytes(data []byte) int { - sh.size = uint32(len(data)) - if sh.IsInline() { - return copy(sh.data[:], data) - } - return copy(sh.data[:4], data) -} - -func (sh *StringHeader) SetString(data string) int { - sh.size = uint32(len(data)) - if sh.IsInline() { - return copy(sh.data[:], data) - } - return copy(sh.data[:4], data) -} - -func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) { - endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex) - endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], 
offset) -} - -func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool { - if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() { - return false - } - - if sh.IsInline() { - return sh.inlinedAsInt64() == other.inlinedAsInt64() - } - - data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size] - otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size] - return bytes.Equal(data, otherData) -} - -func (sh *StringHeader) inlinedAsInt64() int64 { - s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) - return s[1] -} - -func (sh *StringHeader) sizeAndPrefixAsInt() int64 { - s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) - return s[0] -} - type BinaryViewType struct{} func (*BinaryViewType) ID() Type { return BINARY_VIEW } diff --git a/go/arrow/datatype_stringheader.go b/go/arrow/datatype_stringheader.go new file mode 100644 index 00000000000..6a8ad463b6d --- /dev/null +++ b/go/arrow/datatype_stringheader.go @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package arrow + +import ( + "bytes" + "unsafe" + + "github.com/apache/arrow/go/v13/arrow/endian" + "github.com/apache/arrow/go/v13/arrow/internal/debug" + "github.com/apache/arrow/go/v13/arrow/memory" +) + +const ( + StringHeaderPrefixLen = 4 + stringHeaderInlineSize = 12 +) + +func IsStringHeaderInline(length int) bool { + return length < stringHeaderInlineSize +} + +// StringHeader is a variable length string (utf8) or byte slice with +// a 4 byte prefix and inline optimization for small values (12 bytes +// or fewer). This is similar to Go's standard string but limited by +// a length of Uint32Max and up to the first four bytes of the string +// are copied into the struct. This prefix allows failing comparisons +// early and can reduce CPU cache working set when dealing with short +// strings. +// +// There are two situations: +// +// Short string |----|----|--------| +// ^ ^ ^ +// | | | +// size prefix remaining in-line portion, zero padded +// +// IO Long String |----|----|----|----| +// ^ ^ ^ ^ +// | | | | +// size prefix buffer index and offset to out-of-line portion +// +// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. +// +// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf +type StringHeader struct { + size uint32 + // the first 4 bytes of this are the prefix for the string + // if size <= StringHeaderInlineSize, then the entire string + // is in the data array and is zero padded. + // if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32 + // values which are the buffer index and offset in that buffer + // containing the full string. 
+ data [stringHeaderInlineSize]byte +} + +func (sh *StringHeader) IsInline() bool { + return sh.size <= uint32(stringHeaderInlineSize) +} + +func (sh *StringHeader) Len() int { return int(sh.size) } +func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte { + return *(*[4]byte)(unsafe.Pointer(&sh.data)) +} + +func (sh *StringHeader) BufferIndex() uint32 { + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:]) +} + +func (sh *StringHeader) BufferOffset() uint32 { + return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:]) +} + +func (sh *StringHeader) InlineBytes() (data []byte) { + debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader") + return sh.data[:sh.size] +} + +func (sh *StringHeader) SetBytes(data []byte) int { + sh.size = uint32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *StringHeader) SetString(data string) int { + sh.size = uint32(len(data)) + if sh.IsInline() { + return copy(sh.data[:], data) + } + return copy(sh.data[:4], data) +} + +func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) { + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex) + endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset) +} + +func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool { + if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() { + return false + } + + if sh.IsInline() { + return sh.inlinedAsInt64() == other.inlinedAsInt64() + } + + data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size] + otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size] + return bytes.Equal(data, otherData) +} + +func (sh *StringHeader) inlinedAsInt64() int64 { + s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[1] +} + +func (sh *StringHeader) sizeAndPrefixAsInt() int64 { + s 
:= unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) + return s[0] +} diff --git a/go/arrow/datatype_stringheader_inline.go b/go/arrow/datatype_stringheader_inline.go new file mode 100644 index 00000000000..e33938f3a1c --- /dev/null +++ b/go/arrow/datatype_stringheader_inline.go @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build go1.20 + +package arrow + +import ( + "unsafe" + + "github.com/apache/arrow/go/v13/arrow/internal/debug" +) + +func (sh *StringHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") + + return unsafe.String((*byte)(unsafe.Pointer(&sh.data)), sh.size) +} diff --git a/go/arrow/datatype_stringheader_inline_go1.19.go b/go/arrow/datatype_stringheader_inline_go1.19.go new file mode 100644 index 00000000000..ab1cc64c138 --- /dev/null +++ b/go/arrow/datatype_stringheader_inline_go1.19.go @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !go1.20 && !tinygo + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v13/arrow/internal/debug" +) + +func (sh *StringHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") + + h := (*reflect.StringHeader)(unsafe.Pointer(&data)) + h.Data = uintptr(unsafe.Pointer(&sh.data)) + h.Len = int(sh.size) + return +} diff --git a/go/arrow/datatype_stringheader_inline_tinygo.go b/go/arrow/datatype_stringheader_inline_tinygo.go new file mode 100644 index 00000000000..c5ab89da83f --- /dev/null +++ b/go/arrow/datatype_stringheader_inline_tinygo.go @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !go1.20 && tinygo + +package arrow + +import ( + "reflect" + "unsafe" + + "github.com/apache/arrow/go/v13/arrow/internal/debug" +) + +func (sh *StringHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") + + h := (*reflect.StringHeader)(unsafe.Pointer(&data)) + h.Data = uintptr(unsafe.Pointer(&sh.data)) + h.Len = uintptr(sh.size) + return +} From 3595e5d9d8ad8019e65e32d1f6d8df6ebd79bd87 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 12:16:04 -0400 Subject: [PATCH 12/37] Update go/arrow/datatype_binary.go Co-authored-by: Alex Shcherbakov --- go/arrow/datatype_binary.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index d49506e6a7b..d74e33e5b84 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -94,8 +94,10 @@ func (*BinaryViewType) view() {} func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) } func (*BinaryViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() - return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), - SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic} + return DataTypeLayout{ + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)}, + VariadicSpec: &variadic, + } } type StringViewType struct{} From 92a8362a2357d6d50f486a4873316f4487997874 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 12:17:29 -0400 Subject: [PATCH 13/37] Update go/arrow/datatype_binary.go Co-authored-by: Alex Shcherbakov --- go/arrow/datatype_binary.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index d74e33e5b84..603e854d4d9 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go 
@@ -111,8 +111,10 @@ func (*StringViewType) view() {} func (t *StringViewType) Fingerprint() string { return typeFingerprint(t) } func (*StringViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() - return DataTypeLayout{Buffers: []BufferSpec{SpecBitmap(), - SpecFixedWidth(StringHeaderSizeBytes)}, VariadicSpec: &variadic} + return DataTypeLayout{ + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)}, + VariadicSpec: &variadic, + } } var ( From 704bf82efaa898fb6588721176ac8adf18a873e3 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 12:20:15 -0400 Subject: [PATCH 14/37] Update go/arrow/array/binary.go Co-authored-by: Alex Shcherbakov --- go/arrow/array/binary.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 460a9eee220..6a382a34e7d 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -331,8 +331,7 @@ type BinaryView struct { } func NewBinaryViewData(data arrow.ArrayData) *BinaryView { - a := &BinaryView{} - a.refCount = 1 + a := &BinaryView{refCount: 1} a.setData(data.(*Data)) return a } From dcdc1b14ec0f17c8904810c933be078a240ef92b Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 12:50:15 -0400 Subject: [PATCH 15/37] embedded field --- go/arrow/array/binary.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 6a382a34e7d..460a9eee220 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -331,7 +331,8 @@ type BinaryView struct { } func NewBinaryViewData(data arrow.ArrayData) *BinaryView { - a := &BinaryView{refCount: 1} + a := &BinaryView{} + a.refCount = 1 a.setData(data.(*Data)) return a } From c15b7ba95f94e77d633fe67eaa4713fec186bb10 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 15:33:27 -0400 Subject: [PATCH 16/37] updates from review feedback --- go/arrow/array/binary.go | 10 
++++++++++ go/arrow/array/binarybuilder.go | 14 ++++++++++++++ go/arrow/datatype_binary.go | 3 +++ 3 files changed, 27 insertions(+) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 460a9eee220..7848c46c212 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -367,6 +367,8 @@ func (a *BinaryView) Value(i int) []byte { return buf.Bytes()[start : start+uint32(s.Len())] } +// ValueString returns the value at index i as a string instead of +// a byte slice, without copying the underlying data. func (a *BinaryView) ValueString(i int) string { b := a.Value(i) return *(*string)(unsafe.Pointer(&b)) @@ -390,6 +392,14 @@ func (a *BinaryView) String() string { return o.String() } +// ValueStr is paired with AppendValueFromString in that it returns +// the value at index i as a string: Semantically this means that for +// a null value it will return the string "(null)", otherwise it will +// return the value as a base64 encoded string suitable for CSV/JSON. +// +// This is always going to be less performant than just using ValueString +// and exists to fulfill the Array interface to provide a method which +// can produce a human readable string for a given index. func (a *BinaryView) ValueStr(i int) string { if a.IsNull(i) { return NullValueStr diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 0c798f5bd4e..ea03c1e1d39 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -467,6 +467,14 @@ func (b *BinaryViewBuilder) Append(v []byte) { b.UnsafeAppend(v) } +// AppendString is identical to Append, only accepting a string instead +// of a byte slice, avoiding the extra copy that would occur if you simply +// did []byte(v). +// +// This is different than AppendValueFromString which exists for the +// Builder interface, in that this expects raw binary data which is +// appended as such. AppendValueFromString expects base64 encoded binary +// data instead. 
func (b *BinaryViewBuilder) AppendString(v string) { // create a []byte without copying the bytes // in go1.20 this would be unsafe.StringData @@ -562,6 +570,12 @@ func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) { b.builder.unsafeAppendBoolsToBitmap(valid, len(v)) } +// AppendValueFromString is paired with ValueStr for fulfilling the +// base Builder interface. This is intended to read in a human-readable +// string such as from CSV or JSON and append it to the array. +// +// For Binary values are expected to be base64 encoded (and will be +// decoded as such before being appended). func (b *BinaryViewBuilder) AppendValueFromString(s string) error { if s == NullValueStr { b.AppendNull() diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index 603e854d4d9..cb3c3e2ed87 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -133,4 +133,7 @@ var ( BinaryView: &BinaryViewType{}, StringView: &StringViewType{}, } + + _ BinaryViewDataType = (*StringViewType)(nil) + _ BinaryViewDataType = (*BinaryViewType)(nil) ) From d6bbd35a5d78d0ba92228e18e651df8ed05c5782 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 16:13:55 -0400 Subject: [PATCH 17/37] add AppendNulls and AppendEmptyValues --- go/arrow/array/binarybuilder.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index ea03c1e1d39..7f503c8e9ea 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -490,11 +490,23 @@ func (b *BinaryViewBuilder) AppendNull() { b.UnsafeAppendBoolToBitmap(false) } +func (b *BinaryViewBuilder) AppendNulls(n int) { + b.Reserve(n) + for i := 0; i < n; i++ { + b.UnsafeAppendBoolToBitmap(false) + } +} + func (b *BinaryViewBuilder) AppendEmptyValue() { b.Reserve(1) b.UnsafeAppendBoolToBitmap(true) } +func (b *BinaryViewBuilder) AppendEmptyValues(n int) { + b.Reserve(n) + b.unsafeAppendBoolsToBitmap(nil, n) 
+} + func (b *BinaryViewBuilder) UnsafeAppend(v []byte) { hdr := &b.rawData[b.length] hdr.SetBytes(v) From 8dbcf526ada3fc578d5055a86336ddd7382ce580 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 21 Jun 2023 16:28:05 -0400 Subject: [PATCH 18/37] handle flaky test --- go/arrow/compute/executor.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index 5085bc179c5..2aae50b9d02 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -1009,9 +1009,10 @@ func (v *vectorExecutor) WrapResults(ctx context.Context, out <-chan Datum, hasC case <-ctx.Done(): return nil case output = <-out: - if output == nil { + if output == nil || ctx.Err() != nil { return nil } + // if the inputs contained at least one chunked array // then we want to return chunked output if hasChunked { From d0e03bb3f1b10552f5b6f6be82f1e05274c21b56 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:35:03 -0400 Subject: [PATCH 19/37] Update go/arrow/internal/testing/gen/random_array_gen.go Co-authored-by: Alex Shcherbakov --- go/arrow/internal/testing/gen/random_array_gen.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/go/arrow/internal/testing/gen/random_array_gen.go b/go/arrow/internal/testing/gen/random_array_gen.go index ec2d9085238..caf7fd89622 100644 --- a/go/arrow/internal/testing/gen/random_array_gen.go +++ b/go/arrow/internal/testing/gen/random_array_gen.go @@ -375,11 +375,11 @@ func (r *RandomArrayGenerator) generateBinaryView(dt arrow.DataType, size int64, } for i := 0; i < lengths.Len(); i++ { - if lengths.IsValid(i) { - bldr.Append(gen(lengths.Value(i))) - } else { + if lengths.IsNull(i) { bldr.AppendNull() + continue } + bldr.Append(gen(lengths.Value(i))) } return bldr.NewArray() From 646b1e29032f9496016acc7a9694a68e6516097e Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:45:46 -0400 Subject: [PATCH 20/37] Update 
go/arrow/array/binarybuilder.go Co-authored-by: Alex Shcherbakov --- go/arrow/array/binarybuilder.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 7f503c8e9ea..81f038092ee 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -406,16 +406,18 @@ func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype } func (b *BinaryViewBuilder) Release() { debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") - if atomic.AddInt64(&b.refCount, -1) == 0 { - if b.nullBitmap != nil { - b.nullBitmap.Release() - b.nullBitmap = nil - } - if b.data != nil { - b.data.Release() - b.data = nil - b.rawData = nil - } + if atomic.AddInt64(&b.refCount, -1) != 0 { + return + } + + if b.nullBitmap != nil { + b.nullBitmap.Release() + b.nullBitmap = nil + } + if b.data != nil { + b.data.Release() + b.data = nil + b.rawData = nil } } From 24fb62808483869a9fec2ecfb58d8380d29b8af4 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:46:44 -0400 Subject: [PATCH 21/37] Update go/arrow/array/binarybuilder.go Co-authored-by: Alex Shcherbakov --- go/arrow/array/binarybuilder.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 81f038092ee..9203716ee30 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -437,11 +437,12 @@ func (b *BinaryViewBuilder) Resize(n int) { if b.capacity == 0 { b.init(n) - } else { - b.builder.resize(nbuild, b.init) - b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n)) - b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) + return } + + b.builder.resize(nbuild, b.init) + b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n)) + b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) } func (b *BinaryViewBuilder) ReserveData(length int) { 
From 306ee9459feaf5bf938cf03c43171c317fab73b1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:56:10 -0400 Subject: [PATCH 22/37] Update go/arrow/array/bufferbuilder.go Co-authored-by: Alex Shcherbakov --- go/arrow/array/bufferbuilder.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index 61731dc3a5e..e62dcc10bcb 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -176,10 +176,7 @@ func (b *multiBufferBuilder) Release() { debug.Assert(atomic.LoadInt64(&b.refCount) > 0, "too many releases") if atomic.AddInt64(&b.refCount, -1) == 0 { - for i, buf := range b.blocks { - buf.Release() - b.blocks[i] = nil - } + b.Reset() } } From 5dc1d51e4025716f78045d046dfaa9958e4d4506 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:56:38 -0400 Subject: [PATCH 23/37] Update go/arrow/datatype_stringheader.go Co-authored-by: Alex Shcherbakov --- go/arrow/datatype_stringheader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/datatype_stringheader.go b/go/arrow/datatype_stringheader.go index 6a8ad463b6d..053cc91a045 100644 --- a/go/arrow/datatype_stringheader.go +++ b/go/arrow/datatype_stringheader.go @@ -36,7 +36,7 @@ func IsStringHeaderInline(length int) bool { // StringHeader is a variable length string (utf8) or byte slice with // a 4 byte prefix and inline optimization for small values (12 bytes -// or fewer). This is similar to Go's standard string but limited by +// or fewer). This is similar to Go's standard string but limited by // a length of Uint32Max and up to the first four bytes of the string // are copied into the struct. 
This prefix allows failing comparisons // early and can reduce CPU cache working set when dealing with short From b620e454c2b18f7971814fb53d8ff7167777e846 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 11:57:59 -0400 Subject: [PATCH 24/37] updates from review feedback --- go/arrow/internal/arrjson/arrjson.go | 37 ++++++++++------------------ go/arrow/ipc/writer.go | 6 ++--- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 45695fb065d..89b7e525ec5 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -1087,22 +1087,10 @@ func arrayFromJSON(mem memory.Allocator, dt arrow.DataType, arr Array) arrow.Arr bldr.AppendValues(data, valids) return returnNewArrayData(bldr) - case *arrow.BinaryViewType: - valids := validsToBitmap(validsFromJSON(arr.Valids), mem) - nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) - headers := stringHeadersFromJSON(mem, true, arr.Data) - extraBufs := variadicBuffersFromJSON(arr.Variadic) - defer valids.Release() - defer headers.Release() - - return array.NewData(dt, arr.Count, - append([]*memory.Buffer{valids, headers}, extraBufs...), - nil, nulls, 0) - - case *arrow.StringViewType: + case arrow.BinaryViewDataType: valids := validsToBitmap(validsFromJSON(arr.Valids), mem) nulls := arr.Count - bitutil.CountSetBits(valids.Bytes(), 0, arr.Count) - headers := stringHeadersFromJSON(mem, false, arr.Data) + headers := stringHeadersFromJSON(mem, !dt.IsUtf8(), arr.Data) extraBufs := variadicBuffersFromJSON(arr.Variadic) defer valids.Release() defer headers.Release() @@ -2456,16 +2444,17 @@ func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} { Size: hdr.Len(), Inlined: &data, } - } else { - idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset()) - prefix := hdr.Prefix() - encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:])) - o[i] = 
StringHeader{ - Size: hdr.Len(), - Prefix: &encodedPrefix, - BufferIdx: &idx, - BufferOff: &off, - } + continue + } + + idx, off := int(hdr.BufferIndex()), int(hdr.BufferOffset()) + prefix := hdr.Prefix() + encodedPrefix := strings.ToUpper(hex.EncodeToString(prefix[:])) + o[i] = StringHeader{ + Size: hdr.Len(), + Prefix: &encodedPrefix, + BufferIdx: &idx, + BufferOff: &off, } } return o diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index f365f983ad8..ef1dd1c1dab 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -613,10 +613,10 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { switch { case needTruncate(int64(data.Offset()), values, minLength): // non-zero offset: slice the buffer - offset := int64(data.Offset()) * typeWidth + offset := data.Offset() * int(typeWidth) // send padding if available - len := minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len())-offset) - values = memory.NewBufferBytes(values.Bytes()[offset : offset+len]) + len := int(minI64(bitutil.CeilByte64(arrLen*typeWidth), int64(values.Len()-offset))) + values = memory.SliceBuffer(values, offset, len) default: if values != nil { values.Retain() From 0b10bed98d66aadf6e1080353295ed91a059fde1 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 22 Jun 2023 12:03:40 -0400 Subject: [PATCH 25/37] update gitattributes for generated go files --- .gitattributes | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitattributes b/.gitattributes index 69f4139c4e4..70007c26c8b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,6 +3,9 @@ cpp/src/generated/*.cpp linguist-generated=true cpp/src/generated/*.h linguist-generated=true go/**/*.s linguist-generated=true go/arrow/unionmode_string.go linguist-generated=true +go/arrow/internal/flatbuf/*.go linguist-generated=true +go/**/*.pb.go linguist-generated=true +go/parquet/internal/gen-go/parquet/*.go linguist-generated=true r/R/RcppExports.R linguist-generated=true r/R/arrowExports.R 
linguist-generated=true r/src/RcppExports.cpp linguist-generated=true From a009c4743d05533fd78f30d07ffd539f64d9deaf Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 25 Oct 2023 17:37:01 -0400 Subject: [PATCH 26/37] updates from merge --- go/arrow/datatype_stringheader.go | 6 +++--- go/arrow/datatype_stringheader_inline.go | 2 +- go/arrow/datatype_stringheader_inline_go1.19.go | 2 +- go/arrow/datatype_stringheader_inline_tinygo.go | 2 +- go/arrow/ipc/file_reader.go | 2 +- go/arrow/ipc/metadata.go | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go/arrow/datatype_stringheader.go b/go/arrow/datatype_stringheader.go index 053cc91a045..1fd829e5766 100644 --- a/go/arrow/datatype_stringheader.go +++ b/go/arrow/datatype_stringheader.go @@ -20,9 +20,9 @@ import ( "bytes" "unsafe" - "github.com/apache/arrow/go/v13/arrow/endian" - "github.com/apache/arrow/go/v13/arrow/internal/debug" - "github.com/apache/arrow/go/v13/arrow/memory" + "github.com/apache/arrow/go/v14/arrow/endian" + "github.com/apache/arrow/go/v14/arrow/internal/debug" + "github.com/apache/arrow/go/v14/arrow/memory" ) const ( diff --git a/go/arrow/datatype_stringheader_inline.go b/go/arrow/datatype_stringheader_inline.go index e33938f3a1c..5a208716dc9 100644 --- a/go/arrow/datatype_stringheader_inline.go +++ b/go/arrow/datatype_stringheader_inline.go @@ -21,7 +21,7 @@ package arrow import ( "unsafe" - "github.com/apache/arrow/go/v13/arrow/internal/debug" + "github.com/apache/arrow/go/v14/arrow/internal/debug" ) func (sh *StringHeader) InlineData() (data string) { diff --git a/go/arrow/datatype_stringheader_inline_go1.19.go b/go/arrow/datatype_stringheader_inline_go1.19.go index ab1cc64c138..e6bc318d275 100644 --- a/go/arrow/datatype_stringheader_inline_go1.19.go +++ b/go/arrow/datatype_stringheader_inline_go1.19.go @@ -22,7 +22,7 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v13/arrow/internal/debug" + "github.com/apache/arrow/go/v14/arrow/internal/debug" ) func (sh 
*StringHeader) InlineData() (data string) { diff --git a/go/arrow/datatype_stringheader_inline_tinygo.go b/go/arrow/datatype_stringheader_inline_tinygo.go index c5ab89da83f..05f4c555e7b 100644 --- a/go/arrow/datatype_stringheader_inline_tinygo.go +++ b/go/arrow/datatype_stringheader_inline_tinygo.go @@ -22,7 +22,7 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v13/arrow/internal/debug" + "github.com/apache/arrow/go/v14/arrow/internal/debug" ) func (sh *StringHeader) InlineData() (data string) { diff --git a/go/arrow/ipc/file_reader.go b/go/arrow/ipc/file_reader.go index 98c99d9fda3..d21d8fffeeb 100644 --- a/go/arrow/ipc/file_reader.go +++ b/go/arrow/ipc/file_reader.go @@ -431,7 +431,7 @@ func (src *ipcSource) fieldMetadata(i int) *flatbuf.FieldNode { } func (src *ipcSource) variadicCount(i int) int64 { - return src.meta.VariadicCounts(i) + return src.meta.VariadicBufferCounts(i) } type arrayLoaderContext struct { diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index 044681f081d..aa77c07f178 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -1210,7 +1210,7 @@ func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMe bodyCompressFB = writeBodyCompression(b, codec) } - flatbuf.RecordBatchStartVariadicCountsVector(b, len(variadicCounts)) + flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts)) for i := len(variadicCounts) - 1; i >= 0; i-- { b.PrependInt64(variadicCounts[i]) } @@ -1220,7 +1220,7 @@ func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMe flatbuf.RecordBatchAddLength(b, size) flatbuf.RecordBatchAddNodes(b, fieldsFB) flatbuf.RecordBatchAddBuffers(b, metaFB) - flatbuf.RecordBatchAddVariadicCounts(b, vcFB) + flatbuf.RecordBatchAddVariadicBufferCounts(b, vcFB) if codec != -1 { flatbuf.RecordBatchAddCompression(b, bodyCompressFB) } From bccebbe7d9eb9de927f7532302ca574f63495b66 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 26 Oct 2023 
11:47:22 -0400 Subject: [PATCH 27/37] rename and fix imports --- go/arrow/array/binary.go | 6 +-- go/arrow/array/binarybuilder.go | 10 ++-- go/arrow/array/bufferbuilder.go | 4 +- go/arrow/array/string.go | 4 +- go/arrow/datatype_stringheader.go | 66 ++++++++++++------------ go/arrow/datatype_stringheader_inline.go | 2 +- go/arrow/ipc/endian_swap.go | 3 ++ go/arrow/type_traits_string_view.go | 14 ++--- 8 files changed, 57 insertions(+), 52 deletions(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 7848c46c212..b1458f6425d 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -321,12 +321,12 @@ func arrayEqualLargeBinary(left, right *LargeBinary) bool { type ViewLike interface { arrow.Array - ValueHeader(int) *arrow.StringHeader + ValueHeader(int) *arrow.ViewHeader } type BinaryView struct { array - values []arrow.StringHeader + values []arrow.ViewHeader dataBuffers []*memory.Buffer } @@ -350,7 +350,7 @@ func (a *BinaryView) setData(data *Data) { a.dataBuffers = data.buffers[2:] } -func (a *BinaryView) ValueHeader(i int) *arrow.StringHeader { +func (a *BinaryView) ValueHeader(i int) *arrow.ViewHeader { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 9203716ee30..5fad6e54f5a 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -381,7 +381,7 @@ type BinaryViewBuilder struct { dtype arrow.BinaryDataType data *memory.Buffer - rawData []arrow.StringHeader + rawData []arrow.ViewHeader blockBuilder multiBufferBuilder } @@ -439,7 +439,7 @@ func (b *BinaryViewBuilder) Resize(n int) { b.init(n) return } - + b.builder.resize(nbuild, b.init) b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n)) b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) @@ -462,7 +462,7 @@ func (b *BinaryViewBuilder) Append(v []byte) { panic(fmt.Errorf("%w: BinaryView or StringView elements 
cannot reference strings larger than 4GB", arrow.ErrInvalid)) } - if !arrow.IsStringHeaderInline(len(v)) { + if !arrow.IsViewInline(len(v)) { b.ReserveData(len(v)) } @@ -532,7 +532,7 @@ func (b *BinaryViewBuilder) AppendValues(v [][]byte, valid []bool) { outOfLineTotal := 0 for i, vv := range v { if len(valid) == 0 || valid[i] { - if !arrow.IsStringHeaderInline(len(vv)) { + if !arrow.IsViewInline(len(vv)) { outOfLineTotal += len(vv) } } @@ -565,7 +565,7 @@ func (b *BinaryViewBuilder) AppendStringValues(v []string, valid []bool) { outOfLineTotal := 0 for i, vv := range v { if len(valid) == 0 || valid[i] { - if !arrow.IsStringHeaderInline(len(vv)) { + if !arrow.IsViewInline(len(vv)) { outOfLineTotal += len(vv) } } diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index e62dcc10bcb..0ce07548fb8 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -237,7 +237,7 @@ func (b *multiBufferBuilder) Reset() { b.blocks = nil } -func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.StringHeader, val []byte) { +func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.ViewHeader, val []byte) { buf := b.blocks[b.currentOutBuffer] idx, offset := b.currentOutBuffer, buf.Len() hdr.SetIndexOffset(uint32(idx), uint32(offset)) @@ -246,7 +246,7 @@ func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.StringHeader, val []byte) { buf.ResizeNoShrink(offset + n) } -func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.StringHeader, val string) { +func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.ViewHeader, val string) { // create a byte slice with zero-copies // in go1.20 this would be equivalent to unsafe.StringData v := *(*[]byte)(unsafe.Pointer(&struct { diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 3339a67f4f8..6b2620e2d16 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -317,7 +317,7 @@ func arrayEqualLargeString(left, right *LargeString) bool { type StringView struct { 
array - values []arrow.StringHeader + values []arrow.ViewHeader dataBuffers []*memory.Buffer } @@ -346,7 +346,7 @@ func (a *StringView) setData(data *Data) { a.dataBuffers = data.buffers[2:] } -func (a *StringView) ValueHeader(i int) *arrow.StringHeader { +func (a *StringView) ValueHeader(i int) *arrow.ViewHeader { if i < 0 || i >= a.array.data.length { panic("arrow/array: index out of range") } diff --git a/go/arrow/datatype_stringheader.go b/go/arrow/datatype_stringheader.go index 1fd829e5766..fc4dcbeb85c 100644 --- a/go/arrow/datatype_stringheader.go +++ b/go/arrow/datatype_stringheader.go @@ -26,15 +26,15 @@ import ( ) const ( - StringHeaderPrefixLen = 4 - stringHeaderInlineSize = 12 + StringViewPrefixLen = 4 + stringViewInlineSize = 12 ) -func IsStringHeaderInline(length int) bool { - return length < stringHeaderInlineSize +func IsViewInline(length int) bool { + return length < stringViewInlineSize } -// StringHeader is a variable length string (utf8) or byte slice with +// ViewHeader is a variable length string (utf8) or byte slice with // a 4 byte prefix and inline optimization for small values (12 bytes // or fewer). This is similar to Go's standard string but limited by // a length of Uint32Max and up to the first four bytes of the string @@ -44,20 +44,22 @@ func IsStringHeaderInline(length int) bool { // // There are two situations: // -// Short string |----|----|--------| -// ^ ^ ^ -// | | | -// size prefix remaining in-line portion, zero padded +// Entirely inlined string data +// |----|------------| +// ^ ^ +// | | +// size inline string data, zero padded // -// IO Long String |----|----|----|----| -// ^ ^ ^ ^ -// | | | | -// size prefix buffer index and offset to out-of-line portion +// Reference into buffer +// |----|----|----|----| +// ^ ^ ^ ^ +// | | | | +// size prefix buffer index and offset to out-of-line portion // // Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. 
// // [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf -type StringHeader struct { +type ViewHeader struct { size uint32 // the first 4 bytes of this are the prefix for the string // if size <= StringHeaderInlineSize, then the entire string @@ -65,32 +67,32 @@ type StringHeader struct { // if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32 // values which are the buffer index and offset in that buffer // containing the full string. - data [stringHeaderInlineSize]byte + data [stringViewInlineSize]byte } -func (sh *StringHeader) IsInline() bool { - return sh.size <= uint32(stringHeaderInlineSize) +func (sh *ViewHeader) IsInline() bool { + return sh.size <= uint32(stringViewInlineSize) } -func (sh *StringHeader) Len() int { return int(sh.size) } -func (sh *StringHeader) Prefix() [StringHeaderPrefixLen]byte { +func (sh *ViewHeader) Len() int { return int(sh.size) } +func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte { return *(*[4]byte)(unsafe.Pointer(&sh.data)) } -func (sh *StringHeader) BufferIndex() uint32 { - return endian.Native.Uint32(sh.data[StringHeaderPrefixLen:]) +func (sh *ViewHeader) BufferIndex() uint32 { + return endian.Native.Uint32(sh.data[StringViewPrefixLen:]) } -func (sh *StringHeader) BufferOffset() uint32 { - return endian.Native.Uint32(sh.data[StringHeaderPrefixLen+4:]) +func (sh *ViewHeader) BufferOffset() uint32 { + return endian.Native.Uint32(sh.data[StringViewPrefixLen+4:]) } -func (sh *StringHeader) InlineBytes() (data []byte) { +func (sh *ViewHeader) InlineBytes() (data []byte) { debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader") return sh.data[:sh.size] } -func (sh *StringHeader) SetBytes(data []byte) int { +func (sh *ViewHeader) SetBytes(data []byte) int { sh.size = uint32(len(data)) if sh.IsInline() { return copy(sh.data[:], data) @@ -98,7 +100,7 @@ func (sh *StringHeader) SetBytes(data []byte) int { return copy(sh.data[:4], data) } -func (sh *StringHeader) SetString(data 
string) int { +func (sh *ViewHeader) SetString(data string) int { sh.size = uint32(len(data)) if sh.IsInline() { return copy(sh.data[:], data) @@ -106,12 +108,12 @@ func (sh *StringHeader) SetString(data string) int { return copy(sh.data[:4], data) } -func (sh *StringHeader) SetIndexOffset(bufferIndex, offset uint32) { - endian.Native.PutUint32(sh.data[StringHeaderPrefixLen:], bufferIndex) - endian.Native.PutUint32(sh.data[StringHeaderPrefixLen+4:], offset) +func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset uint32) { + endian.Native.PutUint32(sh.data[StringViewPrefixLen:], bufferIndex) + endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], offset) } -func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, otherBuffers []*memory.Buffer) bool { +func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool { if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() { return false } @@ -125,12 +127,12 @@ func (sh *StringHeader) Equals(buffers []*memory.Buffer, other *StringHeader, ot return bytes.Equal(data, otherData) } -func (sh *StringHeader) inlinedAsInt64() int64 { +func (sh *ViewHeader) inlinedAsInt64() int64 { s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) return s[1] } -func (sh *StringHeader) sizeAndPrefixAsInt() int64 { +func (sh *ViewHeader) sizeAndPrefixAsInt() int64 { s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) return s[0] } diff --git a/go/arrow/datatype_stringheader_inline.go b/go/arrow/datatype_stringheader_inline.go index 5a208716dc9..db6b8c7e05c 100644 --- a/go/arrow/datatype_stringheader_inline.go +++ b/go/arrow/datatype_stringheader_inline.go @@ -24,7 +24,7 @@ import ( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *StringHeader) InlineData() (data string) { +func (sh *ViewHeader) InlineData() (data string) { debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") return unsafe.String((*byte)(unsafe.Pointer(&sh.data)), 
sh.size) diff --git a/go/arrow/ipc/endian_swap.go b/go/arrow/ipc/endian_swap.go index d98fec1089f..06bcbc77be3 100644 --- a/go/arrow/ipc/endian_swap.go +++ b/go/arrow/ipc/endian_swap.go @@ -18,6 +18,7 @@ package ipc import ( "errors" + "fmt" "math/bits" "github.com/apache/arrow/go/v14/arrow" @@ -120,6 +121,8 @@ func swapType(dt arrow.DataType, data *array.Data) (err error) { case arrow.FixedWidthDataType: byteSwapBuffer(dt.BitWidth(), data.Buffers()[1]) } + + err = fmt.Errorf("%w: swapping endianness of %s", arrow.ErrNotImplemented, dt) return } diff --git a/go/arrow/type_traits_string_view.go b/go/arrow/type_traits_string_view.go index 1006dfb2eb0..a8e1d2f36b9 100644 --- a/go/arrow/type_traits_string_view.go +++ b/go/arrow/type_traits_string_view.go @@ -20,34 +20,34 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v13/arrow/endian" + "github.com/apache/arrow/go/v14/arrow/endian" ) var StringHeaderTraits stringHeaderTraits const ( - StringHeaderSizeBytes = int(unsafe.Sizeof(StringHeader{})) + StringHeaderSizeBytes = int(unsafe.Sizeof(ViewHeader{})) ) type stringHeaderTraits struct{} func (stringHeaderTraits) BytesRequired(n int) int { return StringHeaderSizeBytes * n } -func (stringHeaderTraits) PutValue(b []byte, v StringHeader) { +func (stringHeaderTraits) PutValue(b []byte, v ViewHeader) { endian.Native.PutUint32(b, v.size) copy(b[4:], v.data[:]) } -func (stringHeaderTraits) CastFromBytes(b []byte) (res []StringHeader) { +func (stringHeaderTraits) CastFromBytes(b []byte) (res []ViewHeader) { h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - return unsafe.Slice((*StringHeader)(unsafe.Pointer(h.Data)), cap(b)/StringHeaderSizeBytes)[:len(b)/StringHeaderSizeBytes] + return unsafe.Slice((*ViewHeader)(unsafe.Pointer(h.Data)), cap(b)/StringHeaderSizeBytes)[:len(b)/StringHeaderSizeBytes] } -func (stringHeaderTraits) CastToBytes(b []StringHeader) (res []byte) { +func (stringHeaderTraits) CastToBytes(b []ViewHeader) (res []byte) { h := 
(*reflect.SliceHeader)(unsafe.Pointer(&b)) return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*StringHeaderSizeBytes)[:len(b)*StringHeaderSizeBytes] } -func (stringHeaderTraits) Copy(dst, src []StringHeader) { copy(dst, src) } +func (stringHeaderTraits) Copy(dst, src []ViewHeader) { copy(dst, src) } From 1792a98fee32674b05b20c0f9fd875582d1ae2f2 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 26 Oct 2023 11:54:30 -0400 Subject: [PATCH 28/37] implement rename --- go/arrow/{datatype_stringheader.go => datatype_viewheader.go} | 0 ...e_stringheader_inline.go => datatype_viewheader_inline.go} | 0 ..._inline_go1.19.go => datatype_viewheader_inline_go1.19.go} | 4 ++-- ..._inline_tinygo.go => datatype_viewheader_inline_tinygo.go} | 4 ++-- 4 files changed, 4 insertions(+), 4 deletions(-) rename go/arrow/{datatype_stringheader.go => datatype_viewheader.go} (100%) rename go/arrow/{datatype_stringheader_inline.go => datatype_viewheader_inline.go} (100%) rename go/arrow/{datatype_stringheader_inline_go1.19.go => datatype_viewheader_inline_go1.19.go} (89%) rename go/arrow/{datatype_stringheader_inline_tinygo.go => datatype_viewheader_inline_tinygo.go} (89%) diff --git a/go/arrow/datatype_stringheader.go b/go/arrow/datatype_viewheader.go similarity index 100% rename from go/arrow/datatype_stringheader.go rename to go/arrow/datatype_viewheader.go diff --git a/go/arrow/datatype_stringheader_inline.go b/go/arrow/datatype_viewheader_inline.go similarity index 100% rename from go/arrow/datatype_stringheader_inline.go rename to go/arrow/datatype_viewheader_inline.go diff --git a/go/arrow/datatype_stringheader_inline_go1.19.go b/go/arrow/datatype_viewheader_inline_go1.19.go similarity index 89% rename from go/arrow/datatype_stringheader_inline_go1.19.go rename to go/arrow/datatype_viewheader_inline_go1.19.go index e6bc318d275..ba548f9bd14 100644 --- a/go/arrow/datatype_stringheader_inline_go1.19.go +++ b/go/arrow/datatype_viewheader_inline_go1.19.go @@ -25,8 +25,8 @@ import 
( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *StringHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") +func (sh *ViewHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline ViewHeader") h := (*reflect.StringHeader)(unsafe.Pointer(&data)) h.Data = uintptr(unsafe.Pointer(&sh.data)) diff --git a/go/arrow/datatype_stringheader_inline_tinygo.go b/go/arrow/datatype_viewheader_inline_tinygo.go similarity index 89% rename from go/arrow/datatype_stringheader_inline_tinygo.go rename to go/arrow/datatype_viewheader_inline_tinygo.go index 05f4c555e7b..92a536224b6 100644 --- a/go/arrow/datatype_stringheader_inline_tinygo.go +++ b/go/arrow/datatype_viewheader_inline_tinygo.go @@ -25,8 +25,8 @@ import ( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *StringHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") +func (sh *ViewHeader) InlineData() (data string) { + debug.Assert(sh.IsInline(), "calling InlineData on non-inline ViewHeader") h := (*reflect.StringHeader)(unsafe.Pointer(&data)) h.Data = uintptr(unsafe.Pointer(&sh.data)) From 5cfc237a4cf5e091165273f1cdbaca3577dd25fb Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Thu, 26 Oct 2023 12:05:26 -0400 Subject: [PATCH 29/37] fix endian swap default --- go/arrow/ipc/endian_swap.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/go/arrow/ipc/endian_swap.go b/go/arrow/ipc/endian_swap.go index 06bcbc77be3..6cfa748122e 100644 --- a/go/arrow/ipc/endian_swap.go +++ b/go/arrow/ipc/endian_swap.go @@ -120,9 +120,10 @@ func swapType(dt arrow.DataType, data *array.Data) (err error) { return swapType(dt.IndexType, data) case arrow.FixedWidthDataType: byteSwapBuffer(dt.BitWidth(), data.Buffers()[1]) + default: + err = fmt.Errorf("%w: swapping endianness of %s", arrow.ErrNotImplemented, dt) } - err = 
fmt.Errorf("%w: swapping endianness of %s", arrow.ErrNotImplemented, dt) return } From 829a850d49262366577ccdde2656cc212ed54eca Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 27 Oct 2023 12:42:47 -0400 Subject: [PATCH 30/37] updates from round of feedback --- go/arrow/array/binary.go | 2 +- go/arrow/array/binarybuilder.go | 18 +++++++++++------- go/arrow/array/bufferbuilder.go | 2 +- go/arrow/array/concat.go | 2 +- go/arrow/array/string.go | 2 +- go/arrow/datatype_viewheader.go | 22 +++++++++++----------- go/arrow/internal/arrjson/arrjson.go | 2 +- go/arrow/type_traits_string_view.go | 2 +- 8 files changed, 28 insertions(+), 24 deletions(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index b1458f6425d..3f75afd5840 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -364,7 +364,7 @@ func (a *BinaryView) Value(i int) []byte { } start := s.BufferOffset() buf := a.dataBuffers[s.BufferIndex()] - return buf.Bytes()[start : start+uint32(s.Len())] + return buf.Bytes()[start : start+int32(s.Len())] } // ValueString returns the value at index i as a string instead of diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index 5fad6e54f5a..bec03b96be1 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -372,8 +372,8 @@ func (b *BinaryBuilder) UnmarshalJSON(data []byte) error { } const ( - dfltBlockSize = 1 << 20 // 1 MB - viewValueSizeLimit uint32 = math.MaxUint32 + dfltBlockSize = 32 << 10 // 32 KB + viewValueSizeLimit int32 = math.MaxInt32 ) type BinaryViewBuilder struct { @@ -401,6 +401,10 @@ func NewBinaryViewBuilder(mem memory.Allocator) *BinaryViewBuilder { } } +func (b *BinaryViewBuilder) SetBlockSize(sz uint) { + b.blockBuilder.blockSize = int(sz) +} + func (b *BinaryViewBuilder) Type() arrow.DataType { return b.dtype } func (b *BinaryViewBuilder) Release() { @@ -446,8 +450,8 @@ func (b *BinaryViewBuilder) Resize(n int) { } func (b *BinaryViewBuilder) 
ReserveData(length int) { - if uint32(length) > viewValueSizeLimit { - panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", + if int32(length) > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB", arrow.ErrInvalid)) } b.blockBuilder.Reserve(int(length)) @@ -458,8 +462,8 @@ func (b *BinaryViewBuilder) Reserve(n int) { } func (b *BinaryViewBuilder) Append(v []byte) { - if uint32(len(v)) > viewValueSizeLimit { - panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 4GB", arrow.ErrInvalid)) + if int32(len(v)) > viewValueSizeLimit { + panic(fmt.Errorf("%w: BinaryView or StringView elements cannot reference strings larger than 2GB", arrow.ErrInvalid)) } if !arrow.IsViewInline(len(v)) { @@ -476,7 +480,7 @@ func (b *BinaryViewBuilder) Append(v []byte) { // // This is different than AppendValueFromString which exists for the // Builder interface, in that this expects raw binary data which is -// appended as such. AppendValueFromString expects base64 encoded binary +// appended unmodified. AppendValueFromString expects base64 encoded binary // data instead. 
func (b *BinaryViewBuilder) AppendString(v string) { // create a []byte without copying the bytes diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index 0ce07548fb8..9fabbf82924 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -240,7 +240,7 @@ func (b *multiBufferBuilder) Reset() { func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.ViewHeader, val []byte) { buf := b.blocks[b.currentOutBuffer] idx, offset := b.currentOutBuffer, buf.Len() - hdr.SetIndexOffset(uint32(idx), uint32(offset)) + hdr.SetIndexOffset(int32(idx), int32(offset)) n := copy(buf.Buf()[offset:], val) buf.ResizeNoShrink(offset + n) diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index 28ff9ef83a6..bcae53c4898 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -625,7 +625,7 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, continue } - bufIndex := s[i].BufferIndex() + uint32(precedingBufsCount) + bufIndex := s[i].BufferIndex() + int32(precedingBufsCount) s[i].SetIndexOffset(bufIndex, s[i].BufferOffset()) } } diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 6b2620e2d16..bd55e4ccd55 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -360,7 +360,7 @@ func (a *StringView) Value(i int) string { } start := s.BufferOffset() buf := a.dataBuffers[s.BufferIndex()] - value := buf.Bytes()[start : start+uint32(s.Len())] + value := buf.Bytes()[start : start+int32(s.Len())] return *(*string)(unsafe.Pointer(&value)) } diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go index fc4dcbeb85c..df75e512ec2 100644 --- a/go/arrow/datatype_viewheader.go +++ b/go/arrow/datatype_viewheader.go @@ -60,7 +60,7 @@ func IsViewInline(length int) bool { // // [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf type ViewHeader struct { - size uint32 + size int32 // the first 4 bytes of this are the prefix for the 
string // if size <= StringHeaderInlineSize, then the entire string // is in the data array and is zero padded. @@ -71,7 +71,7 @@ type ViewHeader struct { } func (sh *ViewHeader) IsInline() bool { - return sh.size <= uint32(stringViewInlineSize) + return sh.size <= int32(stringViewInlineSize) } func (sh *ViewHeader) Len() int { return int(sh.size) } @@ -79,12 +79,12 @@ func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte { return *(*[4]byte)(unsafe.Pointer(&sh.data)) } -func (sh *ViewHeader) BufferIndex() uint32 { - return endian.Native.Uint32(sh.data[StringViewPrefixLen:]) +func (sh *ViewHeader) BufferIndex() int32 { + return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen:])) } -func (sh *ViewHeader) BufferOffset() uint32 { - return endian.Native.Uint32(sh.data[StringViewPrefixLen+4:]) +func (sh *ViewHeader) BufferOffset() int32 { + return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen+4:])) } func (sh *ViewHeader) InlineBytes() (data []byte) { @@ -93,7 +93,7 @@ func (sh *ViewHeader) InlineBytes() (data []byte) { } func (sh *ViewHeader) SetBytes(data []byte) int { - sh.size = uint32(len(data)) + sh.size = int32(len(data)) if sh.IsInline() { return copy(sh.data[:], data) } @@ -101,16 +101,16 @@ func (sh *ViewHeader) SetBytes(data []byte) int { } func (sh *ViewHeader) SetString(data string) int { - sh.size = uint32(len(data)) + sh.size = int32(len(data)) if sh.IsInline() { return copy(sh.data[:], data) } return copy(sh.data[:4], data) } -func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset uint32) { - endian.Native.PutUint32(sh.data[StringViewPrefixLen:], bufferIndex) - endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], offset) +func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) { + endian.Native.PutUint32(sh.data[StringViewPrefixLen:], uint32(bufferIndex)) + endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], uint32(offset)) } func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers 
[]*memory.Buffer) bool { diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 89b7e525ec5..26acde534b4 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -2405,7 +2405,7 @@ func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface panic(err) } - values[i].SetIndexOffset(uint32(bufIdx), uint32(bufOffset)) + values[i].SetIndexOffset(int32(bufIdx), int32(bufOffset)) prefix, err := hex.DecodeString(v["PREFIX"].(string)) if err != nil { panic(err) diff --git a/go/arrow/type_traits_string_view.go b/go/arrow/type_traits_string_view.go index a8e1d2f36b9..a37082f2586 100644 --- a/go/arrow/type_traits_string_view.go +++ b/go/arrow/type_traits_string_view.go @@ -34,7 +34,7 @@ type stringHeaderTraits struct{} func (stringHeaderTraits) BytesRequired(n int) int { return StringHeaderSizeBytes * n } func (stringHeaderTraits) PutValue(b []byte, v ViewHeader) { - endian.Native.PutUint32(b, v.size) + endian.Native.PutUint32(b, uint32(v.size)) copy(b[4:], v.data[:]) } From a5609176d833a83cab46a7b80fc48deef5ff7d37 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Fri, 27 Oct 2023 16:11:58 -0400 Subject: [PATCH 31/37] rename to InlineString --- go/arrow/array/string.go | 2 +- go/arrow/datatype_viewheader_inline.go | 4 ++-- go/arrow/datatype_viewheader_inline_go1.19.go | 4 ++-- go/arrow/datatype_viewheader_inline_tinygo.go | 4 ++-- go/arrow/internal/arrjson/arrjson.go | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index bd55e4ccd55..1c98573e388 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -356,7 +356,7 @@ func (a *StringView) ValueHeader(i int) *arrow.ViewHeader { func (a *StringView) Value(i int) string { s := a.ValueHeader(i) if s.IsInline() { - return s.InlineData() + return s.InlineString() } start := s.BufferOffset() buf := a.dataBuffers[s.BufferIndex()] diff --git 
a/go/arrow/datatype_viewheader_inline.go b/go/arrow/datatype_viewheader_inline.go index db6b8c7e05c..e618916a88b 100644 --- a/go/arrow/datatype_viewheader_inline.go +++ b/go/arrow/datatype_viewheader_inline.go @@ -24,8 +24,8 @@ import ( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *ViewHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline StringHeader") +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") return unsafe.String((*byte)(unsafe.Pointer(&sh.data)), sh.size) } diff --git a/go/arrow/datatype_viewheader_inline_go1.19.go b/go/arrow/datatype_viewheader_inline_go1.19.go index ba548f9bd14..26a12fd47c6 100644 --- a/go/arrow/datatype_viewheader_inline_go1.19.go +++ b/go/arrow/datatype_viewheader_inline_go1.19.go @@ -25,8 +25,8 @@ import ( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *ViewHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline ViewHeader") +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") h := (*reflect.StringHeader)(unsafe.Pointer(&data)) h.Data = uintptr(unsafe.Pointer(&sh.data)) diff --git a/go/arrow/datatype_viewheader_inline_tinygo.go b/go/arrow/datatype_viewheader_inline_tinygo.go index 92a536224b6..c830c1106a9 100644 --- a/go/arrow/datatype_viewheader_inline_tinygo.go +++ b/go/arrow/datatype_viewheader_inline_tinygo.go @@ -25,8 +25,8 @@ import ( "github.com/apache/arrow/go/v14/arrow/internal/debug" ) -func (sh *ViewHeader) InlineData() (data string) { - debug.Assert(sh.IsInline(), "calling InlineData on non-inline ViewHeader") +func (sh *ViewHeader) InlineString() (data string) { + debug.Assert(sh.IsInline(), "calling InlineString on non-inline ViewHeader") h := (*reflect.StringHeader)(unsafe.Pointer(&data)) h.Data = 
uintptr(unsafe.Pointer(&sh.data)) diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 26acde534b4..9e479afba3a 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ b/go/arrow/internal/arrjson/arrjson.go @@ -2436,7 +2436,7 @@ func stringHeadersToJSON(arr array.ViewLike, isBinary bool) []interface{} { for i := range o { hdr := arr.ValueHeader(i) if hdr.IsInline() { - data := hdr.InlineData() + data := hdr.InlineString() if isBinary { data = strings.ToUpper(hex.EncodeToString(hdr.InlineBytes())) } From a84ee2edf3e6a6f1c4f2f7829b6d151c61df973a Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 30 Oct 2023 12:01:57 -0400 Subject: [PATCH 32/37] Update go/arrow/datatype.go Co-authored-by: Alex Shcherbakov --- go/arrow/datatype.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/go/arrow/datatype.go b/go/arrow/datatype.go index 4c57e9fe595..bbb68fa8bc2 100644 --- a/go/arrow/datatype.go +++ b/go/arrow/datatype.go @@ -277,9 +277,7 @@ func (b BufferSpec) Equals(other BufferSpec) bool { type DataTypeLayout struct { Buffers []BufferSpec HasDict bool - // if this is non-nil, the number of buffers expected is only - // lower-bounded by len(buffers). Buffers beyond this lower bound - // are expected to conform to this variadic spec. + // VariadicSpec is what the buffers beyond len(Buffers) are expected to conform to. 
VariadicSpec *BufferSpec } From be3b3a539b9c8f89dc23ae51e35c8b57250c5022 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 30 Oct 2023 12:22:17 -0400 Subject: [PATCH 33/37] updates from feedback --- go/arrow/array/binary.go | 2 +- go/arrow/array/binarybuilder.go | 10 +++++----- go/arrow/array/bufferbuilder.go | 7 ++----- go/arrow/array/concat.go | 4 ++-- go/arrow/array/string.go | 2 +- go/arrow/compute/executor.go | 2 +- go/arrow/datatype_binary.go | 4 ++-- go/arrow/datatype_viewheader.go | 13 +++++++----- go/arrow/internal/arrjson/arrjson.go | 4 ++-- go/arrow/ipc/metadata.go | 17 +++++++++++----- go/arrow/ipc/writer.go | 2 +- ...its_string_view.go => type_traits_view.go} | 20 +++++++++---------- 12 files changed, 47 insertions(+), 40 deletions(-) rename go/arrow/{type_traits_string_view.go => type_traits_view.go} (63%) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 3f75afd5840..5eef11a810c 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -344,7 +344,7 @@ func (a *BinaryView) setData(data *Data) { a.array.setData(data) if valueData := data.buffers[1]; valueData != nil { - a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes()) + a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) } a.dataBuffers = data.buffers[2:] diff --git a/go/arrow/array/binarybuilder.go b/go/arrow/array/binarybuilder.go index bec03b96be1..93dabca1e61 100644 --- a/go/arrow/array/binarybuilder.go +++ b/go/arrow/array/binarybuilder.go @@ -428,9 +428,9 @@ func (b *BinaryViewBuilder) Release() { func (b *BinaryViewBuilder) init(capacity int) { b.builder.init(capacity) b.data = memory.NewResizableBuffer(b.mem) - bytesN := arrow.StringHeaderTraits.BytesRequired(capacity) + bytesN := arrow.ViewHeaderTraits.BytesRequired(capacity) b.data.Resize(bytesN) - b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) + b.rawData = arrow.ViewHeaderTraits.CastFromBytes(b.data.Bytes()) } func (b *BinaryViewBuilder) Resize(n 
int) { @@ -445,8 +445,8 @@ func (b *BinaryViewBuilder) Resize(n int) { } b.builder.resize(nbuild, b.init) - b.data.Resize(arrow.StringHeaderTraits.BytesRequired(n)) - b.rawData = arrow.StringHeaderTraits.CastFromBytes(b.data.Bytes()) + b.data.Resize(arrow.ViewHeaderTraits.BytesRequired(n)) + b.rawData = arrow.ViewHeaderTraits.CastFromBytes(b.data.Bytes()) } func (b *BinaryViewBuilder) ReserveData(length int) { @@ -665,7 +665,7 @@ func (b *BinaryViewBuilder) UnmarshalJSON(data []byte) error { } func (b *BinaryViewBuilder) newData() (data *Data) { - bytesRequired := arrow.StringHeaderTraits.BytesRequired(b.length) + bytesRequired := arrow.ViewHeaderTraits.BytesRequired(b.length) if bytesRequired > 0 && bytesRequired < b.data.Len() { // trim buffers b.data.Resize(bytesRequired) diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index 9fabbf82924..7086a1bba3b 100644 --- a/go/arrow/array/bufferbuilder.go +++ b/go/arrow/array/bufferbuilder.go @@ -230,11 +230,9 @@ func (b *multiBufferBuilder) RemainingBytes() int { func (b *multiBufferBuilder) Reset() { b.currentOutBuffer = 0 - for i, block := range b.blocks { + for _, block := range b.Finish() { block.Release() - b.blocks[i] = nil } - b.blocks = nil } func (b *multiBufferBuilder) UnsafeAppend(hdr *arrow.ViewHeader, val []byte) { @@ -258,7 +256,6 @@ func (b *multiBufferBuilder) UnsafeAppendString(hdr *arrow.ViewHeader, val strin func (b *multiBufferBuilder) Finish() (out []*memory.Buffer) { b.currentOutBuffer = 0 - out = b.blocks - b.blocks = nil + out, b.blocks = b.blocks, nil return } diff --git a/go/arrow/array/concat.go b/go/arrow/array/concat.go index bcae53c4898..1fe55be4253 100644 --- a/go/arrow/array/concat.go +++ b/go/arrow/array/concat.go @@ -609,10 +609,10 @@ func concat(data []arrow.ArrayData, mem memory.Allocator) (arr arrow.ArrayData, } } - out.buffers[1] = concatBuffers(gatherFixedBuffers(data, 1, arrow.StringHeaderSizeBytes), mem) + out.buffers[1] = 
concatBuffers(gatherFixedBuffers(data, 1, arrow.ViewHeaderSizeBytes), mem) var ( - s = arrow.StringHeaderTraits.CastFromBytes(out.buffers[1].Bytes()) + s = arrow.ViewHeaderTraits.CastFromBytes(out.buffers[1].Bytes()) i = data[0].Len() precedingBufsCount int ) diff --git a/go/arrow/array/string.go b/go/arrow/array/string.go index 1c98573e388..9aed2843656 100644 --- a/go/arrow/array/string.go +++ b/go/arrow/array/string.go @@ -340,7 +340,7 @@ func (a *StringView) setData(data *Data) { a.array.setData(data) if valueData := data.buffers[1]; valueData != nil { - a.values = arrow.StringHeaderTraits.CastFromBytes(valueData.Bytes()) + a.values = arrow.ViewHeaderTraits.CastFromBytes(valueData.Bytes()) } a.dataBuffers = data.buffers[2:] diff --git a/go/arrow/compute/executor.go b/go/arrow/compute/executor.go index 2aae50b9d02..0e7a3280eb9 100644 --- a/go/arrow/compute/executor.go +++ b/go/arrow/compute/executor.go @@ -172,7 +172,7 @@ func addComputeDataPrealloc(dt arrow.DataType, widths []bufferPrealloc) []buffer case arrow.LARGE_BINARY, arrow.LARGE_STRING, arrow.LARGE_LIST: return append(widths, bufferPrealloc{bitWidth: 64, addLen: 1}) case arrow.STRING_VIEW, arrow.BINARY_VIEW: - return append(widths, bufferPrealloc{bitWidth: arrow.StringHeaderSizeBytes * 8}) + return append(widths, bufferPrealloc{bitWidth: arrow.ViewHeaderSizeBytes * 8}) } return widths } diff --git a/go/arrow/datatype_binary.go b/go/arrow/datatype_binary.go index cb3c3e2ed87..f3e601f08ec 100644 --- a/go/arrow/datatype_binary.go +++ b/go/arrow/datatype_binary.go @@ -95,7 +95,7 @@ func (t *BinaryViewType) Fingerprint() string { return typeFingerprint(t) } func (*BinaryViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() return DataTypeLayout{ - Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)}, + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)}, VariadicSpec: &variadic, } } @@ -112,7 +112,7 @@ func (t *StringViewType) Fingerprint() string { 
return typeFingerprint(t) } func (*StringViewType) Layout() DataTypeLayout { variadic := SpecVariableWidth() return DataTypeLayout{ - Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(StringHeaderSizeBytes)}, + Buffers: []BufferSpec{SpecBitmap(), SpecFixedWidth(ViewHeaderSizeBytes)}, VariadicSpec: &variadic, } } diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go index df75e512ec2..0f21e8da0fe 100644 --- a/go/arrow/datatype_viewheader.go +++ b/go/arrow/datatype_viewheader.go @@ -114,7 +114,7 @@ func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) { } func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool { - if sh.sizeAndPrefixAsInt() != other.sizeAndPrefixAsInt() { + if sh.sizeAndPrefixAsInt64() != other.sizeAndPrefixAsInt64() { return false } @@ -122,9 +122,12 @@ func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherB return sh.inlinedAsInt64() == other.inlinedAsInt64() } - data := buffers[sh.BufferIndex()].Bytes()[sh.BufferOffset() : sh.BufferOffset()+sh.size] - otherData := otherBuffers[other.BufferIndex()].Bytes()[other.BufferOffset() : other.BufferOffset()+other.size] - return bytes.Equal(data, otherData) + return bytes.Equal(sh.getBufferBytes(buffers), other.getBufferBytes(otherBuffers)) +} + +func (sh *ViewHeader) getBufferBytes(buffers []*memory.Buffer) []byte { + offset := sh.BufferOffset() + return buffers[sh.BufferIndex()].Bytes()[offset : offset+sh.size] } func (sh *ViewHeader) inlinedAsInt64() int64 { @@ -132,7 +135,7 @@ func (sh *ViewHeader) inlinedAsInt64() int64 { return s[1] } -func (sh *ViewHeader) sizeAndPrefixAsInt() int64 { +func (sh *ViewHeader) sizeAndPrefixAsInt64() int64 { s := unsafe.Slice((*int64)(unsafe.Pointer(sh)), 2) return s[0] } diff --git a/go/arrow/internal/arrjson/arrjson.go b/go/arrow/internal/arrjson/arrjson.go index 9e479afba3a..1610109e298 100644 --- a/go/arrow/internal/arrjson/arrjson.go +++ 
b/go/arrow/internal/arrjson/arrjson.go @@ -2372,9 +2372,9 @@ func variadicBuffersToJSON(bufs []*memory.Buffer) []string { func stringHeadersFromJSON(mem memory.Allocator, isBinary bool, data []interface{}) *memory.Buffer { buf := memory.NewResizableBuffer(mem) - buf.Resize(arrow.StringHeaderTraits.BytesRequired(len(data))) + buf.Resize(arrow.ViewHeaderTraits.BytesRequired(len(data))) - values := arrow.StringHeaderTraits.CastFromBytes(buf.Bytes()) + values := arrow.ViewHeaderTraits.CastFromBytes(buf.Bytes()) for i, d := range data { switch v := d.(type) { diff --git a/go/arrow/ipc/metadata.go b/go/arrow/ipc/metadata.go index aa77c07f178..c69664a1a91 100644 --- a/go/arrow/ipc/metadata.go +++ b/go/arrow/ipc/metadata.go @@ -1210,17 +1210,24 @@ func recordToFB(b *flatbuffers.Builder, size, bodyLength int64, fields []fieldMe bodyCompressFB = writeBodyCompression(b, codec) } - flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts)) - for i := len(variadicCounts) - 1; i >= 0; i-- { - b.PrependInt64(variadicCounts[i]) + var vcFB *flatbuffers.UOffsetT + if len(variadicCounts) > 0 { + flatbuf.RecordBatchStartVariadicBufferCountsVector(b, len(variadicCounts)) + for i := len(variadicCounts) - 1; i >= 0; i-- { + b.PrependInt64(variadicCounts[i]) + } + vcFBVal := b.EndVector(len(variadicCounts)) + vcFB = &vcFBVal } - vcFB := b.EndVector(len(variadicCounts)) flatbuf.RecordBatchStart(b) flatbuf.RecordBatchAddLength(b, size) flatbuf.RecordBatchAddNodes(b, fieldsFB) flatbuf.RecordBatchAddBuffers(b, metaFB) - flatbuf.RecordBatchAddVariadicBufferCounts(b, vcFB) + if vcFB != nil { + flatbuf.RecordBatchAddVariadicBufferCounts(b, *vcFB) + } + if codec != -1 { flatbuf.RecordBatchAddCompression(b, bodyCompressFB) } diff --git a/go/arrow/ipc/writer.go b/go/arrow/ipc/writer.go index ef1dd1c1dab..75f657610ee 100644 --- a/go/arrow/ipc/writer.go +++ b/go/arrow/ipc/writer.go @@ -607,7 +607,7 @@ func (w *recordEncoder) visit(p *Payload, arr arrow.Array) error { data := 
arr.Data() values := data.Buffers()[1] arrLen := int64(arr.Len()) - typeWidth := int64(arrow.StringHeaderSizeBytes) + typeWidth := int64(arrow.ViewHeaderSizeBytes) minLength := paddedLength(arrLen*typeWidth, kArrowAlignment) switch { diff --git a/go/arrow/type_traits_string_view.go b/go/arrow/type_traits_view.go similarity index 63% rename from go/arrow/type_traits_string_view.go rename to go/arrow/type_traits_view.go index a37082f2586..972c3e2c35a 100644 --- a/go/arrow/type_traits_string_view.go +++ b/go/arrow/type_traits_view.go @@ -23,31 +23,31 @@ import ( "github.com/apache/arrow/go/v14/arrow/endian" ) -var StringHeaderTraits stringHeaderTraits +var ViewHeaderTraits viewHeaderTraits const ( - StringHeaderSizeBytes = int(unsafe.Sizeof(ViewHeader{})) + ViewHeaderSizeBytes = int(unsafe.Sizeof(ViewHeader{})) ) -type stringHeaderTraits struct{} +type viewHeaderTraits struct{} -func (stringHeaderTraits) BytesRequired(n int) int { return StringHeaderSizeBytes * n } +func (viewHeaderTraits) BytesRequired(n int) int { return ViewHeaderSizeBytes * n } -func (stringHeaderTraits) PutValue(b []byte, v ViewHeader) { +func (viewHeaderTraits) PutValue(b []byte, v ViewHeader) { endian.Native.PutUint32(b, uint32(v.size)) copy(b[4:], v.data[:]) } -func (stringHeaderTraits) CastFromBytes(b []byte) (res []ViewHeader) { +func (viewHeaderTraits) CastFromBytes(b []byte) (res []ViewHeader) { h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - return unsafe.Slice((*ViewHeader)(unsafe.Pointer(h.Data)), cap(b)/StringHeaderSizeBytes)[:len(b)/StringHeaderSizeBytes] + return unsafe.Slice((*ViewHeader)(unsafe.Pointer(h.Data)), cap(b)/ViewHeaderSizeBytes)[:len(b)/ViewHeaderSizeBytes] } -func (stringHeaderTraits) CastToBytes(b []ViewHeader) (res []byte) { +func (viewHeaderTraits) CastToBytes(b []ViewHeader) (res []byte) { h := (*reflect.SliceHeader)(unsafe.Pointer(&b)) - return unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*StringHeaderSizeBytes)[:len(b)*StringHeaderSizeBytes] + return 
unsafe.Slice((*byte)(unsafe.Pointer(h.Data)), cap(b)*ViewHeaderSizeBytes)[:len(b)*ViewHeaderSizeBytes] } -func (stringHeaderTraits) Copy(dst, src []ViewHeader) { copy(dst, src) } +func (viewHeaderTraits) Copy(dst, src []ViewHeader) { copy(dst, src) } From e2bbe6f04089125ced046c7c72e4bb11a8bfa39f Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 13 Nov 2023 12:01:40 -0500 Subject: [PATCH 34/37] Update go/arrow/datatype_viewheader.go Co-authored-by: Alex Shcherbakov --- go/arrow/datatype_viewheader.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go index 0f21e8da0fe..d40290a7b81 100644 --- a/go/arrow/datatype_viewheader.go +++ b/go/arrow/datatype_viewheader.go @@ -88,7 +88,7 @@ func (sh *ViewHeader) BufferOffset() int32 { } func (sh *ViewHeader) InlineBytes() (data []byte) { - debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline StringHeader") + debug.Assert(sh.IsInline(), "calling InlineBytes on non-inline ViewHeader") return sh.data[:sh.size] } From 9db655775b7961fe559c5ca6f18b2ecd91ee2744 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 13 Nov 2023 12:06:26 -0500 Subject: [PATCH 35/37] update with rebase --- go/arrow/datatype_viewheader.go | 6 +++--- go/arrow/datatype_viewheader_inline.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go index d40290a7b81..7d9d79b205b 100644 --- a/go/arrow/datatype_viewheader.go +++ b/go/arrow/datatype_viewheader.go @@ -20,9 +20,9 @@ import ( "bytes" "unsafe" - "github.com/apache/arrow/go/v14/arrow/endian" - "github.com/apache/arrow/go/v14/arrow/internal/debug" - "github.com/apache/arrow/go/v14/arrow/memory" + "github.com/apache/arrow/go/v15/arrow/endian" + "github.com/apache/arrow/go/v15/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/memory" ) const ( diff --git a/go/arrow/datatype_viewheader_inline.go 
b/go/arrow/datatype_viewheader_inline.go index e618916a88b..89ac1d06adc 100644 --- a/go/arrow/datatype_viewheader_inline.go +++ b/go/arrow/datatype_viewheader_inline.go @@ -21,7 +21,7 @@ package arrow import ( "unsafe" - "github.com/apache/arrow/go/v14/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/internal/debug" ) func (sh *ViewHeader) InlineString() (data string) { From 460c06e66467a6985979355c37f32ec4c107c014 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 13 Nov 2023 12:11:40 -0500 Subject: [PATCH 36/37] fix version imports --- go/arrow/datatype_viewheader_inline_go1.19.go | 2 +- go/arrow/datatype_viewheader_inline_tinygo.go | 2 +- go/arrow/type_traits_view.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/arrow/datatype_viewheader_inline_go1.19.go b/go/arrow/datatype_viewheader_inline_go1.19.go index 26a12fd47c6..aec66009d94 100644 --- a/go/arrow/datatype_viewheader_inline_go1.19.go +++ b/go/arrow/datatype_viewheader_inline_go1.19.go @@ -22,7 +22,7 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v14/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/internal/debug" ) func (sh *ViewHeader) InlineString() (data string) { diff --git a/go/arrow/datatype_viewheader_inline_tinygo.go b/go/arrow/datatype_viewheader_inline_tinygo.go index c830c1106a9..bff63a273a7 100644 --- a/go/arrow/datatype_viewheader_inline_tinygo.go +++ b/go/arrow/datatype_viewheader_inline_tinygo.go @@ -22,7 +22,7 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v14/arrow/internal/debug" + "github.com/apache/arrow/go/v15/arrow/internal/debug" ) func (sh *ViewHeader) InlineString() (data string) { diff --git a/go/arrow/type_traits_view.go b/go/arrow/type_traits_view.go index 972c3e2c35a..c3846db2946 100644 --- a/go/arrow/type_traits_view.go +++ b/go/arrow/type_traits_view.go @@ -20,7 +20,7 @@ import ( "reflect" "unsafe" - "github.com/apache/arrow/go/v14/arrow/endian" + "github.com/apache/arrow/go/v15/arrow/endian" ) 
var ViewHeaderTraits viewHeaderTraits From 8e4184058ec168751684de1117c0b8c3c0d82fe9 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 13 Nov 2023 16:01:34 -0500 Subject: [PATCH 37/37] fix formatting, updates from feedback, add tests --- go/arrow/array/binary.go | 2 +- go/arrow/array/binary_test.go | 24 ++++++++++++++++++++++++ go/arrow/array/bufferbuilder.go | 4 ++-- go/arrow/datatype_viewheader.go | 20 ++++++++++---------- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/go/arrow/array/binary.go b/go/arrow/array/binary.go index 7a72c2be1d3..c226297da04 100644 --- a/go/arrow/array/binary.go +++ b/go/arrow/array/binary.go @@ -24,7 +24,7 @@ import ( "unsafe" "github.com/apache/arrow/go/v15/arrow" - "github.com/apache/arrow/go/v15/arrow/memory" + "github.com/apache/arrow/go/v15/arrow/memory" "github.com/apache/arrow/go/v15/internal/json" ) diff --git a/go/arrow/array/binary_test.go b/go/arrow/array/binary_test.go index 9c1770950a8..c9e16551522 100644 --- a/go/arrow/array/binary_test.go +++ b/go/arrow/array/binary_test.go @@ -700,3 +700,27 @@ func TestBinaryStringRoundTrip(t *testing.T) { assert.True(t, Equal(arr, arr1)) } + +func TestBinaryViewStringRoundTrip(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + values := []string{"a", "bc", "", "", "supercalifragilistic", "", "expeallodocious"} + valid := []bool{true, true, false, false, true, true, true} + + b := NewBinaryViewBuilder(mem) + defer b.Release() + + b.AppendStringValues(values, valid) + arr := b.NewArray().(*BinaryView) + defer arr.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b.NewArray().(*BinaryView) + defer arr1.Release() + + assert.True(t, Equal(arr, arr1)) +} diff --git a/go/arrow/array/bufferbuilder.go b/go/arrow/array/bufferbuilder.go index 9497c5cb7a7..13741ba8926 100644 --- a/go/arrow/array/bufferbuilder.go +++ 
b/go/arrow/array/bufferbuilder.go @@ -19,8 +19,8 @@ package array import ( "sync/atomic" "unsafe" - - "github.com/apache/arrow/go/v15/arrow" + + "github.com/apache/arrow/go/v15/arrow" "github.com/apache/arrow/go/v15/arrow/bitutil" "github.com/apache/arrow/go/v15/arrow/internal/debug" "github.com/apache/arrow/go/v15/arrow/memory" diff --git a/go/arrow/datatype_viewheader.go b/go/arrow/datatype_viewheader.go index 7d9d79b205b..54b9256b346 100644 --- a/go/arrow/datatype_viewheader.go +++ b/go/arrow/datatype_viewheader.go @@ -26,12 +26,12 @@ import ( ) const ( - StringViewPrefixLen = 4 - stringViewInlineSize = 12 + ViewPrefixLen = 4 + viewInlineSize = 12 ) func IsViewInline(length int) bool { - return length < stringViewInlineSize + return length < viewInlineSize } // ViewHeader is a variable length string (utf8) or byte slice with @@ -67,24 +67,24 @@ type ViewHeader struct { // if size > StringHeaderInlineSize, the next 8 bytes are 2 uint32 // values which are the buffer index and offset in that buffer // containing the full string. 
- data [stringViewInlineSize]byte + data [viewInlineSize]byte } func (sh *ViewHeader) IsInline() bool { - return sh.size <= int32(stringViewInlineSize) + return sh.size <= int32(viewInlineSize) } func (sh *ViewHeader) Len() int { return int(sh.size) } -func (sh *ViewHeader) Prefix() [StringViewPrefixLen]byte { +func (sh *ViewHeader) Prefix() [ViewPrefixLen]byte { return *(*[4]byte)(unsafe.Pointer(&sh.data)) } func (sh *ViewHeader) BufferIndex() int32 { - return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen:])) + return int32(endian.Native.Uint32(sh.data[ViewPrefixLen:])) } func (sh *ViewHeader) BufferOffset() int32 { - return int32(endian.Native.Uint32(sh.data[StringViewPrefixLen+4:])) + return int32(endian.Native.Uint32(sh.data[ViewPrefixLen+4:])) } func (sh *ViewHeader) InlineBytes() (data []byte) { @@ -109,8 +109,8 @@ func (sh *ViewHeader) SetString(data string) int { } func (sh *ViewHeader) SetIndexOffset(bufferIndex, offset int32) { - endian.Native.PutUint32(sh.data[StringViewPrefixLen:], uint32(bufferIndex)) - endian.Native.PutUint32(sh.data[StringViewPrefixLen+4:], uint32(offset)) + endian.Native.PutUint32(sh.data[ViewPrefixLen:], uint32(bufferIndex)) + endian.Native.PutUint32(sh.data[ViewPrefixLen+4:], uint32(offset)) } func (sh *ViewHeader) Equals(buffers []*memory.Buffer, other *ViewHeader, otherBuffers []*memory.Buffer) bool {