From 77c260451c4e616eef31a9f1ee77ff9e333ee0bf Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 07:39:56 -0400 Subject: [PATCH 01/14] initial bool8 --- go/arrow/type_traits.go | 8 + go/internal/types/extension_types.go | 193 ++++++++++++++++++ go/internal/types/extension_types_test.go | 232 +++++++++++++++++++++- 3 files changed, 431 insertions(+), 2 deletions(-) diff --git a/go/arrow/type_traits.go b/go/arrow/type_traits.go index aae6ad10648..93bd86cb283 100644 --- a/go/arrow/type_traits.go +++ b/go/arrow/type_traits.go @@ -117,6 +117,14 @@ func GetBytes[T FixedWidthType | ViewHeader](in []T) []byte { return reinterpretSlice[byte](in) } +// GetBool8Bytes reinterprets a slice of bool to a slice of bytes. +// +// NOTE: This must not be used for standard Arrow Boolean arrays as they are bitpacked. +// The resulting byte slice is only valid for the Bool8 extension type. +func GetBool8Bytes(in []bool) []byte { + return reinterpretSlice[byte](in) +} + // GetData reinterprets a slice of bytes to a slice of T. // // NOTE: the buffer's length must be a multiple of Sizeof(T). diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index 3c63b368746..d7bfd6c342d 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -22,6 +22,7 @@ import ( "encoding/binary" "fmt" "reflect" + "strconv" "strings" "github.com/apache/arrow/go/v18/arrow" @@ -31,6 +32,8 @@ import ( "golang.org/x/xerrors" ) +const ExtensionNameBool8 = "bool8" + var UUID = NewUUIDType() type UUIDBuilder struct { @@ -38,6 +41,7 @@ type UUIDBuilder struct { } func NewUUIDBuilder(builder *array.ExtensionBuilder) *UUIDBuilder { + builder.Retain() return &UUIDBuilder{ExtensionBuilder: builder} } @@ -533,15 +537,204 @@ func (SmallintType) Deserialize(storageType arrow.DataType, data string) (arrow. return NewSmallintType(), nil } +type Bool8Type struct { + arrow.ExtensionBase +} + +// ArrayType implements arrow.ExtensionType. +func (b *Bool8Type) ArrayType() reflect.Type { + return reflect.TypeOf(Bool8Array{}) +} + +// Deserialize implements arrow.ExtensionType. +func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != ExtensionNameBool8 { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Uint8) { + return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) + } + return NewBool8Type(), nil +} + +// ExtensionEquals implements arrow.ExtensionType. +func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { + return b.ExtensionName() == other.ExtensionName() +} + +// ExtensionName implements arrow.ExtensionType. +func (b *Bool8Type) ExtensionName() string { + return ExtensionNameBool8 +} + +// Serialize implements arrow.ExtensionType. +func (b *Bool8Type) Serialize() string { + return ExtensionNameBool8 +} + +// String implements arrow.ExtensionType. +// Subtle: this method shadows the method (ExtensionBase).String of Bool8Type.ExtensionBase. +func (b *Bool8Type) String() string { + panic("unimplemented") +} + +func NewBool8Type() *Bool8Type { + return &Bool8Type{ExtensionBase: arrow.ExtensionBase{ + Storage: arrow.PrimitiveTypes.Uint8}} +} + +func (*Bool8Type) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { + return NewBool8Builder(bldr) +} + +type Bool8Array struct { + array.ExtensionArrayBase +} + +func (a *Bool8Array) String() string { + var o strings.Builder + arr := a.Storage().(*array.Uint8) + o.WriteString("[") + for i := 0; i < arr.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(&o, "%v", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Bool8Array) Value(i int) bool { + return uint8ToBool(a.Storage().(*array.Uint8).Value(i)) +} + +func (a *Bool8Array) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return fmt.Sprint(a.Value(i)) + } +} + +func (a *Bool8Array) MarshalJSON() ([]byte, error) { + values := make([]interface{}, a.Len()) + for i := 0; i < a.Len()-a.NullN(); i++ { + if a.IsValid(i) { + values[i] = a.Value(i) + } + } + return json.Marshal(values) +} + +func (a *Bool8Array) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func boolToUint8(v bool) uint8 { + var res uint8 + if v { + res = 1 + } + return res +} + +func uint8ToBool(v uint8) bool { + return v != 0 +} + +type Bool8Builder struct { + *array.ExtensionBuilder +} + +func NewBool8Builder(builder *array.ExtensionBuilder) *Bool8Builder { + builder.Retain() + return &Bool8Builder{ExtensionBuilder: builder} +} + +func (b *Bool8Builder) Append(v bool) { + b.ExtensionBuilder.Builder.(*array.Uint8Builder).Append(boolToUint8(v)) +} + +func (b *Bool8Builder) UnsafeAppend(v bool) { + b.ExtensionBuilder.Builder.(*array.Uint8Builder).UnsafeAppend(boolToUint8(v)) +} + +func (b *Bool8Builder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + val, err := strconv.ParseBool(s) + if err != nil { + return err + } + + b.Append(val) + return nil +} + +func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { + boolsAsBytes := arrow.GetBool8Bytes(v) + boolsAsUint8s := arrow.GetData[uint8](boolsAsBytes) + b.ExtensionBuilder.Builder.(*array.Uint8Builder).AppendValues(boolsAsUint8s, valid) +} + +func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case bool: + b.Append(v) + return nil + case string: + return b.AppendValueFromString(v) + case nil: + b.AppendNull() + return nil + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), + } + } +} + +func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + var ( _ arrow.ExtensionType = (*Parametric1Type)(nil) _ arrow.ExtensionType = (*Parametric2Type)(nil) _ arrow.ExtensionType = (*ExtStructType)(nil) _ arrow.ExtensionType = (*DictExtensionType)(nil) _ arrow.ExtensionType = (*SmallintType)(nil) + _ arrow.ExtensionType = (*Bool8Type)(nil) _ array.ExtensionArray = (*Parametric1Array)(nil) _ array.ExtensionArray = (*Parametric2Array)(nil) _ array.ExtensionArray = (*ExtStructArray)(nil) _ array.ExtensionArray = (*DictExtensionArray)(nil) _ array.ExtensionArray = (*SmallintArray)(nil) + _ array.ExtensionArray = (*Bool8Array)(nil) ) diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go index 50abaae3a9e..682161cd498 100644 --- a/go/internal/types/extension_types_test.go +++ b/go/internal/types/extension_types_test.go @@ -18,6 +18,7 @@ package types_test import ( "bytes" + "fmt" "testing" "github.com/apache/arrow/go/v18/arrow" @@ -32,7 +33,7 @@ import ( var testUUID = uuid.New() -func TestExtensionBuilder(t *testing.T) { +func TestUUIDExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) @@ -52,7 +53,7 @@ func TestExtensionBuilder(t *testing.T) { assert.Equal(t, arr, arr1) } -func TestExtensionRecordBuilder(t *testing.T) { +func TestUUIDExtensionRecordBuilder(t *testing.T) { schema := arrow.NewSchema([]arrow.Field{ {Name: "uuid", Type: types.NewUUIDType()}, }, nil) @@ -99,3 +100,230 @@ func TestUUIDStringRoundTrip(t *testing.T) { assert.True(t, array.Equal(arr, arr1)) } + +func TestBool8ExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) + defer extBuilder.Release() + + builder := types.NewBool8Builder(extBuilder) + defer builder.Release() + + builder.Append(true) + arr := builder.NewArray() + defer arr.Release() + + arrStr := arr.String() + require.Equal(t, `[true]`, arrStr) + + jsonStr, err := json.Marshal(arr) + require.NoError(t, err) + + arr1, _, err := array.FromJSON(mem, types.NewBool8Type(), bytes.NewReader(jsonStr)) + require.NoError(t, err) + defer arr1.Release() + + require.Equal(t, arr, arr1) +} + +func TestBool8ExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "bool8", Type: types.NewBool8Type()}, + }, nil) + + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + builder.Field(0).(*types.Bool8Builder).Append(true) + record := builder.NewRecord() + defer record.Release() + + b, err := record.MarshalJSON() + require.NoError(t, err) + require.Equal(t, "[{\"bool8\":true}\n]", string(b)) + + record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) + require.NoError(t, err) + require.Equal(t, record, record1) + + require.NoError(t, builder.UnmarshalJSON([]byte(`{"bool8":true}`))) + record = builder.NewRecord() + defer record.Release() + + require.Equal(t, schema, record.Schema()) + require.Equal(t, true, record.Column(0).(*types.Bool8Array).Value(0)) +} + +func TestBool8StringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) + defer extBuilder.Release() + b := types.NewBool8Builder(extBuilder) + b.Append(true) + b.AppendNull() + b.Append(false) + b.AppendNull() + b.Append(true) + + arr := b.NewArray() + defer arr.Release() + + // 2. create array via AppendValueFromString + extBuilder1 := array.NewExtensionBuilder(mem, types.NewBool8Type()) + defer extBuilder1.Release() + b1 := types.NewBool8Builder(extBuilder1) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestCompareBool8AndBoolean(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := types.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + boolbldr := array.NewBooleanBuilder(mem) + defer boolbldr.Release() + + inputVals := []bool{true, false, false, false, true} + inputValidity := []bool{true, false, true, false, true} + + bool8bldr.AppendValues(inputVals, inputValidity) + bool8Arr := bool8bldr.NewExtensionArray().(*types.Bool8Array) + defer bool8Arr.Release() + + boolbldr.AppendValues(inputVals, inputValidity) + boolArr := boolbldr.NewBooleanArray() + defer boolArr.Release() + + require.Equal(t, boolArr.Len(), bool8Arr.Len()) + for i := 0; i < boolArr.Len(); i++ { + require.Equal(t, boolArr.Value(i), bool8Arr.Value(i)) + } +} + +const ( + MINSIZE = 1024 + MAXSIZE = 65536 +) + +func BenchmarkWriteBool8Array(b *testing.B) { + extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, types.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := types.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(sz)) + for n := 0; n < b.N; n++ { + bool8bldr.AppendValues(values, nil) + bool8bldr.NewArray() + } + }) + } +} + +func BenchmarkWriteBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + boolbldr.AppendValues(values, nil) + boolbldr.NewArray() + } + }) + } +} + +func BenchmarkReadBool8Array(b *testing.B) { + extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, types.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := types.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + output := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + bool8bldr.AppendValues(values, nil) + bool8Arr := bool8bldr.NewArray().(*types.Bool8Array) + defer bool8Arr.Release() + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + for i := 0; i < bool8Arr.Len(); i++ { + output[i] = bool8Arr.Value(i) + } + } + }) + } +} + +func BenchmarkReadBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + output := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + boolbldr.AppendValues(values, nil) + boolArr := boolbldr.NewArray().(*array.Boolean) + defer boolArr.Release() + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + for i := 0; i < boolArr.Len(); i++ { + output[i] = boolArr.Value(i) + } + } + }) + } +} From 2bf2bb18fb08c362855524d5dcda265207de007b Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 07:57:09 -0400 Subject: [PATCH 02/14] move to extensions dir --- go/arrow/extensions/bool8.go | 207 ++++++++++++++++++ go/arrow/extensions/bool8_test.go | 242 ++++++++++++++++++++++ go/internal/types/extension_types.go | 192 ----------------- go/internal/types/extension_types_test.go | 228 -------------------- 4 files changed, 449 insertions(+), 420 deletions(-) create mode 100644 go/arrow/extensions/bool8.go create mode 100644 go/arrow/extensions/bool8_test.go diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go new file mode 100644 index 00000000000..020b4acc856 --- /dev/null +++ b/go/arrow/extensions/bool8.go @@ -0,0 +1,207 @@ +package extensions + +import ( + "fmt" + "reflect" + "strconv" + "strings" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/internal/json" +) + +const ExtensionNameBool8 = "bool8" + +type Bool8Type struct { + arrow.ExtensionBase +} + +// ArrayType implements arrow.ExtensionType. +func (b *Bool8Type) ArrayType() reflect.Type { + return reflect.TypeOf(Bool8Array{}) +} + +// Deserialize implements arrow.ExtensionType. +func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { + if data != ExtensionNameBool8 { + return nil, fmt.Errorf("type identifier did not match: '%s'", data) + } + if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Uint8) { + return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) + } + return NewBool8Type(), nil +} + +// ExtensionEquals implements arrow.ExtensionType. +func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { + return b.ExtensionName() == other.ExtensionName() +} + +// ExtensionName implements arrow.ExtensionType. +func (b *Bool8Type) ExtensionName() string { + return ExtensionNameBool8 +} + +// Serialize implements arrow.ExtensionType. +func (b *Bool8Type) Serialize() string { + return ExtensionNameBool8 +} + +// String implements arrow.ExtensionType. +// Subtle: this method shadows the method (ExtensionBase).String of Bool8Type.ExtensionBase. +func (b *Bool8Type) String() string { + panic("unimplemented") +} + +func NewBool8Type() *Bool8Type { + return &Bool8Type{ExtensionBase: arrow.ExtensionBase{ + Storage: arrow.PrimitiveTypes.Uint8}} +} + +func (*Bool8Type) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { + return NewBool8Builder(bldr) +} + +type Bool8Array struct { + array.ExtensionArrayBase +} + +func (a *Bool8Array) String() string { + var o strings.Builder + arr := a.Storage().(*array.Uint8) + o.WriteString("[") + for i := 0; i < arr.Len(); i++ { + if i > 0 { + o.WriteString(" ") + } + switch { + case a.IsNull(i): + o.WriteString(array.NullValueStr) + default: + fmt.Fprintf(&o, "%v", a.Value(i)) + } + } + o.WriteString("]") + return o.String() +} + +func (a *Bool8Array) Value(i int) bool { + return uint8ToBool(a.Storage().(*array.Uint8).Value(i)) +} + +func (a *Bool8Array) ValueStr(i int) string { + switch { + case a.IsNull(i): + return array.NullValueStr + default: + return fmt.Sprint(a.Value(i)) + } +} + +func (a *Bool8Array) MarshalJSON() ([]byte, error) { + values := make([]interface{}, a.Len()) + for i := 0; i < a.Len()-a.NullN(); i++ { + if a.IsValid(i) { + values[i] = a.Value(i) + } + } + return json.Marshal(values) +} + +func (a *Bool8Array) GetOneForMarshal(i int) interface{} { + if a.IsNull(i) { + return nil + } + return a.Value(i) +} + +func boolToUint8(v bool) uint8 { + var res uint8 + if v { + res = 1 + } + return res +} + +func uint8ToBool(v uint8) bool { + return v != 0 +} + +type Bool8Builder struct { + *array.ExtensionBuilder +} + +func NewBool8Builder(builder *array.ExtensionBuilder) *Bool8Builder { + builder.Retain() + return &Bool8Builder{ExtensionBuilder: builder} +} + +func (b *Bool8Builder) Append(v bool) { + b.ExtensionBuilder.Builder.(*array.Uint8Builder).Append(boolToUint8(v)) +} + +func (b *Bool8Builder) UnsafeAppend(v bool) { + b.ExtensionBuilder.Builder.(*array.Uint8Builder).UnsafeAppend(boolToUint8(v)) +} + +func (b *Bool8Builder) AppendValueFromString(s string) error { + if s == array.NullValueStr { + b.AppendNull() + return nil + } + + val, err := strconv.ParseBool(s) + if err != nil { + return err + } + + b.Append(val) + return nil +} + +func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { + boolsAsBytes := arrow.GetBool8Bytes(v) + boolsAsUint8s := arrow.GetData[uint8](boolsAsBytes) + b.ExtensionBuilder.Builder.(*array.Uint8Builder).AppendValues(boolsAsUint8s, valid) +} + +func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { + t, err := dec.Token() + if err != nil { + return err + } + + switch v := t.(type) { + case bool: + b.Append(v) + return nil + case string: + return b.AppendValueFromString(v) + case nil: + b.AppendNull() + return nil + default: + return &json.UnmarshalTypeError{ + Value: fmt.Sprint(t), + Type: reflect.TypeOf([]byte{}), + Offset: dec.InputOffset(), + Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), + } + } +} + +func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { + for dec.More() { + if err := b.UnmarshalOne(dec); err != nil { + return err + } + } + return nil +} + +var ( + _ arrow.ExtensionType = (*Bool8Type)(nil) + _ array.ExtensionArray = (*Bool8Array)(nil) + _ array.Builder = (*Bool8Builder)(nil) +) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go new file mode 100644 index 00000000000..5c388429b16 --- /dev/null +++ b/go/arrow/extensions/bool8_test.go @@ -0,0 +1,242 @@ +package extensions_test + +import ( + "bytes" + "fmt" + "testing" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/extensions" + "github.com/apache/arrow/go/v17/arrow/memory" + "github.com/apache/arrow/go/v17/internal/json" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBool8ExtensionBuilder(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) + defer extBuilder.Release() + + builder := extensions.NewBool8Builder(extBuilder) + defer builder.Release() + + builder.Append(true) + arr := builder.NewArray() + defer arr.Release() + + arrStr := arr.String() + require.Equal(t, `[true]`, arrStr) + + jsonStr, err := json.Marshal(arr) + require.NoError(t, err) + + arr1, _, err := array.FromJSON(mem, extensions.NewBool8Type(), bytes.NewReader(jsonStr)) + require.NoError(t, err) + defer arr1.Release() + + require.Equal(t, arr, arr1) +} + +func TestBool8ExtensionRecordBuilder(t *testing.T) { + schema := arrow.NewSchema([]arrow.Field{ + {Name: "bool8", Type: extensions.NewBool8Type()}, + }, nil) + + builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + builder.Field(0).(*extensions.Bool8Builder).Append(true) + record := builder.NewRecord() + defer record.Release() + + b, err := record.MarshalJSON() + require.NoError(t, err) + require.Equal(t, "[{\"bool8\":true}\n]", string(b)) + + record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) + require.NoError(t, err) + require.Equal(t, record, record1) + + require.NoError(t, builder.UnmarshalJSON([]byte(`{"bool8":true}`))) + record = builder.NewRecord() + defer record.Release() + + require.Equal(t, schema, record.Schema()) + require.Equal(t, true, record.Column(0).(*extensions.Bool8Array).Value(0)) +} + +func TestBool8StringRoundTrip(t *testing.T) { + // 1. create array + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) + defer extBuilder.Release() + b := extensions.NewBool8Builder(extBuilder) + b.Append(true) + b.AppendNull() + b.Append(false) + b.AppendNull() + b.Append(true) + + arr := b.NewArray() + defer arr.Release() + + // 2. create array via AppendValueFromString + extBuilder1 := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) + defer extBuilder1.Release() + b1 := extensions.NewBool8Builder(extBuilder1) + defer b1.Release() + + for i := 0; i < arr.Len(); i++ { + assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) + } + + arr1 := b1.NewArray() + defer arr1.Release() + + assert.True(t, array.Equal(arr, arr1)) +} + +func TestCompareBool8AndBoolean(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := extensions.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + boolbldr := array.NewBooleanBuilder(mem) + defer boolbldr.Release() + + inputVals := []bool{true, false, false, false, true} + inputValidity := []bool{true, false, true, false, true} + + bool8bldr.AppendValues(inputVals, inputValidity) + bool8Arr := bool8bldr.NewExtensionArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + boolbldr.AppendValues(inputVals, inputValidity) + boolArr := boolbldr.NewBooleanArray() + defer boolArr.Release() + + require.Equal(t, boolArr.Len(), bool8Arr.Len()) + for i := 0; i < boolArr.Len(); i++ { + require.Equal(t, boolArr.Value(i), bool8Arr.Value(i)) + } +} + +const ( + MINSIZE = 1024 + MAXSIZE = 65536 +) + +func BenchmarkWriteBool8Array(b *testing.B) { + extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, extensions.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := extensions.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(sz)) + for n := 0; n < b.N; n++ { + bool8bldr.AppendValues(values, nil) + bool8bldr.NewArray() + } + }) + } +} + +func BenchmarkWriteBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + boolbldr.AppendValues(values, nil) + boolbldr.NewArray() + } + }) + } +} + +func BenchmarkReadBool8Array(b *testing.B) { + extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, extensions.NewBool8Type()) + defer extBuilder.Release() + + bool8bldr := extensions.NewBool8Builder(extBuilder) + defer bool8bldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + output := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + bool8bldr.AppendValues(values, nil) + bool8Arr := bool8bldr.NewArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + for i := 0; i < bool8Arr.Len(); i++ { + output[i] = bool8Arr.Value(i) + } + } + }) + } +} + +func BenchmarkReadBooleanArray(b *testing.B) { + boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) + defer boolbldr.Release() + + for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { + b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { + + values := make([]bool, sz) + output := make([]bool, sz) + for idx := range values { + values[idx] = true + } + + boolbldr.AppendValues(values, nil) + boolArr := boolbldr.NewArray().(*array.Boolean) + defer boolArr.Release() + + b.ResetTimer() + b.SetBytes(int64(len(values))) + for n := 0; n < b.N; n++ { + for i := 0; i < boolArr.Len(); i++ { + output[i] = boolArr.Value(i) + } + } + }) + } +} diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index d7bfd6c342d..3465f9d58cc 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -22,7 +22,6 @@ import ( "encoding/binary" "fmt" "reflect" - "strconv" "strings" "github.com/apache/arrow/go/v18/arrow" @@ -32,8 +31,6 @@ import ( "golang.org/x/xerrors" ) -const ExtensionNameBool8 = "bool8" - var UUID = NewUUIDType() type UUIDBuilder struct { @@ -537,204 +534,15 @@ func (SmallintType) Deserialize(storageType arrow.DataType, data string) (arrow. return NewSmallintType(), nil } -type Bool8Type struct { - arrow.ExtensionBase -} - -// ArrayType implements arrow.ExtensionType. -func (b *Bool8Type) ArrayType() reflect.Type { - return reflect.TypeOf(Bool8Array{}) -} - -// Deserialize implements arrow.ExtensionType. -func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != ExtensionNameBool8 { - return nil, fmt.Errorf("type identifier did not match: '%s'", data) - } - if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Uint8) { - return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) - } - return NewBool8Type(), nil -} - -// ExtensionEquals implements arrow.ExtensionType. -func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { - return b.ExtensionName() == other.ExtensionName() -} - -// ExtensionName implements arrow.ExtensionType. -func (b *Bool8Type) ExtensionName() string { - return ExtensionNameBool8 -} - -// Serialize implements arrow.ExtensionType. -func (b *Bool8Type) Serialize() string { - return ExtensionNameBool8 -} - -// String implements arrow.ExtensionType. -// Subtle: this method shadows the method (ExtensionBase).String of Bool8Type.ExtensionBase. -func (b *Bool8Type) String() string { - panic("unimplemented") -} - -func NewBool8Type() *Bool8Type { - return &Bool8Type{ExtensionBase: arrow.ExtensionBase{ - Storage: arrow.PrimitiveTypes.Uint8}} -} - -func (*Bool8Type) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { - return NewBool8Builder(bldr) -} - -type Bool8Array struct { - array.ExtensionArrayBase -} - -func (a *Bool8Array) String() string { - var o strings.Builder - arr := a.Storage().(*array.Uint8) - o.WriteString("[") - for i := 0; i < arr.Len(); i++ { - if i > 0 { - o.WriteString(" ") - } - switch { - case a.IsNull(i): - o.WriteString(array.NullValueStr) - default: - fmt.Fprintf(&o, "%v", a.Value(i)) - } - } - o.WriteString("]") - return o.String() -} - -func (a *Bool8Array) Value(i int) bool { - return uint8ToBool(a.Storage().(*array.Uint8).Value(i)) -} - -func (a *Bool8Array) ValueStr(i int) string { - switch { - case a.IsNull(i): - return array.NullValueStr - default: - return fmt.Sprint(a.Value(i)) - } -} - -func (a *Bool8Array) MarshalJSON() ([]byte, error) { - values := make([]interface{}, a.Len()) - for i := 0; i < a.Len()-a.NullN(); i++ { - if a.IsValid(i) { - values[i] = a.Value(i) - } - } - return json.Marshal(values) -} - -func (a *Bool8Array) GetOneForMarshal(i int) interface{} { - if a.IsNull(i) { - return nil - } - return a.Value(i) -} - -func boolToUint8(v bool) uint8 { - var res uint8 - if v { - res = 1 - } - return res -} - -func uint8ToBool(v uint8) bool { - return v != 0 -} - -type Bool8Builder struct { - *array.ExtensionBuilder -} - -func NewBool8Builder(builder *array.ExtensionBuilder) *Bool8Builder { - builder.Retain() - return &Bool8Builder{ExtensionBuilder: builder} -} - -func (b *Bool8Builder) Append(v bool) { - b.ExtensionBuilder.Builder.(*array.Uint8Builder).Append(boolToUint8(v)) -} - -func (b *Bool8Builder) UnsafeAppend(v bool) { - b.ExtensionBuilder.Builder.(*array.Uint8Builder).UnsafeAppend(boolToUint8(v)) -} - -func (b *Bool8Builder) AppendValueFromString(s string) error { - if s == array.NullValueStr { - b.AppendNull() - return nil - } - - val, err := strconv.ParseBool(s) - if err != nil { - return err - } - - b.Append(val) - return nil -} - -func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { - boolsAsBytes := arrow.GetBool8Bytes(v) - boolsAsUint8s := arrow.GetData[uint8](boolsAsBytes) - b.ExtensionBuilder.Builder.(*array.Uint8Builder).AppendValues(boolsAsUint8s, valid) -} - -func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { - t, err := dec.Token() - if err != nil { - return err - } - - switch v := t.(type) { - case bool: - b.Append(v) - return nil - case string: - return b.AppendValueFromString(v) - case nil: - b.AppendNull() - return nil - default: - return &json.UnmarshalTypeError{ - Value: fmt.Sprint(t), - Type: reflect.TypeOf([]byte{}), - Offset: dec.InputOffset(), - Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), - } - } -} - -func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { - for dec.More() { - if err := b.UnmarshalOne(dec); err != nil { - return err - } - } - return nil -} - var ( _ arrow.ExtensionType = (*Parametric1Type)(nil) _ arrow.ExtensionType = (*Parametric2Type)(nil) _ arrow.ExtensionType = (*ExtStructType)(nil) _ arrow.ExtensionType = (*DictExtensionType)(nil) _ arrow.ExtensionType = (*SmallintType)(nil) - _ arrow.ExtensionType = (*Bool8Type)(nil) _ array.ExtensionArray = (*Parametric1Array)(nil) _ array.ExtensionArray = (*Parametric2Array)(nil) _ array.ExtensionArray = (*ExtStructArray)(nil) _ array.ExtensionArray = (*DictExtensionArray)(nil) _ array.ExtensionArray = (*SmallintArray)(nil) - _ array.ExtensionArray = (*Bool8Array)(nil) ) diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go index 682161cd498..f37fc0158a8 100644 --- a/go/internal/types/extension_types_test.go +++ b/go/internal/types/extension_types_test.go @@ -18,7 +18,6 @@ package types_test import ( "bytes" - "fmt" "testing" "github.com/apache/arrow/go/v18/arrow" @@ -100,230 +99,3 @@ func TestUUIDStringRoundTrip(t *testing.T) { assert.True(t, array.Equal(arr, arr1)) } - -func TestBool8ExtensionBuilder(t *testing.T) { - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - - extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) - defer extBuilder.Release() - - builder := types.NewBool8Builder(extBuilder) - defer builder.Release() - - builder.Append(true) - arr := builder.NewArray() - defer arr.Release() - - arrStr := arr.String() - require.Equal(t, `[true]`, arrStr) - - jsonStr, err := json.Marshal(arr) - require.NoError(t, err) - - arr1, _, err := array.FromJSON(mem, types.NewBool8Type(), bytes.NewReader(jsonStr)) - require.NoError(t, err) - defer arr1.Release() - - require.Equal(t, arr, arr1) -} - -func TestBool8ExtensionRecordBuilder(t *testing.T) { - schema := arrow.NewSchema([]arrow.Field{ - {Name: "bool8", Type: types.NewBool8Type()}, - }, nil) - - builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) - builder.Field(0).(*types.Bool8Builder).Append(true) - record := builder.NewRecord() - defer record.Release() - - b, err := record.MarshalJSON() - require.NoError(t, err) - require.Equal(t, "[{\"bool8\":true}\n]", string(b)) - - record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) - require.NoError(t, err) - require.Equal(t, record, record1) - - require.NoError(t, builder.UnmarshalJSON([]byte(`{"bool8":true}`))) - record = builder.NewRecord() - defer record.Release() - - require.Equal(t, schema, record.Schema()) - require.Equal(t, true, record.Column(0).(*types.Bool8Array).Value(0)) -} - -func TestBool8StringRoundTrip(t *testing.T) { - // 1. create array - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - - extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) - defer extBuilder.Release() - b := types.NewBool8Builder(extBuilder) - b.Append(true) - b.AppendNull() - b.Append(false) - b.AppendNull() - b.Append(true) - - arr := b.NewArray() - defer arr.Release() - - // 2. create array via AppendValueFromString - extBuilder1 := array.NewExtensionBuilder(mem, types.NewBool8Type()) - defer extBuilder1.Release() - b1 := types.NewBool8Builder(extBuilder1) - defer b1.Release() - - for i := 0; i < arr.Len(); i++ { - assert.NoError(t, b1.AppendValueFromString(arr.ValueStr(i))) - } - - arr1 := b1.NewArray() - defer arr1.Release() - - assert.True(t, array.Equal(arr, arr1)) -} - -func TestCompareBool8AndBoolean(t *testing.T) { - mem := memory.NewCheckedAllocator(memory.DefaultAllocator) - defer mem.AssertSize(t, 0) - - extBuilder := array.NewExtensionBuilder(mem, types.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := types.NewBool8Builder(extBuilder) - defer bool8bldr.Release() - - boolbldr := array.NewBooleanBuilder(mem) - defer boolbldr.Release() - - inputVals := []bool{true, false, false, false, true} - inputValidity := []bool{true, false, true, false, true} - - bool8bldr.AppendValues(inputVals, inputValidity) - bool8Arr := bool8bldr.NewExtensionArray().(*types.Bool8Array) - defer bool8Arr.Release() - - boolbldr.AppendValues(inputVals, inputValidity) - boolArr := boolbldr.NewBooleanArray() - defer boolArr.Release() - - require.Equal(t, boolArr.Len(), bool8Arr.Len()) - for i := 0; i < boolArr.Len(); i++ { - require.Equal(t, boolArr.Value(i), bool8Arr.Value(i)) - } -} - -const ( - MINSIZE = 1024 - MAXSIZE = 65536 -) - -func BenchmarkWriteBool8Array(b *testing.B) { - extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, types.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := types.NewBool8Builder(extBuilder) - defer bool8bldr.Release() - - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - - values := make([]bool, sz) - for idx := range values { - values[idx] = true - } - - b.ResetTimer() - b.SetBytes(int64(sz)) - for n := 0; n < b.N; n++ { - bool8bldr.AppendValues(values, nil) - bool8bldr.NewArray() - } - }) - } -} - -func BenchmarkWriteBooleanArray(b *testing.B) { - boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) - defer boolbldr.Release() - - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - - values := make([]bool, sz) - for idx := range values { - values[idx] = true - } - - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - boolbldr.AppendValues(values, nil) - boolbldr.NewArray() - } - }) - } -} - -func BenchmarkReadBool8Array(b *testing.B) { - extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, types.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := types.NewBool8Builder(extBuilder) - defer bool8bldr.Release() - - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - - values := make([]bool, sz) - output := make([]bool, sz) - for idx := range values { - values[idx] = true - } - - bool8bldr.AppendValues(values, nil) - bool8Arr := bool8bldr.NewArray().(*types.Bool8Array) - defer bool8Arr.Release() - - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - for i := 0; i < bool8Arr.Len(); i++ { - output[i] = bool8Arr.Value(i) - } - } - }) - } -} - -func BenchmarkReadBooleanArray(b *testing.B) { - boolbldr := array.NewBooleanBuilder(memory.DefaultAllocator) - defer boolbldr.Release() - - for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { - b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { - - values := make([]bool, sz) - output := make([]bool, sz) - for idx := range values { - values[idx] = true - } - - boolbldr.AppendValues(values, nil) - boolArr := boolbldr.NewArray().(*array.Boolean) - defer boolArr.Release() - - b.ResetTimer() - b.SetBytes(int64(len(values))) - for n := 0; n < b.N; n++ { - for i := 0; i < boolArr.Len(); i++ { - output[i] = boolArr.Value(i) - } - } - }) - } -} From 2507f90d270b4a47795661b986516c5c1f2c6d90 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 09:36:25 -0400 Subject: [PATCH 03/14] general cleanup --- go/arrow/extensions/bool8.go | 47 ++++++++++++------------------- go/arrow/extensions/bool8_test.go | 4 ++- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 020b4acc856..09c1aa61e89 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -13,16 +13,17 @@ import ( const ExtensionNameBool8 = "bool8" +// Bool8Type represents a logical boolean that is stored using 8 bits. type Bool8Type struct { arrow.ExtensionBase } -// ArrayType implements arrow.ExtensionType. -func (b *Bool8Type) ArrayType() reflect.Type { - return reflect.TypeOf(Bool8Array{}) +func NewBool8Type() *Bool8Type { + return &Bool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Uint8}} } -// Deserialize implements arrow.ExtensionType. +func (b *Bool8Type) ArrayType() reflect.Type { return reflect.TypeOf(Bool8Array{}) } + func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { if data != ExtensionNameBool8 { return nil, fmt.Errorf("type identifier did not match: '%s'", data) @@ -33,45 +34,30 @@ func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow. return NewBool8Type(), nil } -// ExtensionEquals implements arrow.ExtensionType. func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { return b.ExtensionName() == other.ExtensionName() } -// ExtensionName implements arrow.ExtensionType. -func (b *Bool8Type) ExtensionName() string { - return ExtensionNameBool8 -} - -// Serialize implements arrow.ExtensionType. -func (b *Bool8Type) Serialize() string { - return ExtensionNameBool8 -} +func (b *Bool8Type) ExtensionName() string { return ExtensionNameBool8 } -// String implements arrow.ExtensionType. -// Subtle: this method shadows the method (ExtensionBase).String of Bool8Type.ExtensionBase. -func (b *Bool8Type) String() string { - panic("unimplemented") -} +func (b *Bool8Type) Serialize() string { return ExtensionNameBool8 } -func NewBool8Type() *Bool8Type { - return &Bool8Type{ExtensionBase: arrow.ExtensionBase{ - Storage: arrow.PrimitiveTypes.Uint8}} -} +func (b *Bool8Type) String() string { return fmt.Sprintf("Bool8", b.Storage) } func (*Bool8Type) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { return NewBool8Builder(bldr) } +// Bool8Array is logically an array of boolean values but uses +// 8 bits to store values instead of 1 bit as in the native BooleanArray. type Bool8Array struct { array.ExtensionArrayBase } func (a *Bool8Array) String() string { var o strings.Builder - arr := a.Storage().(*array.Uint8) o.WriteString("[") - for i := 0; i < arr.Len(); i++ { + for i := 0; i < a.Len(); i++ { if i > 0 { o.WriteString(" ") } @@ -101,7 +87,7 @@ func (a *Bool8Array) ValueStr(i int) string { func (a *Bool8Array) MarshalJSON() ([]byte, error) { values := make([]interface{}, a.Len()) - for i := 0; i < a.Len()-a.NullN(); i++ { + for i := 0; i < a.Len(); i++ { if a.IsValid(i) { values[i] = a.Value(i) } @@ -128,6 +114,8 @@ func uint8ToBool(v uint8) bool { return v != 0 } +// Bool8Builder is a convenience builder for the Bool8 extension type, +// allowing arrays to be built with boolean values rather than the underlying storage type. type Bool8Builder struct { *array.ExtensionBuilder } @@ -201,7 +189,8 @@ func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { } var ( - _ arrow.ExtensionType = (*Bool8Type)(nil) - _ array.ExtensionArray = (*Bool8Array)(nil) - _ array.Builder = (*Bool8Builder)(nil) + _ arrow.ExtensionType = (*Bool8Type)(nil) + _ array.ExtensionBuilderWrapper = (*Bool8Type)(nil) + _ array.ExtensionArray = (*Bool8Array)(nil) + _ array.Builder = (*Bool8Builder)(nil) ) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index 5c388429b16..77253c00712 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -25,11 +25,13 @@ func TestBool8ExtensionBuilder(t *testing.T) { defer builder.Release() builder.Append(true) + builder.AppendNull() + builder.Append(false) arr := builder.NewArray() defer arr.Release() arrStr := arr.String() - require.Equal(t, `[true]`, arrStr) + require.Equal(t, "[true (null) false]", arrStr) jsonStr, err := json.Marshal(arr) require.NoError(t, err) From c738bfa2d46f09d82415fd271beac4a46ae19dbb Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 09:43:00 -0400 Subject: [PATCH 04/14] change storage type to int8 --- go/arrow/extensions/bool8.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 09c1aa61e89..25e3c6a92fb 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -19,7 +19,7 @@ type Bool8Type struct { } func NewBool8Type() *Bool8Type { - return &Bool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Uint8}} + return &Bool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Int8}} } func (b *Bool8Type) ArrayType() reflect.Type { return reflect.TypeOf(Bool8Array{}) } @@ -28,7 +28,7 @@ func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow. if data != ExtensionNameBool8 { return nil, fmt.Errorf("type identifier did not match: '%s'", data) } - if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Uint8) { + if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Int8) { return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) } return NewBool8Type(), nil @@ -73,7 +73,7 @@ func (a *Bool8Array) String() string { } func (a *Bool8Array) Value(i int) bool { - return uint8ToBool(a.Storage().(*array.Uint8).Value(i)) + return int8ToBool(a.Storage().(*array.Int8).Value(i)) } func (a *Bool8Array) ValueStr(i int) string { @@ -102,15 +102,15 @@ func (a *Bool8Array) GetOneForMarshal(i int) interface{} { return a.Value(i) } -func boolToUint8(v bool) uint8 { - var res uint8 +func boolToInt8(v bool) int8 { + var res int8 if v { res = 1 } return res } -func uint8ToBool(v uint8) bool { +func int8ToBool(v int8) bool { return v != 0 } @@ -126,11 +126,11 @@ func NewBool8Builder(builder *array.ExtensionBuilder) *Bool8Builder { } func (b *Bool8Builder) Append(v bool) { - b.ExtensionBuilder.Builder.(*array.Uint8Builder).Append(boolToUint8(v)) + b.ExtensionBuilder.Builder.(*array.Int8Builder).Append(boolToInt8(v)) } func (b *Bool8Builder) UnsafeAppend(v bool) { - b.ExtensionBuilder.Builder.(*array.Uint8Builder).UnsafeAppend(boolToUint8(v)) + b.ExtensionBuilder.Builder.(*array.Int8Builder).UnsafeAppend(boolToInt8(v)) } func (b *Bool8Builder) AppendValueFromString(s string) error { @@ -150,8 +150,8 @@ func (b *Bool8Builder) AppendValueFromString(s string) error { func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { boolsAsBytes := arrow.GetBool8Bytes(v) - boolsAsUint8s := arrow.GetData[uint8](boolsAsBytes) - b.ExtensionBuilder.Builder.(*array.Uint8Builder).AppendValues(boolsAsUint8s, valid) + boolsAsInt8s := arrow.GetData[int8](boolsAsBytes) + b.ExtensionBuilder.Builder.(*array.Int8Builder).AppendValues(boolsAsInt8s, valid) } func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { From 8ee08a574f01a360b9ab6260cd49f2635bb5f019 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 10:33:51 -0400 Subject: [PATCH 05/14] refactor custom builders to create extension builder internally --- go/arrow/array/builder.go | 8 ++--- go/arrow/array/extension_builder.go | 10 +++--- go/arrow/extensions/bool8.go | 18 +++++----- go/arrow/extensions/bool8_test.go | 42 ++++++++--------------- go/internal/types/extension_types.go | 10 +++--- go/internal/types/extension_types_test.go | 12 ++----- go/parquet/pqarrow/encode_arrow_test.go | 4 +-- 7 files changed, 42 insertions(+), 62 deletions(-) diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index 6c8ea877a2f..d17eae1f18d 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -349,12 +349,10 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { typ := dtype.(*arrow.LargeListViewType) return NewLargeListViewBuilderWithField(mem, typ.ElemField()) case arrow.EXTENSION: - typ := dtype.(arrow.ExtensionType) - bldr := NewExtensionBuilder(mem, typ) - if custom, ok := typ.(ExtensionBuilderWrapper); ok { - return custom.NewBuilder(bldr) + if custom, ok := dtype.(CustomExtensionBuilder); ok { + return custom.NewBuilder(mem) } - return bldr + return NewExtensionBuilder(mem, dtype.(arrow.ExtensionType)) case arrow.FIXED_SIZE_LIST: typ := dtype.(*arrow.FixedSizeListType) return NewFixedSizeListBuilderWithField(mem, typ.Len(), typ.ElemField()) diff --git a/go/arrow/array/extension_builder.go b/go/arrow/array/extension_builder.go index a71287faf0e..9c4c8a85e61 100644 --- a/go/arrow/array/extension_builder.go +++ b/go/arrow/array/extension_builder.go @@ -16,8 +16,10 @@ package array -// ExtensionBuilderWrapper is an interface that you need to implement in your custom extension type if you want to provide a customer builder as well. -// See example in ./arrow/internal/testing/types/extension_types.go -type ExtensionBuilderWrapper interface { - NewBuilder(bldr *ExtensionBuilder) Builder +import "github.com/apache/arrow/go/v17/arrow/memory" + +// CustomExtensionBuilder is an interface that custom extension types may implement to provide a custom builder +// instead of the underlying storage type's builder when array.NewBuilder is called with that type. +type CustomExtensionBuilder interface { + NewBuilder(memory.Allocator) Builder } diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 25e3c6a92fb..f06ec9d3103 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -8,6 +8,7 @@ import ( "github.com/apache/arrow/go/v17/arrow" "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/memory" "github.com/apache/arrow/go/v17/internal/json" ) @@ -44,8 +45,8 @@ func (b *Bool8Type) Serialize() string { return ExtensionNameBool8 } func (b *Bool8Type) String() string { return fmt.Sprintf("Bool8", b.Storage) } -func (*Bool8Type) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { - return NewBool8Builder(bldr) +func (*Bool8Type) NewBuilder(mem memory.Allocator) array.Builder { + return NewBool8Builder(mem) } // Bool8Array is logically an array of boolean values but uses @@ -120,9 +121,8 @@ type Bool8Builder struct { *array.ExtensionBuilder } -func NewBool8Builder(builder *array.ExtensionBuilder) *Bool8Builder { - builder.Retain() - return &Bool8Builder{ExtensionBuilder: builder} +func NewBool8Builder(mem memory.Allocator) *Bool8Builder { + return &Bool8Builder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewBool8Type())} } func (b *Bool8Builder) Append(v bool) { @@ -189,8 +189,8 @@ func (b *Bool8Builder) Unmarshal(dec *json.Decoder) error { } var ( - _ arrow.ExtensionType = (*Bool8Type)(nil) - _ array.ExtensionBuilderWrapper = (*Bool8Type)(nil) - _ array.ExtensionArray = (*Bool8Array)(nil) - _ array.Builder = (*Bool8Builder)(nil) + _ arrow.ExtensionType = (*Bool8Type)(nil) + _ array.CustomExtensionBuilder = (*Bool8Type)(nil) + _ array.ExtensionArray = (*Bool8Array)(nil) + _ array.Builder = (*Bool8Builder)(nil) ) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index 77253c00712..20c2e7e4978 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -14,14 +14,16 @@ import ( "github.com/stretchr/testify/require" ) +const ( + MINSIZE = 1024 + MAXSIZE = 65536 +) + func TestBool8ExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) - defer extBuilder.Release() - - builder := extensions.NewBool8Builder(extBuilder) + builder := extensions.NewBool8Builder(mem) defer builder.Release() builder.Append(true) @@ -49,6 +51,8 @@ func TestBool8ExtensionRecordBuilder(t *testing.T) { }, nil) builder := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer builder.Release() + builder.Field(0).(*extensions.Bool8Builder).Append(true) record := builder.NewRecord() defer record.Release() @@ -59,6 +63,8 @@ func TestBool8ExtensionRecordBuilder(t *testing.T) { record1, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, bytes.NewReader(b)) require.NoError(t, err) + defer record1.Release() + require.Equal(t, record, record1) require.NoError(t, builder.UnmarshalJSON([]byte(`{"bool8":true}`))) @@ -74,9 +80,7 @@ func TestBool8StringRoundTrip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) - defer extBuilder.Release() - b := extensions.NewBool8Builder(extBuilder) + b := extensions.NewBool8Builder(mem) b.Append(true) b.AppendNull() b.Append(false) @@ -87,9 +91,7 @@ func TestBool8StringRoundTrip(t *testing.T) { defer arr.Release() // 2. create array via AppendValueFromString - extBuilder1 := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) - defer extBuilder1.Release() - b1 := extensions.NewBool8Builder(extBuilder1) + b1 := extensions.NewBool8Builder(mem) defer b1.Release() for i := 0; i < arr.Len(); i++ { @@ -106,10 +108,7 @@ func TestCompareBool8AndBoolean(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, extensions.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := extensions.NewBool8Builder(extBuilder) + bool8bldr := extensions.NewBool8Builder(mem) defer bool8bldr.Release() boolbldr := array.NewBooleanBuilder(mem) @@ -132,16 +131,8 @@ func TestCompareBool8AndBoolean(t *testing.T) { } } -const ( - MINSIZE = 1024 - MAXSIZE = 65536 -) - func BenchmarkWriteBool8Array(b *testing.B) { - extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, extensions.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := extensions.NewBool8Builder(extBuilder) + bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) defer bool8bldr.Release() for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { @@ -185,10 +176,7 @@ func BenchmarkWriteBooleanArray(b *testing.B) { } func BenchmarkReadBool8Array(b *testing.B) { - extBuilder := array.NewExtensionBuilder(memory.DefaultAllocator, extensions.NewBool8Type()) - defer extBuilder.Release() - - bool8bldr := extensions.NewBool8Builder(extBuilder) + bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) defer bool8bldr.Release() for sz := MINSIZE; sz < MAXSIZE+1; sz *= 2 { diff --git a/go/internal/types/extension_types.go b/go/internal/types/extension_types.go index 3465f9d58cc..85c64d86bff 100644 --- a/go/internal/types/extension_types.go +++ b/go/internal/types/extension_types.go @@ -26,6 +26,7 @@ import ( "github.com/apache/arrow/go/v18/arrow" "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" "github.com/apache/arrow/go/v18/internal/json" "github.com/google/uuid" "golang.org/x/xerrors" @@ -37,9 +38,8 @@ type UUIDBuilder struct { *array.ExtensionBuilder } -func NewUUIDBuilder(builder *array.ExtensionBuilder) *UUIDBuilder { - builder.Retain() - return &UUIDBuilder{ExtensionBuilder: builder} +func NewUUIDBuilder(mem memory.Allocator) *UUIDBuilder { + return &UUIDBuilder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewUUIDType())} } func (b *UUIDBuilder) Append(v uuid.UUID) { @@ -246,8 +246,8 @@ func (e *UUIDType) ExtensionEquals(other arrow.ExtensionType) bool { return e.ExtensionName() == other.ExtensionName() } -func (*UUIDType) NewBuilder(bldr *array.ExtensionBuilder) array.Builder { - return NewUUIDBuilder(bldr) +func (*UUIDType) NewBuilder(mem memory.Allocator) array.Builder { + return NewUUIDBuilder(mem) } // Parametric1Array is a simple int32 array for use with the Parametric1Type diff --git a/go/internal/types/extension_types_test.go b/go/internal/types/extension_types_test.go index f37fc0158a8..65f6353d01b 100644 --- a/go/internal/types/extension_types_test.go +++ b/go/internal/types/extension_types_test.go @@ -35,9 +35,7 @@ var testUUID = uuid.New() func TestUUIDExtensionBuilder(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - builder := types.NewUUIDBuilder(extBuilder) + builder := types.NewUUIDBuilder(mem) builder.Append(testUUID) arr := builder.NewArray() defer arr.Release() @@ -72,9 +70,7 @@ func TestUUIDStringRoundTrip(t *testing.T) { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(t, 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - b := types.NewUUIDBuilder(extBuilder) + b := types.NewUUIDBuilder(mem) b.Append(uuid.Nil) b.AppendNull() b.Append(uuid.NameSpaceURL) @@ -85,9 +81,7 @@ func TestUUIDStringRoundTrip(t *testing.T) { defer arr.Release() // 2. create array via AppendValueFromString - extBuilder1 := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder1.Release() - b1 := types.NewUUIDBuilder(extBuilder1) + b1 := types.NewUUIDBuilder(mem) defer b1.Release() for i := 0; i < arr.Len(); i++ { diff --git a/go/parquet/pqarrow/encode_arrow_test.go b/go/parquet/pqarrow/encode_arrow_test.go index 9b3419988d6..16282173a68 100644 --- a/go/parquet/pqarrow/encode_arrow_test.go +++ b/go/parquet/pqarrow/encode_arrow_test.go @@ -2053,9 +2053,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() { mem := memory.NewCheckedAllocator(memory.DefaultAllocator) defer mem.AssertSize(ps.T(), 0) - extBuilder := array.NewExtensionBuilder(mem, types.NewUUIDType()) - defer extBuilder.Release() - builder := types.NewUUIDBuilder(extBuilder) + builder := types.NewUUIDBuilder(mem) builder.Append(uuid.New()) arr := builder.NewArray() defer arr.Release() From bb23726a1f80191209f989e5b9e5c0e2baedf647 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 14:56:50 -0400 Subject: [PATCH 06/14] add general extension tests --- go/arrow/array/builder.go | 5 +- go/arrow/extensions/extensions_test.go | 89 ++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 go/arrow/extensions/extensions_test.go diff --git a/go/arrow/array/builder.go b/go/arrow/array/builder.go index d17eae1f18d..1f4d0ea9635 100644 --- a/go/arrow/array/builder.go +++ b/go/arrow/array/builder.go @@ -352,7 +352,10 @@ func NewBuilder(mem memory.Allocator, dtype arrow.DataType) Builder { if custom, ok := dtype.(CustomExtensionBuilder); ok { return custom.NewBuilder(mem) } - return NewExtensionBuilder(mem, dtype.(arrow.ExtensionType)) + if typ, ok := dtype.(arrow.ExtensionType); ok { + return NewExtensionBuilder(mem, typ) + } + panic(fmt.Errorf("arrow/array: invalid extension type: %T", dtype)) case arrow.FIXED_SIZE_LIST: typ := dtype.(*arrow.FixedSizeListType) return NewFixedSizeListBuilderWithField(mem, typ.Len(), typ.ElemField()) diff --git a/go/arrow/extensions/extensions_test.go b/go/arrow/extensions/extensions_test.go new file mode 100644 index 00000000000..00b20a157fc --- /dev/null +++ b/go/arrow/extensions/extensions_test.go @@ -0,0 +1,89 @@ +package extensions_test + +import ( + "bytes" + "fmt" + "reflect" + "testing" + + "github.com/apache/arrow/go/v17/arrow" + "github.com/apache/arrow/go/v17/arrow/array" + "github.com/apache/arrow/go/v17/arrow/extensions" + "github.com/apache/arrow/go/v17/arrow/memory" + "github.com/stretchr/testify/require" +) + +// testBool8Type minimally implements arrow.ExtensionType, but importantly does not implement array.CustomExtensionBuilder +// so it will fall back to the storage type's default builder. +type testBool8Type struct { + arrow.ExtensionBase +} + +func newTestBool8Type() *testBool8Type { + return &testBool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Int8}} +} + +func (t *testBool8Type) ArrayType() reflect.Type { return reflect.TypeOf(testBool8Array{}) } +func (t *testBool8Type) ExtensionEquals(arrow.ExtensionType) bool { panic("unimplemented") } +func (t *testBool8Type) ExtensionName() string { panic("unimplemented") } +func (t *testBool8Type) Serialize() string { panic("unimplemented") } +func (t *testBool8Type) Deserialize(arrow.DataType, string) (arrow.ExtensionType, error) { + panic("unimplemented") +} + +type testBool8Array struct { + array.ExtensionArrayBase +} + +func TestUnmarshalExtensionTypes(t *testing.T) { + logicalJSON := `[true,null,false,null,true]` + storageJSON := `[1,null,0,null,1]` + + // extensions.Bool8Type implements array.CustomExtensionBuilder so we expect the array to be built with the custom builder + arrCustomBuilder, _, err := array.FromJSON(memory.DefaultAllocator, extensions.NewBool8Type(), bytes.NewBufferString(logicalJSON)) + require.NoError(t, err) + defer arrCustomBuilder.Release() + require.Equal(t, 5, arrCustomBuilder.Len()) + + // testBoolType falls back to the default builder for the storage type, so it cannot deserialize native booleans + _, _, err = array.FromJSON(memory.DefaultAllocator, newTestBool8Type(), bytes.NewBufferString(logicalJSON)) + require.ErrorContains(t, err, "cannot unmarshal true into Go value of type int8") + + // testBoolType must build the array with the native storage type: Int8 + arrDefaultBuilder, _, err := array.FromJSON(memory.DefaultAllocator, newTestBool8Type(), bytes.NewBufferString(storageJSON)) + require.NoError(t, err) + defer arrDefaultBuilder.Release() + require.Equal(t, 5, arrDefaultBuilder.Len()) + + arrBool8, ok := arrCustomBuilder.(*extensions.Bool8Array) + require.True(t, ok) + + arrExt, ok := arrDefaultBuilder.(array.ExtensionArray) + require.True(t, ok) + + // The physical layout of both arrays is identical + require.True(t, array.Equal(arrBool8.Storage(), arrExt.Storage())) +} + +// invalidExtensionType does not fully implement the arrow.ExtensionType interface, even though it embeds arrow.ExtensionBase +type invalidExtensionType struct { + arrow.ExtensionBase +} + +func newInvalidExtensionType() *invalidExtensionType { + return &invalidExtensionType{ExtensionBase: arrow.ExtensionBase{Storage: arrow.BinaryTypes.String}} +} + +func TestInvalidExtensionType(t *testing.T) { + jsonStr := `["one","two","three"]` + typ := newInvalidExtensionType() + + require.PanicsWithError(t, fmt.Sprintf("arrow/array: invalid extension type: %T", typ), func() { + array.FromJSON(memory.DefaultAllocator, typ, bytes.NewBufferString(jsonStr)) + }) +} + +var ( + _ arrow.ExtensionType = (*testBool8Type)(nil) + _ array.ExtensionArray = (*testBool8Array)(nil) +) From 19ef649aac09ecfc3e33c638da4c4781368c4724 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 15:06:14 -0400 Subject: [PATCH 07/14] incorporate review suggestions --- go/arrow/extensions/bool8.go | 10 ++++------ go/arrow/type_traits.go | 8 -------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index f06ec9d3103..7c66d8ddc3c 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -5,6 +5,7 @@ import ( "reflect" "strconv" "strings" + "unsafe" "github.com/apache/arrow/go/v17/arrow" "github.com/apache/arrow/go/v17/arrow/array" @@ -19,6 +20,7 @@ type Bool8Type struct { arrow.ExtensionBase } +// NewBool8Type creates a new Bool8Type with the underlying storage type set correctly to Int8. func NewBool8Type() *Bool8Type { return &Bool8Type{ExtensionBase: arrow.ExtensionBase{Storage: arrow.PrimitiveTypes.Int8}} } @@ -26,9 +28,6 @@ func NewBool8Type() *Bool8Type { func (b *Bool8Type) ArrayType() reflect.Type { return reflect.TypeOf(Bool8Array{}) } func (b *Bool8Type) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) { - if data != ExtensionNameBool8 { - return nil, fmt.Errorf("type identifier did not match: '%s'", data) - } if !arrow.TypeEqual(storageType, arrow.PrimitiveTypes.Int8) { return nil, fmt.Errorf("invalid storage type for Bool8Type: %s", storageType.Name()) } @@ -41,7 +40,7 @@ func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { func (b *Bool8Type) ExtensionName() string { return ExtensionNameBool8 } -func (b *Bool8Type) Serialize() string { return ExtensionNameBool8 } +func (b *Bool8Type) Serialize() string { return "" } func (b *Bool8Type) String() string { return fmt.Sprintf("Bool8", b.Storage) } @@ -149,8 +148,7 @@ func (b *Bool8Builder) AppendValueFromString(s string) error { } func (b *Bool8Builder) AppendValues(v []bool, valid []bool) { - boolsAsBytes := arrow.GetBool8Bytes(v) - boolsAsInt8s := arrow.GetData[int8](boolsAsBytes) + boolsAsInt8s := unsafe.Slice((*int8)(unsafe.Pointer(unsafe.SliceData(v))), len(v)) b.ExtensionBuilder.Builder.(*array.Int8Builder).AppendValues(boolsAsInt8s, valid) } diff --git a/go/arrow/type_traits.go b/go/arrow/type_traits.go index 93bd86cb283..aae6ad10648 100644 --- a/go/arrow/type_traits.go +++ b/go/arrow/type_traits.go @@ -117,14 +117,6 @@ func GetBytes[T FixedWidthType | ViewHeader](in []T) []byte { return reinterpretSlice[byte](in) } -// GetBool8Bytes reinterprets a slice of bool to a slice of bytes. -// -// NOTE: This must not be used for standard Arrow Boolean arrays as they are bitpacked. -// The resulting byte slice is only valid for the Bool8 extension type. -func GetBool8Bytes(in []bool) []byte { - return reinterpretSlice[byte](in) -} - // GetData reinterprets a slice of bytes to a slice of T. // // NOTE: the buffer's length must be a multiple of Sizeof(T). From c518e422b07801d9d8c704bfd3504f5a133b404f Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Fri, 12 Jul 2024 15:31:10 -0400 Subject: [PATCH 08/14] add missing license headers --- go/arrow/extensions/bool8.go | 16 ++++++++++++++++ go/arrow/extensions/bool8_test.go | 16 ++++++++++++++++ go/arrow/extensions/extensions_test.go | 16 ++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 7c66d8ddc3c..4426e85ea06 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package extensions import ( diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index 20c2e7e4978..ea0727ac359 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package extensions_test import ( diff --git a/go/arrow/extensions/extensions_test.go b/go/arrow/extensions/extensions_test.go index 00b20a157fc..b28c00133cf 100644 --- a/go/arrow/extensions/extensions_test.go +++ b/go/arrow/extensions/extensions_test.go @@ -1,3 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package extensions_test import ( From 5f6aa097605cfa85f512927c38caac52d2a29bbd Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 24 Jul 2024 07:06:03 -0400 Subject: [PATCH 09/14] fix extension name and string --- go/arrow/extensions/bool8.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 4426e85ea06..057e7b4809d 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -29,8 +29,6 @@ import ( "github.com/apache/arrow/go/v17/internal/json" ) -const ExtensionNameBool8 = "bool8" - // Bool8Type represents a logical boolean that is stored using 8 bits. type Bool8Type struct { arrow.ExtensionBase @@ -54,11 +52,11 @@ func (b *Bool8Type) ExtensionEquals(other arrow.ExtensionType) bool { return b.ExtensionName() == other.ExtensionName() } -func (b *Bool8Type) ExtensionName() string { return ExtensionNameBool8 } +func (b *Bool8Type) ExtensionName() string { return "arrow.bool8" } func (b *Bool8Type) Serialize() string { return "" } -func (b *Bool8Type) String() string { return fmt.Sprintf("Bool8", b.Storage) } +func (b *Bool8Type) String() string { return fmt.Sprintf("extension<%s>", b.ExtensionName()) } func (*Bool8Type) NewBuilder(mem memory.Allocator) array.Builder { return NewBool8Builder(mem) From d13774e9326ed2d8c28c8a77f3e3fde117d88412 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 24 Jul 2024 07:06:16 -0400 Subject: [PATCH 10/14] add ipc test --- go/arrow/extensions/bool8_test.go | 42 +++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index ea0727ac359..da89594c478 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -19,11 +19,13 @@ package extensions_test import ( "bytes" "fmt" + "strings" "testing" "github.com/apache/arrow/go/v17/arrow" "github.com/apache/arrow/go/v17/arrow/array" "github.com/apache/arrow/go/v17/arrow/extensions" + "github.com/apache/arrow/go/v17/arrow/ipc" "github.com/apache/arrow/go/v17/arrow/memory" "github.com/apache/arrow/go/v17/internal/json" "github.com/stretchr/testify/assert" @@ -147,6 +149,46 @@ func TestCompareBool8AndBoolean(t *testing.T) { } } +func TestBool8TypeBatchIPCRoundTrip(t *testing.T) { + typ := extensions.NewBool8Type() + arrow.RegisterExtensionType(typ) + defer arrow.UnregisterExtensionType(typ.ExtensionName()) + + storage, _, err := array.FromJSON(memory.DefaultAllocator, arrow.PrimitiveTypes.Int8, + strings.NewReader(`[-1, 0, 1, 2, null]`)) + require.NoError(t, err) + defer storage.Release() + + arr := array.NewExtensionArrayWithStorage(typ, storage) + defer arr.Release() + + batch := array.NewRecord(arrow.NewSchema([]arrow.Field{{Name: "field", Type: typ, Nullable: true}}, nil), + []arrow.Array{arr}, -1) + defer batch.Release() + + var written arrow.Record + { + var buf bytes.Buffer + wr := ipc.NewWriter(&buf, ipc.WithSchema(batch.Schema())) + require.NoError(t, wr.Write(batch)) + require.NoError(t, wr.Close()) + + rdr, err := ipc.NewReader(&buf) + require.NoError(t, err) + written, err = rdr.Read() + require.NoError(t, err) + written.Retain() + defer written.Release() + rdr.Release() + } + + assert.Truef(t, batch.Schema().Equal(written.Schema()), "expected: %s, got: %s", + batch.Schema(), written.Schema()) + + assert.Truef(t, array.RecordEqual(batch, written), "expected: %s, got: %s", + batch, written) +} + func BenchmarkWriteBool8Array(b *testing.B) { bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) defer bool8bldr.Release() From e282e53ca1f0f05b77fdc15df625355867fe7696 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Wed, 24 Jul 2024 07:16:08 -0400 Subject: [PATCH 11/14] inline single-use function --- go/arrow/extensions/bool8.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 057e7b4809d..51dcf3c3f7d 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -87,7 +87,7 @@ func (a *Bool8Array) String() string { } func (a *Bool8Array) Value(i int) bool { - return int8ToBool(a.Storage().(*array.Int8).Value(i)) + return a.Storage().(*array.Int8).Value(i) != 0 } func (a *Bool8Array) ValueStr(i int) string { @@ -124,10 +124,6 @@ func boolToInt8(v bool) int8 { return res } -func int8ToBool(v int8) bool { - return v != 0 -} - // Bool8Builder is a convenience builder for the Bool8 extension type, // allowing arrays to be built with boolean values rather than the underlying storage type. type Bool8Builder struct { From 524bfbd9d63ea3c8f6e864674bfefe68676cb337 Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Mon, 12 Aug 2024 13:34:11 -0400 Subject: [PATCH 12/14] bump 17 to 18 --- go/arrow/array/extension_builder.go | 2 +- go/arrow/extensions/bool8.go | 8 ++++---- go/arrow/extensions/bool8_test.go | 12 ++++++------ go/arrow/extensions/extensions_test.go | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/go/arrow/array/extension_builder.go b/go/arrow/array/extension_builder.go index 9c4c8a85e61..9c2ee880564 100644 --- a/go/arrow/array/extension_builder.go +++ b/go/arrow/array/extension_builder.go @@ -16,7 +16,7 @@ package array -import "github.com/apache/arrow/go/v17/arrow/memory" +import "github.com/apache/arrow/go/v18/arrow/memory" // CustomExtensionBuilder is an interface that custom extension types may implement to provide a custom builder // instead of the underlying storage type's builder when array.NewBuilder is called with that type. diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 51dcf3c3f7d..3d8a9e16d9e 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -23,10 +23,10 @@ import ( "strings" "unsafe" - "github.com/apache/arrow/go/v17/arrow" - "github.com/apache/arrow/go/v17/arrow/array" - "github.com/apache/arrow/go/v17/arrow/memory" - "github.com/apache/arrow/go/v17/internal/json" + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" ) // Bool8Type represents a logical boolean that is stored using 8 bits. diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index da89594c478..b64d8147130 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -22,12 +22,12 @@ import ( "strings" "testing" - "github.com/apache/arrow/go/v17/arrow" - "github.com/apache/arrow/go/v17/arrow/array" - "github.com/apache/arrow/go/v17/arrow/extensions" - "github.com/apache/arrow/go/v17/arrow/ipc" - "github.com/apache/arrow/go/v17/arrow/memory" - "github.com/apache/arrow/go/v17/internal/json" + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/ipc" + "github.com/apache/arrow/go/v18/arrow/memory" + "github.com/apache/arrow/go/v18/internal/json" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) diff --git a/go/arrow/extensions/extensions_test.go b/go/arrow/extensions/extensions_test.go index b28c00133cf..f56fed5e132 100644 --- a/go/arrow/extensions/extensions_test.go +++ b/go/arrow/extensions/extensions_test.go @@ -22,10 +22,10 @@ import ( "reflect" "testing" - "github.com/apache/arrow/go/v17/arrow" - "github.com/apache/arrow/go/v17/arrow/array" - "github.com/apache/arrow/go/v17/arrow/extensions" - "github.com/apache/arrow/go/v17/arrow/memory" + "github.com/apache/arrow/go/v18/arrow" + "github.com/apache/arrow/go/v18/arrow/array" + "github.com/apache/arrow/go/v18/arrow/extensions" + "github.com/apache/arrow/go/v18/arrow/memory" "github.com/stretchr/testify/require" ) From dbd9219e63c71845208e948429846073acb3544e Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Mon, 12 Aug 2024 14:28:47 -0400 Subject: [PATCH 13/14] add godoc and fix unmarshal --- go/arrow/extensions/bool8.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 3d8a9e16d9e..48422660f50 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -116,6 +116,8 @@ func (a *Bool8Array) GetOneForMarshal(i int) interface{} { return a.Value(i) } +// boolToInt8 performs the simple scalar conversion of bool to the canonical int8 +// value for the Bool8Type. func boolToInt8(v bool) int8 { var res int8 if v { @@ -130,6 +132,8 @@ type Bool8Builder struct { *array.ExtensionBuilder } +// NewBool8Builder creates a new Bool8Builder, exposing a convenient and efficient interface +// for writing boolean values to the underlying int8 storage array. func NewBool8Builder(mem memory.Allocator) *Bool8Builder { return &Bool8Builder{ExtensionBuilder: array.NewExtensionBuilder(mem, NewBool8Type())} } @@ -174,6 +178,9 @@ func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { return nil case string: return b.AppendValueFromString(v) + case int8: + b.ExtensionBuilder.Builder.(*array.Int8Builder).Append(v) + return nil case nil: b.AppendNull() return nil @@ -182,7 +189,7 @@ func (b *Bool8Builder) UnmarshalOne(dec *json.Decoder) error { Value: fmt.Sprint(t), Type: reflect.TypeOf([]byte{}), Offset: dec.InputOffset(), - Struct: fmt.Sprintf("FixedSizeBinary[%d]", 16), + Struct: "Bool8Builder", } } } From 424abd8152e7eba873789b1367d7624a6113bb3c Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Mon, 12 Aug 2024 14:57:07 -0400 Subject: [PATCH 14/14] add access to reinterpretted bool slice --- go/arrow/extensions/bool8.go | 5 +++++ go/arrow/extensions/bool8_test.go | 37 +++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/go/arrow/extensions/bool8.go b/go/arrow/extensions/bool8.go index 48422660f50..20ab024a2a2 100644 --- a/go/arrow/extensions/bool8.go +++ b/go/arrow/extensions/bool8.go @@ -90,6 +90,11 @@ func (a *Bool8Array) Value(i int) bool { return a.Storage().(*array.Int8).Value(i) != 0 } +func (a *Bool8Array) BoolValues() []bool { + int8s := a.Storage().(*array.Int8).Int8Values() + return unsafe.Slice((*bool)(unsafe.Pointer(unsafe.SliceData(int8s))), len(int8s)) +} + func (a *Bool8Array) ValueStr(i int) string { switch { case a.IsNull(i): diff --git a/go/arrow/extensions/bool8_test.go b/go/arrow/extensions/bool8_test.go index b64d8147130..9f7365d1555 100644 --- a/go/arrow/extensions/bool8_test.go +++ b/go/arrow/extensions/bool8_test.go @@ -149,6 +149,33 @@ func TestCompareBool8AndBoolean(t *testing.T) { } } +func TestReinterpretStorageEqualToValues(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.DefaultAllocator) + defer mem.AssertSize(t, 0) + + bool8bldr := extensions.NewBool8Builder(mem) + defer bool8bldr.Release() + + inputVals := []bool{true, false, false, false, true} + inputValidity := []bool{true, false, true, false, true} + + bool8bldr.AppendValues(inputVals, inputValidity) + bool8Arr := bool8bldr.NewExtensionArray().(*extensions.Bool8Array) + defer bool8Arr.Release() + + boolValsCopy := make([]bool, bool8Arr.Len()) + for i := 0; i < bool8Arr.Len(); i++ { + boolValsCopy[i] = bool8Arr.Value(i) + } + + boolValsZeroCopy := bool8Arr.BoolValues() + + require.Equal(t, len(boolValsZeroCopy), len(boolValsCopy)) + for i := range boolValsCopy { + require.Equal(t, boolValsZeroCopy[i], boolValsCopy[i]) + } +} + func TestBool8TypeBatchIPCRoundTrip(t *testing.T) { typ := extensions.NewBool8Type() arrow.RegisterExtensionType(typ) @@ -233,6 +260,9 @@ func BenchmarkWriteBooleanArray(b *testing.B) { } } +// storage benchmark result at package level to prevent compiler from eliminating the function call +var result []bool + func BenchmarkReadBool8Array(b *testing.B) { bool8bldr := extensions.NewBool8Builder(memory.DefaultAllocator) defer bool8bldr.Release() @@ -241,7 +271,6 @@ func BenchmarkReadBool8Array(b *testing.B) { b.Run(fmt.Sprintf("len %d", sz), func(b *testing.B) { values := make([]bool, sz) - output := make([]bool, sz) for idx := range values { values[idx] = true } @@ -250,13 +279,13 @@ func BenchmarkReadBool8Array(b *testing.B) { bool8Arr := bool8bldr.NewArray().(*extensions.Bool8Array) defer bool8Arr.Release() + var r []bool b.ResetTimer() b.SetBytes(int64(len(values))) for n := 0; n < b.N; n++ { - for i := 0; i < bool8Arr.Len(); i++ { - output[i] = bool8Arr.Value(i) - } + r = bool8Arr.BoolValues() } + result = r }) } }