Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion go/arrow/array/binarybuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package array
import (
"bytes"
"encoding/base64"
"encoding/hex"
"fmt"
"math"
"reflect"
Expand Down Expand Up @@ -319,7 +320,10 @@ func (b *BinaryBuilder) UnmarshalOne(dec *json.Decoder) error {
case string:
data, err := base64.StdEncoding.DecodeString(v)
if err != nil {
return err
data, err = hex.DecodeString(v)
if err != nil {
return err
}
Comment on lines -322 to +326
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems odd to add here. Should we also add this to AppendValueFromString and update the docs that you can also use hex to specify binary strings?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... This probably came from another branch and ended up here by mistake. Will remove.

}
b.Append(data)
case []byte:
Expand Down
33 changes: 31 additions & 2 deletions go/arrow/array/compare.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package array
import (
"fmt"
"math"
"strings"

"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/float16"
Expand Down Expand Up @@ -494,13 +495,13 @@ func arrayApproxEqual(left, right arrow.Array, opt equalOption) bool {
return arrayEqualBinary(l, r)
case *String:
r := right.(*String)
return arrayEqualString(l, r)
return arrayApproxEqualString(l, r)
case *LargeBinary:
r := right.(*LargeBinary)
return arrayEqualLargeBinary(l, r)
case *LargeString:
r := right.(*LargeString)
return arrayEqualLargeString(l, r)
return arrayApproxEqualLargeString(l, r)
case *Int8:
r := right.(*Int8)
return arrayEqualInt8(l, r)
Expand Down Expand Up @@ -630,6 +631,34 @@ func validityBitmapEqual(left, right arrow.Array) bool {
return true
}

// stripNulls returns s with any trailing NUL ("\x00") bytes removed.
//
// Only trailing padding is trimmed. A NUL byte that appears before the last
// non-NUL byte is treated as part of the value and is preserved, so two strings
// that differ only by an interior NUL are still reported as different.
func stripNulls(s string) string {
	return strings.TrimRight(s, "\x00")
}

Comment on lines +634 to +637
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the Arrow spec, a String array should be all valid UTF-8. So shouldn't this only trim trailing nulls (i.e., use strings.TrimRight instead of strings.ReplaceAll)?

To this end, can you have a test with a string that contains a null inside it that isn't at the end and ensure that this approxequal only trims nulls, and doesn't strip them from within the string?

Copy link
Contributor Author

@yevgenypats yevgenypats Apr 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, I wasn't aware of the valid UTF-8 requirement, or that a string can't contain NULLs in the middle. In that case, there is no check in the string.Append method to make sure there are no nulls inside the string. Should we add this check, or strip nulls inside the string? wdyt?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So for performance reasons we don't validate the utf8 strings on Append currently and leave it up to the producer to ensure that they are passing valid utf-8 strings when constructing the array (if it's not valid utf-8 they should be using a Binary array).

On my list of things to do eventually is a "Validate" method for each array type like the C++ library has. That "Validate" method would do the UTF-8 validity check on the buffer so that a consumer can choose when they take the performance hit for validating the utf-8.

// arrayApproxEqualString reports whether left and right hold matching string
// values, comparing each pair after both have been passed through stripNulls.
//
// Slots that are null in left are skipped; right's validity is not consulted
// in this function. Iteration is bounded by left.Len().
func arrayApproxEqualString(left, right *String) bool {
	for idx, n := 0, left.Len(); idx < n; idx++ {
		if left.IsNull(idx) {
			continue
		}
		lhs := stripNulls(left.Value(idx))
		rhs := stripNulls(right.Value(idx))
		if lhs != rhs {
			return false
		}
	}
	return true
}

// arrayApproxEqualLargeString reports whether left and right hold matching
// string values, comparing each pair after both have been passed through
// stripNulls.
//
// Slots that are null in left are skipped; right's validity is not consulted
// in this function. Iteration is bounded by left.Len().
func arrayApproxEqualLargeString(left, right *LargeString) bool {
	for idx, n := 0, left.Len(); idx < n; idx++ {
		if left.IsNull(idx) {
			continue
		}
		lhs := stripNulls(left.Value(idx))
		rhs := stripNulls(right.Value(idx))
		if lhs != rhs {
			return false
		}
	}
	return true
}

func arrayApproxEqualFloat16(left, right *Float16, opt equalOption) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
Expand Down
47 changes: 47 additions & 0 deletions go/arrow/array/compare_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,48 @@ func TestArrayApproxEqual(t *testing.T) {
}
}

// TestArrayApproxEqualStrings checks array.ApproxEqual on string arrays built
// from Go string slices. It covers identical inputs, inputs that differ only by
// trailing NUL bytes, and inputs whose values genuinely differ.
//
// Each case has a distinct name so a failing subtest can be identified and
// re-run on its own with -run.
func TestArrayApproxEqualStrings(t *testing.T) {
	for _, tc := range []struct {
		name string
		a1   interface{}
		a2   interface{}
		want bool
	}{
		{
			name: "identical",
			a1:   []string{"a", "b"},
			a2:   []string{"a", "b"},
			want: true,
		},
		{
			name: "trailing null on one side",
			a1:   []string{"a", "b\x00"},
			a2:   []string{"a", "b"},
			want: true,
		},
		{
			name: "trailing nulls on both sides",
			a1:   []string{"a", "b\x00"},
			a2:   []string{"a\x00", "b"},
			want: true,
		},
		{
			// Negative case: without it the test would still pass if the
			// comparison always reported equality.
			name: "different values",
			a1:   []string{"a", "b"},
			a2:   []string{"a", "c"},
			want: false,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			// The checked allocator asserts that every buffer is released.
			mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
			defer mem.AssertSize(t, 0)

			a1 := arrayOf(mem, tc.a1, nil)
			defer a1.Release()
			a2 := arrayOf(mem, tc.a2, nil)
			defer a2.Release()

			if got, want := array.ApproxEqual(a1, a2), tc.want; got != want {
				t.Fatalf("invalid comparison: got=%v, want=%v\na1: %v\na2: %v\n", got, want, a1, a2)
			}
		})
	}
}

func TestArrayApproxEqualFloats(t *testing.T) {
f16sFrom := func(vs []float64) []float16.Num {
o := make([]float16.Num, len(vs))
Expand Down Expand Up @@ -328,7 +370,12 @@ func arrayOf(mem memory.Allocator, a interface{}, valids []bool) arrow.Array {

bldr.AppendValues(a, valids)
return bldr.NewFloat64Array()
case []string:
bldr := array.NewStringBuilder(mem)
defer bldr.Release()

bldr.AppendValues(a, valids)
return bldr.NewStringArray()
default:
panic(fmt.Errorf("arrdata: invalid data slice type %T", a))
}
Expand Down