From 6c8431900ec978cea3ef19977ca240ff92be549b Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Wed, 7 May 2025 17:36:11 +0200 Subject: [PATCH] add TestDeltaByteArray test and update parquet-testing This is intended to compare with v18 of the arrow-go repository to prove that the delta_byte_array.parquet file cannot be read successfully --- cpp/submodules/parquet-testing | 2 +- go/parquet/file/file_reader_test.go | 53 +++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 74278bc4a11..39b91cf8530 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 74278bc4a1122d74945969e6dec405abd1533ec3 +Subproject commit 39b91cf853062d92f0d20581d37b20dabe70a6a0 diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 8056a837ea1..cd381beeb22 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -18,8 +18,10 @@ package file_test import ( "bytes" + "context" "crypto/rand" "encoding/binary" + "encoding/csv" "io" "os" "path" @@ -34,6 +36,7 @@ import ( format "github.com/apache/arrow/go/v17/parquet/internal/gen-go/parquet" "github.com/apache/arrow/go/v17/parquet/internal/thrift" "github.com/apache/arrow/go/v17/parquet/metadata" + "github.com/apache/arrow/go/v17/parquet/pqarrow" "github.com/apache/arrow/go/v17/parquet/schema" libthrift "github.com/apache/thrift/lib/go/thrift" "github.com/stretchr/testify/assert" @@ -446,3 +449,53 @@ func TestRleBooleanEncodingFileRead(t *testing.T) { assert.Equal(t, expected, values[:len(expected)]) } + +func TestDeltaByteArray(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + expected, err := os.ReadFile(path.Join(dir, "delta_byte_array_expect.csv")) + require.NoError(t, err) + csvReader := 
csv.NewReader(bytes.NewReader(expected)) + + records, err := csvReader.ReadAll() + require.NoError(t, err) + + records = records[1:] // skip header + + props := parquet.NewReaderProperties(memory.DefaultAllocator) + fileReader, err := file.OpenParquetFile(path.Join(dir, "delta_byte_array.parquet"), + false, file.WithReadProps(props)) + require.NoError(t, err) + defer fileReader.Close() + + arrowReader, err := pqarrow.NewFileReader( + fileReader, + pqarrow.ArrowReadProperties{BatchSize: 1024}, + memory.DefaultAllocator, + ) + require.NoError(t, err) + + rr, err := arrowReader.GetRecordReader(context.Background(), nil, nil) + require.NoError(t, err) + defer rr.Release() + + for rr.Next() { + rec := rr.Record() + defer rec.Release() + + for i := 0; i < int(rec.NumCols()); i++ { + vals := rec.Column(i) + for j := 0; j < vals.Len(); j++ { + if vals.IsNull(j) { + require.Equal(t, records[j][i], "") + continue + } + require.Equal(t, records[j][i], vals.ValueStr(j)) + } + } + } +}