From cdf2ba04efc87caefdacfbc87873b1d7b4f7c7e3 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Sat, 13 Jan 2024 00:44:48 +0800 Subject: [PATCH 1/2] Fix --- cpp/src/arrow/compute/light_array.cc | 21 +++---- cpp/src/arrow/compute/light_array_test.cc | 75 +++++++++++++++++++++-- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc index 73ea01a03a8..66d8477b029 100644 --- a/cpp/src/arrow/compute/light_array.cc +++ b/cpp/src/arrow/compute/light_array.cc @@ -383,27 +383,22 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr& column, KeyColumnMetadata column_metadata = ColumnMetadataFromDataType(column->type).ValueOrDie(); + ARROW_DCHECK(!column_metadata.is_fixed_length || column_metadata.fixed_length > 0); int num_rows_left = num_rows; int num_bytes_skipped = 0; while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) { + --num_rows_left; + int row_id_removed = row_ids[num_rows_left]; if (column_metadata.is_fixed_length) { - if (column_metadata.fixed_length == 0) { - num_rows_left = std::max(num_rows_left, 8) - 8; - ++num_bytes_skipped; - } else { - --num_rows_left; - num_bytes_skipped += column_metadata.fixed_length; - } + num_bytes_skipped += column_metadata.fixed_length; } else { - --num_rows_left; - int row_id_removed = row_ids[num_rows_left]; const int32_t* offsets = column->GetValues(1); num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed]; - // Skip consecutive rows with the same id - while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) { - --num_rows_left; - } + } + // Skip consecutive rows with the same id + while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) { + --num_rows_left; } } diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 3ceba43604b..a395504383b 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -474,15 +474,18 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) { TEST(ExecBatchBuilder, AppendBatchDupRows) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); + // Case of cross-word copying for the last row, which may exceed the buffer boundary. - // This is a simplified case of GH-32570 + // { + // This is a simplified case of GH-32570 // 64-byte data fully occupying one minimal 64-byte aligned memory region. - ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"], - ["123456789ABCDEF0"], - ["123456789ABCDEF0"], - ["ABCDEF0"], - ["123456789"]])"); // 9-byte tail row, larger than a word. + ExecBatch batch_string = JSONToExecBatch({binary()}, R"([ + ["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["123456789ABCDEF0"], + ["ABCDEF0"], + ["123456789"]])"); // 9-byte tail row, larger than a word. ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64); ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64); ExecBatchBuilder builder; @@ -494,6 +497,66 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) { ASSERT_EQ(batch_string_appended, built); ASSERT_NE(0, pool->bytes_allocated()); } + + { + // This is a simplified case of GH-39358, using fsb(3) type. + // 63-byte data occupying almost one minimal 64-byte aligned memory region. + ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([ + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["000"], + ["123"]])"); // 3-byte tail row, not aligned to a word. + ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64); + ExecBatchBuilder builder; + uint16_t row_ids[4] = {20, 20, 20, + 20}; // Get the last row 4 times, 3 to skip a word. + ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 4, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ExecBatch batch_fsb_appended = JSONToExecBatch( + {fixed_size_binary(3)}, R"([["123"], ["123"], ["123"], ["123"]])"); + ASSERT_EQ(batch_fsb_appended, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + + { + // This is a simplified case of GH-39358, using fsb(9) type. + // 63-byte data occupying almost one minimal 64-byte aligned memory region. + ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([ + ["000000000"], + ["000000000"], + ["000000000"], + ["000000000"], + ["000000000"], + ["000000000"], + ["123456789"]])"); // 9-byte tail row, not aligned to a word. + ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64); + ExecBatchBuilder builder; + uint16_t row_ids[2] = {6, 6}; // Get the last row 2 times, 1 to skip a word. + ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 2, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ExecBatch batch_fsb_appended = + JSONToExecBatch({fixed_size_binary(9)}, R"([["123456789"], ["123456789"]])"); + ASSERT_EQ(batch_fsb_appended, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); } From 20d9a3829589765f804899b94438ffb3fa0cca14 Mon Sep 17 00:00:00 2001 From: zanmato1984 Date: Sat, 13 Jan 2024 00:47:41 +0800 Subject: [PATCH 2/2] Typo --- cpp/src/arrow/compute/light_array_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index a395504383b..d50e9675517 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -499,7 +499,7 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) { } { - // This is a simplified case of GH-39358, using fsb(3) type. + // This is a simplified case of GH-39583, using fsb(3) type. // 63-byte data occupying almost one minimal 64-byte aligned memory region. ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([ ["000"], @@ -536,7 +536,7 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) { } { - // This is a simplified case of GH-39358, using fsb(9) type. + // This is a simplified case of GH-39583, using fsb(9) type. // 63-byte data occupying almost one minimal 64-byte aligned memory region. ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([ ["000000000"],