apache · pitrou · Jan 17, 2024 · Jan 12, 2024 · Jan 12, 2024 · pitrou
diff --git a/cpp/src/arrow/compute/light_array.cc b/cpp/src/arrow/compute/light_array.cc
@@ -383,27 +383,22 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr<ArrayData>& column,
 
   KeyColumnMetadata column_metadata =
       ColumnMetadataFromDataType(column->type).ValueOrDie();
+  ARROW_DCHECK(!column_metadata.is_fixed_length || column_metadata.fixed_length > 0);
 
   int num_rows_left = num_rows;
   int num_bytes_skipped = 0;
   while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) {
+    --num_rows_left;
+    int row_id_removed = row_ids[num_rows_left];
     if (column_metadata.is_fixed_length) {
-      if (column_metadata.fixed_length == 0) {
-        num_rows_left = std::max(num_rows_left, 8) - 8;
-        ++num_bytes_skipped;
 case 0: 
   CollectBits(source->buffers[1]->data(), source->offset, target->mutable_data(1), 
               num_rows_before, num_rows_to_append, row_ids); 
   break; 
 case 1: 
   Visit(source, num_rows_to_append, row_ids, 
         [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
           target->mutable_data(1)[num_rows_before + i] = *ptr; 
         }); 
   break; 
 case 2: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint16_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint16_t*>(ptr); 
       }); 
   break; 
 case 4: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint32_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint32_t*>(ptr); 
       }); 
   break; 
 case 8: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint64_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint64_t*>(ptr); 
       }); 
   break; 
 case 0: 
   CollectBits(source->buffers[1]->data(), source->offset, target->mutable_data(1), 
               num_rows_before, num_rows_to_append, row_ids); 
   break; 
 case 1: 
   Visit(source, num_rows_to_append, row_ids, 
         [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
           target->mutable_data(1)[num_rows_before + i] = *ptr; 
         }); 
   break; 
 case 2: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint16_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint16_t*>(ptr); 
       }); 
   break; 
 case 4: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint32_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint32_t*>(ptr); 
       }); 
   break; 
 case 8: 
   Visit( 
       source, num_rows_to_append, row_ids, 
       [&](int i, const uint8_t* ptr, uint32_t num_bytes) { 
         reinterpret_cast<uint64_t*>(target->mutable_data(1))[num_rows_before + i] = 
             *reinterpret_cast<const uint64_t*>(ptr); 
       }); 
   break; 
-      } else {
-        --num_rows_left;
-        num_bytes_skipped += column_metadata.fixed_length;
-      }
+      num_bytes_skipped += column_metadata.fixed_length;
     } else {
-      --num_rows_left;
-      int row_id_removed = row_ids[num_rows_left];
       const int32_t* offsets = column->GetValues<int32_t>(1);
       num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed];
-      // Skip consecutive rows with the same id
-      while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
-        --num_rows_left;
-      }
+    }
+    // Skip consecutive rows with the same id
+    while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
+      --num_rows_left;
     }
   }
 

diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
@@ -474,15 +474,18 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
 TEST(ExecBatchBuilder, AppendBatchDupRows) {
   std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
   MemoryPool* pool = owned_pool.get();
+
   // Case of cross-word copying for the last row, which may exceed the buffer boundary.
-  // This is a simplified case of GH-32570
+  //
   {
+    // This is a simplified case of GH-32570
     // 64-byte data fully occupying one minimal 64-byte aligned memory region.
-    ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"],
-      ["123456789ABCDEF0"],
-      ["123456789ABCDEF0"],
-      ["ABCDEF0"],
-      ["123456789"]])");  // 9-byte tail row, larger than a word.
+    ExecBatch batch_string = JSONToExecBatch({binary()}, R"([
+        ["123456789ABCDEF0"],
+        ["123456789ABCDEF0"],
+        ["123456789ABCDEF0"],
+        ["ABCDEF0"],
+        ["123456789"]])");  // 9-byte tail row, larger than a word.
     ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64);
     ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64);
     ExecBatchBuilder builder;
@@ -494,6 +497,66 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) {
     ASSERT_EQ(batch_string_appended, built);
     ASSERT_NE(0, pool->bytes_allocated());
   }
+
+  {
+    // This is a simplified case of GH-39583, using fsb(3) type.
+    // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+    ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["123"]])");  // 3-byte tail row, not aligned to a word.
+    ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+    ExecBatchBuilder builder;
+    uint16_t row_ids[4] = {20, 20, 20,
+                           20};  // Get the last row 4 times, 3 to skip a word.
+    ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 4, row_ids, /*num_cols=*/1));
+    ExecBatch built = builder.Flush();
+    ExecBatch batch_fsb_appended = JSONToExecBatch(
+        {fixed_size_binary(3)}, R"([["123"], ["123"], ["123"], ["123"]])");
+    ASSERT_EQ(batch_fsb_appended, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+
+  {
+    // This is a simplified case of GH-39583, using fsb(9) type.
+    // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+    ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["123456789"]])");  // 9-byte tail row, not aligned to a word.
+    ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+    ExecBatchBuilder builder;
+    uint16_t row_ids[2] = {6, 6};  // Get the last row 2 times, 1 to skip a word.
+    ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 2, row_ids, /*num_cols=*/1));
+    ExecBatch built = builder.Flush();
+    ExecBatch batch_fsb_appended =
+        JSONToExecBatch({fixed_size_binary(9)}, R"([["123456789"], ["123456789"]])");
+    ASSERT_EQ(batch_fsb_appended, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+
   ASSERT_EQ(0, pool->bytes_allocated());
 }