From 7ee42d19e8abc0f4dd9408dc41adf7ca3ca24b2a Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Wed, 4 Jun 2025 09:54:42 +0200
Subject: [PATCH] GH-46704: [C++] Fix OSS-Fuzz build failure

PR #46408 mistakenly changed the list-view IPC tests to use the same data
as the list tests. This was detected as a duplicate corpus file by the
OSS-Fuzz CI build.

This PR also includes a fix for a regression in the CUDA tests, caused by
reading non-CPU memory.
---
 cpp/build-support/fuzzing/pack_corpus.py | 10 ++++++----
 cpp/src/arrow/ipc/test_common.cc         |  2 +-
 cpp/src/arrow/ipc/writer.cc              | 18 +++++++++++++++---
 3 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/cpp/build-support/fuzzing/pack_corpus.py b/cpp/build-support/fuzzing/pack_corpus.py
index 07fc09f9026..94d9a88b387 100755
--- a/cpp/build-support/fuzzing/pack_corpus.py
+++ b/cpp/build-support/fuzzing/pack_corpus.py
@@ -27,7 +27,7 @@
 
 
 def process_dir(corpus_dir, zip_output):
-    seen = set()
+    seen_hashes = {}
 
     for child in corpus_dir.iterdir():
         if not child.is_file():
@@ -35,10 +35,12 @@ def process_dir(corpus_dir, zip_output):
         with child.open('rb') as f:
             data = f.read()
         arcname = hashlib.sha1(data).hexdigest()
-        if arcname in seen:
-            raise ValueError(f"Duplicate hash: {arcname} (in file {child})")
+        if arcname in seen_hashes:
+            raise ValueError(
+                f"Duplicate hash: {arcname} (in file {child}), "
+                f"already seen in file {seen_hashes[arcname]}")
         zip_output.writestr(str(arcname), data)
-        seen.add(arcname)
+        seen_hashes[arcname] = child
 
 
 def main(corpus_dir, zip_output_name):
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 46060a0db10..a739990fc93 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -474,7 +474,7 @@ Status MakeListViewRecordBatchSized(const int length,
 }
 
 Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
-  return MakeListRecordBatchSized(200, out);
+  return MakeListViewRecordBatchSized(200, out);
 }
 
 Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 8b7d943fc71..4238ecbf3a0 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -329,15 +329,24 @@ class RecordBatchSerializer {
       return Status::OK();
     }
 
-    int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
-    if (array.value_offset(0) > 0) {
+    const int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
+
+    offset_type first_offset = 0;
+    RETURN_NOT_OK(MemoryManager::CopyBufferSliceToCPU(
+        array.data()->buffers[1], array.offset() * sizeof(offset_type),
+        sizeof(offset_type), reinterpret_cast<uint8_t*>(&first_offset)));
+
+    if (first_offset > 0) {
       // If the offset of the first value is non-zero, then we must create a new
       // offsets buffer with shifted offsets.
+      if (!array.data()->buffers[1]->is_cpu()) {
+        return Status::NotImplemented("Rebasing non-CPU offsets");
+      }
       ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
                             AllocateBuffer(required_bytes, options_.memory_pool));
 
-      auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
       const offset_type* source_offsets = array.raw_value_offsets();
+      auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
       const offset_type start_offset = source_offsets[0];
 
       for (int i = 0; i <= array.length(); ++i) {
@@ -369,6 +378,9 @@ class RecordBatchSerializer {
       // If we have a non-zero offset, it's likely that the smallest offset is
       // not zero. We must a) create a new offsets array with shifted offsets and
       // b) slice the values array accordingly.
+      if (!array.data()->buffers[1]->is_cpu()) {
+        return Status::NotImplemented("Rebasing non-CPU list view offsets");
+      }
       ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
                             AllocateBuffer(required_bytes, options_.memory_pool));
 
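Note on the writer.cc change: instead of dereferencing the offsets buffer directly
(which regressed the CUDA tests), the serializer now copies only the first offset
value to CPU memory via MemoryManager::CopyBufferSliceToCPU, and rebasing of
non-CPU offsets is explicitly left unimplemented. Below is a minimal sketch of
that pattern, not part of the patch; the helper name CheckFirstOffset, the
int32_t offset type, and the surrounding setup are illustrative assumptions only.

// Sketch only: device-safe peek at the first list offset, after the pattern
// used in RecordBatchSerializer. CheckFirstOffset is a hypothetical helper.
#include <cstdint>
#include <memory>

#include "arrow/buffer.h"
#include "arrow/device.h"
#include "arrow/status.h"

arrow::Status CheckFirstOffset(const std::shared_ptr<arrow::Buffer>& offsets,
                               int64_t array_offset) {
  using offset_type = int32_t;  // int64_t for the "large" list variants

  // Copy a single offset value into CPU memory; this works whether the buffer
  // is a plain CPU buffer or device memory (e.g. CUDA).
  offset_type first_offset = 0;
  ARROW_RETURN_NOT_OK(arrow::MemoryManager::CopyBufferSliceToCPU(
      offsets, array_offset * sizeof(offset_type), sizeof(offset_type),
      reinterpret_cast<uint8_t*>(&first_offset)));

  if (first_offset > 0 && !offsets->is_cpu()) {
    // Rebasing would require reading the whole offsets buffer on the CPU,
    // which the serializer does not attempt for non-CPU data.
    return arrow::Status::NotImplemented("Rebasing non-CPU offsets");
  }
  return arrow::Status::OK();
}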