Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions cpp/build-support/fuzzing/pack_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,20 @@


def process_dir(corpus_dir, zip_output):
seen = set()
seen_hashes = {}

for child in corpus_dir.iterdir():
if not child.is_file():
raise IOError(f"Not a file: {child}")
with child.open('rb') as f:
data = f.read()
arcname = hashlib.sha1(data).hexdigest()
if arcname in seen:
raise ValueError(f"Duplicate hash: {arcname} (in file {child})")
if arcname in seen_hashes:
raise ValueError(
f"Duplicate hash: {arcname} (in file {child}), "
f"already seen in file {seen_hashes[arcname]}")
zip_output.writestr(str(arcname), data)
seen.add(arcname)
seen_hashes[arcname] = child


def main(corpus_dir, zip_output_name):
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/ipc/test_common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -474,7 +474,7 @@ Status MakeListViewRecordBatchSized(const int length, std::shared_ptr<RecordBatc
}

Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
return MakeListRecordBatchSized(200, out);
return MakeListViewRecordBatchSized(200, out);
}

Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
Expand Down
18 changes: 15 additions & 3 deletions cpp/src/arrow/ipc/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -329,15 +329,24 @@ class RecordBatchSerializer {
return Status::OK();
}

int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
if (array.value_offset(0) > 0) {
const int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);

offset_type first_offset = 0;
RETURN_NOT_OK(MemoryManager::CopyBufferSliceToCPU(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'll need some context here. Is this function supposed to be able to deal with non-CPU arrays? Does this rule apply to other functions in this source file too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's supposed to.

array.data()->buffers[1], array.offset() * sizeof(offset_type),
sizeof(offset_type), reinterpret_cast<uint8_t*>(&first_offset)));

if (first_offset > 0) {
// If the offset of the first value is non-zero, then we must create a new
// offsets buffer with shifted offsets.
if (!array.data()->buffers[1]->is_cpu()) {
return Status::NotImplemented("Rebasing non-CPU offsets");
}
Comment on lines +342 to +344
Copy link
Contributor

@zanmato1984 zanmato1984 Jun 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two questions:

  1. Is this error handling necessary to fix the "CUDA tests" mentioned in the PR description?
  2. It appears that if the CUDA tests were failing with accessing the non-CPU first_offset, and the above MemoryManager::CopyBufferSliceToCPU is supposed to fix it, wouldn't this check fail the tests again with Unimplemented? (I assume the said tests are exercising non-CPU buffers with first_offset > 0.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Yes
  2. This codepath is only taken with a non-zero first offset, but the CUDA tests don't exercise that (otherwise they would have been crashing on git main too)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've opened #46747 for this.

ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
AllocateBuffer(required_bytes, options_.memory_pool));

auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
const offset_type* source_offsets = array.raw_value_offsets();
auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
const offset_type start_offset = source_offsets[0];

for (int i = 0; i <= array.length(); ++i) {
Expand Down Expand Up @@ -369,6 +378,9 @@ class RecordBatchSerializer {
// If we have a non-zero offset, it's likely that the smallest offset is
// not zero. We must a) create a new offsets array with shifted offsets and
// b) slice the values array accordingly.
if (!array.data()->buffers[1]->is_cpu()) {
return Status::NotImplemented("Rebasing non-CPU list view offsets");
}

ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
AllocateBuffer(required_bytes, options_.memory_pool));
Expand Down
Loading