From 3018180c6caef33863cc645d24920282d62b6788 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 8 Aug 2024 18:23:21 +0200 Subject: [PATCH 1/2] GH-38041: [C++][CI] Improve IPC fuzzing seed corpus 1. Add fuzz seeds with newer datatypes such as Run-End Encoded and String Views 2. Add fuzz seeds with buffer compression 3. Build seed corpus generation utilities even when fuzzing isn't enabled, for convenience --- cpp/src/arrow/ipc/CMakeLists.txt | 5 ++- cpp/src/arrow/ipc/generate_fuzz_corpus.cc | 44 +++++++++++++------ .../arrow/ipc/generate_tensor_fuzz_corpus.cc | 2 +- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 2fc9b145ccc..9af86cc05bf 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -71,7 +71,10 @@ endif() add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc") -if(ARROW_FUZZING) +if(ARROW_FUZZING + OR (ARROW_BUILD_UTILITIES + AND ARROW_WITH_LZ4 + AND ARROW_WITH_ZSTD)) add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc) target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB} ${ARROW_TEST_LINK_LIBS}) diff --git a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc index 682c352132a..6ccf1155d12 100644 --- a/cpp/src/arrow/ipc/generate_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_fuzz_corpus.cc @@ -33,11 +33,11 @@ #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/testing/extension_type.h" +#include "arrow/util/compression.h" #include "arrow/util/io_util.h" #include "arrow/util/key_value_metadata.h" -namespace arrow { -namespace ipc { +namespace arrow::ipc { using ::arrow::internal::CreateDir; using ::arrow::internal::PlatformFilename; @@ -88,6 +88,13 @@ Result>> Batches() { batches.push_back(batch); RETURN_NOT_OK(test::MakeFixedSizeListRecordBatch(&batch)); batches.push_back(batch); + RETURN_NOT_OK(test::MakeStringTypesRecordBatch(&batch)); + batches.push_back(batch); + RETURN_NOT_OK(test::MakeUuid(&batch)); + batches.push_back(batch); + RETURN_NOT_OK(test::MakeRunEndEncoded(&batch)); + batches.push_back(batch); + ARROW_ASSIGN_OR_RAISE(batch, MakeExtensionBatch()); batches.push_back(batch); ARROW_ASSIGN_OR_RAISE(batch, MakeMapBatch()); @@ -97,13 +104,14 @@ Result>> Batches() { } Result> SerializeRecordBatch( - const std::shared_ptr& batch, bool is_stream_format) { + const std::shared_ptr& batch, const IpcWriteOptions& options, + bool is_stream_format) { ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024)); std::shared_ptr writer; if (is_stream_format) { - ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema())); + ARROW_ASSIGN_OR_RAISE(writer, MakeStreamWriter(sink, batch->schema(), options)); } else { - ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema())); + ARROW_ASSIGN_OR_RAISE(writer, MakeFileWriter(sink, batch->schema(), options)); } RETURN_NOT_OK(writer->WriteRecordBatch(*batch)); RETURN_NOT_OK(writer->Close()); @@ -119,16 +127,27 @@ Status DoMain(bool is_stream_format, const std::string& out_dir) { return "batch-" + std::to_string(sample_num++); }; + // codec 0 is uncompressed + std::vector> codecs(3, nullptr); + ARROW_ASSIGN_OR_RAISE(codecs[1], util::Codec::Create(Compression::LZ4_FRAME)); + ARROW_ASSIGN_OR_RAISE(codecs[2], util::Codec::Create(Compression::ZSTD)); + ARROW_ASSIGN_OR_RAISE(auto batches, Batches()); + // Emit a separate file for each (batch, codec) pair for (const auto& batch : batches) { RETURN_NOT_OK(batch->ValidateFull()); - ARROW_ASSIGN_OR_RAISE(auto buf, SerializeRecordBatch(batch, is_stream_format)); - ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name())); - std::cerr << sample_fn.ToString() << std::endl; - ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString())); - RETURN_NOT_OK(file->Write(buf)); - RETURN_NOT_OK(file->Close()); + for (const auto& codec : codecs) { + IpcWriteOptions options = IpcWriteOptions::Defaults(); + options.codec = codec; + ARROW_ASSIGN_OR_RAISE(auto buf, + SerializeRecordBatch(batch, options, is_stream_format)); + ARROW_ASSIGN_OR_RAISE(auto sample_fn, dir_fn.Join(sample_name())); + std::cerr << sample_fn.ToString() << std::endl; + ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(sample_fn.ToString())); + RETURN_NOT_OK(file->Write(buf)); + RETURN_NOT_OK(file->Close()); + } } return Status::OK(); } @@ -157,7 +176,6 @@ int Main(int argc, char** argv) { return 0; } -} // namespace ipc -} // namespace arrow +} // namespace arrow::ipc int main(int argc, char** argv) { return arrow::ipc::Main(argc, argv); } diff --git a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc index dd40ef0ab2f..870f4586708 100644 --- a/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc +++ b/cpp/src/arrow/ipc/generate_tensor_fuzz_corpus.cc @@ -41,7 +41,7 @@ using ::arrow::internal::PlatformFilename; Result PrepareDirectory(const std::string& dir) { ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(dir)); RETURN_NOT_OK(::arrow::internal::CreateDir(dir_fn)); - return std::move(dir_fn); + return dir_fn; } Result> MakeSerializedBuffer( From 9f6f1f64337ec21996eccdd03fe3bf00f1487306 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 8 Aug 2024 18:39:42 +0200 Subject: [PATCH 2/2] Require libarrow-testing for the fuzz generation utilities --- cpp/src/arrow/ipc/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 9af86cc05bf..9e0b1d723b9 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -73,8 +73,10 @@ add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc") if(ARROW_FUZZING OR (ARROW_BUILD_UTILITIES + AND ARROW_TESTING AND ARROW_WITH_LZ4 - AND ARROW_WITH_ZSTD)) + AND ARROW_WITH_ZSTD + )) add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc) target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB} ${ARROW_TEST_LINK_LIBS})