diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0558cba495a..08586a64d03 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -147,6 +147,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow CPython extensions" OFF) + option(ARROW_FUZZING + "Build Arrow Fuzzing executables" + OFF) + option(ARROW_SSE3 "Build Arrow with SSE3" ON) @@ -249,6 +253,10 @@ if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) endif() +if (NOT ARROW_FUZZING) + set(NO_FUZZING 1) +endif() + if(ARROW_HDFS) set(ARROW_BOOST_HEADER_ONLY 0) else() @@ -300,6 +308,9 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") endif() # ASAN / TSAN / UBSAN +if(ARROW_FUZZING) + set(ARROW_USE_COVERAGE ON) +endif() include(san-config) # For any C code, use the same flags. diff --git a/cpp/README.md b/cpp/README.md index 52169974de4..1daf863819c 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -99,6 +99,40 @@ and benchmarks or `make runbenchmark` to run only the benchmark tests. Benchmark logs will be placed in the build directory under `build/benchmark-logs`. +## Building/Running fuzzers + +Fuzzers can help find unhandled exceptions and problems with untrusted input that +may lead to crashes, security issues and undefined behavior. They do this by +generating random input data and observing the behavior of the executed code. To build +the fuzzer code, LLVM is required (GCC-based compilers won't work). You can build the fuzzers +using the following command: + + cmake -DARROW_FUZZING=ON -DARROW_USE_ASAN=ON .. + +`ARROW_FUZZING` will enable building of fuzzer executables as well as enable the +addition of coverage helpers via `ARROW_USE_COVERAGE`, so that the fuzzer can observe +the program execution. + +It is also wise to enable some sanitizers like `ARROW_USE_ASAN` (see above), which +activates the address sanitizer. This way, we ensure that bad memory operations +provoked by the fuzzer will be found early. You may enable other sanitizers as +well. 
Just keep in mind that some of them do not work together and some may result +in very long execution times, which will slow down the fuzzing procedure. + +Now you can start one of the fuzzers, e.g.: + + ./debug/debug/ipc-fuzzing-test + +This will try to find a malformed input that crashes the payload and will show the +stack trace as well as the input data. After a problem has been found this way, it should +be reported and fixed. Usually, the fuzzing process cannot be continued until the +fix is applied, since the fuzzer tends to converge on the same problem again. + +There are some problems that may occur during the compilation process: + +- libfuzzer was not distributed with your LLVM: `ld: file not found: .../libLLVMFuzzer.a` +- your LLVM is too old: `clang: error: unsupported argument 'fuzzer' to option 'fsanitize='` + ### Third-party environment variables To set up your own specific build toolchain, here are the relevant environment diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 207bb9aede0..f8c0f5ed237 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -347,3 +347,34 @@ function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) target_link_libraries(${TEST_NAME} ${ARGN}) endfunction() + + +############################################################ +# Fuzzing +############################################################ +# Add new fuzzing test executable. +# +# The single source file must define a function: +# extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) +# +# No main function must be present within the source file! 
+# REL_FUZZING_NAME is both the target name and the source file stem (<name>.cc). +function(ADD_ARROW_FUZZING REL_FUZZING_NAME) + if(NO_FUZZING) # returns immediately when NO_FUZZING is set + return() + endif() + + if (ARROW_BUILD_STATIC) # link the static Arrow library when it is built, otherwise the shared one + set(FUZZ_LINK_LIBS arrow_static) + else() + set(FUZZ_LINK_LIBS arrow_shared) + endif() + + add_executable(${REL_FUZZING_NAME} "${REL_FUZZING_NAME}.cc") + target_link_libraries(${REL_FUZZING_NAME} ${FUZZ_LINK_LIBS}) + target_compile_options(${REL_FUZZING_NAME} + PRIVATE "-fsanitize=fuzzer") # the same flag is also passed to the link step below + set_target_properties(${REL_FUZZING_NAME} + PROPERTIES + LINK_FLAGS "-fsanitize=fuzzer") +endfunction() diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index 1917eabe8b4..f2de9cf1f75 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -87,6 +87,16 @@ if (${ARROW_USE_TSAN}) endif() +if (${ARROW_USE_COVERAGE}) + if(NOT ("${COMPILER_FAMILY}" STREQUAL "clang")) + message(SEND_ERROR "You can only enable coverage with clang") + endif() + add_definitions("-fsanitize-coverage=trace-pc-guard") # NOTE(review): same flag is appended to CMAKE_CXX_FLAGS below; presumably this call is meant to reach non-C++ sources - confirm + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize-coverage=trace-pc-guard") +endif() + + if ("${ARROW_USE_UBSAN}" OR "${ARROW_USE_ASAN}" OR "${ARROW_USE_TSAN}") # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify a # sanitizer blacklist. diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index a24349fd6c9..2976a15ce27 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -117,3 +117,5 @@ if (ARROW_BUILD_UTILITIES) endif() ADD_ARROW_BENCHMARK(ipc-read-write-benchmark) + +ADD_ARROW_FUZZING(ipc-fuzzing-test) diff --git a/cpp/src/arrow/ipc/ipc-fuzzing-test.cc b/cpp/src/arrow/ipc/ipc-fuzzing-test.cc new file mode 100644 index 00000000000..83fe94b139b --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-fuzzing-test.cc @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> + +#include <arrow/buffer.h> +#include <arrow/io/memory.h> +#include <arrow/ipc/reader.h> +// libFuzzer entry point: feeds `data` to the IPC stream reader; always returns 0. +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + arrow::Status status; + + auto buffer = std::make_shared<arrow::Buffer>(data, size); + arrow::io::BufferReader buffer_reader(buffer); + + std::shared_ptr<arrow::RecordBatchReader> batch_reader; + status = arrow::ipc::RecordBatchStreamReader::Open(&buffer_reader, &batch_reader); + if (!status.ok()) { // a rejected input is an expected outcome, not a failure + return 0; + } + + std::shared_ptr<arrow::RecordBatch> batch; + do { + status = batch_reader->ReadNext(&batch); + if (!status.ok()) { + return 0; + } + } while (batch); // keep reading until ReadNext leaves `batch` null + return 0; +}